feat: add openai realtime models

2026-06-28 08:49:42 +02:00 · 2026-05-16 08:42:43 +05:30 · 2026-05-16 08:42:43 +05:30 · 4d7b681928
commit 4d7b681928
parent 53f1959edf
33 changed files with 1518 additions and 75 deletions
--- a/api/services/pipecat/realtime/init.py
+++ b/api/services/pipecat/realtime/init.py
@ -0,0 +1,9 @@
+"""Dograh-specific subclasses of pipecat realtime LLM services.
+
+Each subclass wires Dograh engine integration quirks (user-mute gating,
+TTSSpeakFrame greeting trigger, node-transition handling, function-call
+deferral, etc.) onto the corresponding pipecat realtime service.
+
+The pipecat fork's services stay close to upstream — Dograh behavior lives
+here.
+"""
--- a/api/services/pipecat/realtime/gemini_live.py
+++ b/api/services/pipecat/realtime/gemini_live.py
@ -0,0 +1,237 @@
+"""Dograh subclass of pipecat's Gemini Live LLM service.
+
+Layers Dograh engine integration quirks onto upstream-pristine
+:class:`GeminiLiveLLMService`:
+
+- **Deferred connect.** Connection is held back until ``system_instruction``
+  is set via :meth:`_update_settings`, so pre-call-fetch template variables
+  land before the live session opens.
+- **Reconnect on node transitions.** Gemini Live cannot update
+  ``system_instruction`` mid-session, so a setting change triggers a
+  reconnect (deferred until the bot turn ends if currently responding).
+- **Function-call deferral.** Tool calls emitted mid-turn are queued and run
+  when the bot stops speaking, to avoid racing the turn's audio.
+- **User-mute audio gating.** ``UserMuteStarted/StoppedFrame`` from the
+  user aggregator gates whether incoming audio is forwarded to Gemini.
+- **TTSSpeakFrame as greeting trigger.** The engine queues a TTSSpeakFrame
+  to kick off the first response after node setup; the service intercepts
+  it and runs the initial-context path.
+- **Finalize-pending on transcriptions.** Marks the transcription emitted
+  immediately after VAD-stop as finalized, distinguishing it from
+  mid-turn partials.
+"""
+
+from typing import Any
+
+from loguru import logger
+
+from pipecat.frames.frames import (
+    BotStoppedSpeakingFrame,
+    Frame,
+    TranscriptionFrame,
+    TTSSpeakFrame,
+    UserMuteStartedFrame,
+    UserMuteStoppedFrame,
+)
+from pipecat.processors.aggregators.llm_context import LLMContext
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
+from pipecat.services.llm_service import FunctionCallFromLLM
+from pipecat.utils.time import time_now_iso8601
+from pipecat.utils.tracing.service_decorators import traced_gemini_live
+
+
+class DograhGeminiLiveLLMService(GeminiLiveLLMService):
+    """Gemini Live with Dograh engine integration quirks. See module docstring."""
+
+    def __init__(self, **kwargs):
+        """Forward ``kwargs`` to the upstream service and reset Dograh state."""
+        super().__init__(**kwargs)
+        # User-mute state, driven by broadcast UserMute{Started,Stopped}Frames.
+        # Audio is not forwarded to Gemini while muted.
+        self._user_is_muted: bool = False
+        # Guards initial-response triggering against double-firing across the
+        # initial TTSSpeakFrame and any LLMContextFrame that may arrive.
+        self._handled_initial_context: bool = False
+        # When a system_instruction change arrives mid-bot-turn, the reconnect
+        # is queued and drained when the turn ends.
+        self._reconnect_pending: bool = False
+        # Function calls emitted by Gemini mid-bot-turn are deferred here and
+        # invoked when the turn ends, so they don't race the turn's audio.
+        self._pending_function_calls: list[FunctionCallFromLLM] = []
+        # Tracks whether the next transcription to arrive should be marked as
+        # the finalized transcription for the current user turn.
+        self._finalize_pending: bool = False
+
+    # ------------------------------------------------------------------
+    # Hooks from upstream GeminiLiveLLMService
+    # ------------------------------------------------------------------
+
+    def _should_connect_on_start(self) -> bool:
+        """Return True only once a ``system_instruction`` has been configured."""
+        # Hold the connection until the engine sets a system_instruction. This
+        # lets pre-call fetch populate template variables first.
+        return bool(self._settings.system_instruction)
+
+    async def _handle_changed_settings(self, changed: dict[str, Any]) -> set[str]:
+        """Connect or reconnect when ``system_instruction`` changes.
+
+        Args:
+            changed: Mapping of changed settings; only the presence of the
+                ``"system_instruction"`` key is inspected here.
+
+        Returns:
+            ``{"system_instruction"}`` when that setting was handled here,
+            otherwise an empty set.
+        """
+        if "system_instruction" not in changed:
+            return set()
+        if not self._session:
+            # First-time setting after deferred-connect.
+            await self._connect()
+        elif self._bot_is_responding:
+            # Bot is mid-turn — drain the reconnect when it ends so we don't
+            # cut the bot off mid-utterance.
+            self._reconnect_pending = True
+        else:
+            await self._reconnect()
+        return {"system_instruction"}
+
+    async def _run_or_defer_function_calls(
+        self, function_calls_llm: list[FunctionCallFromLLM]
+    ):
+        """Queue tool calls while the bot is responding; otherwise delegate upstream."""
+        if self._bot_is_responding:
+            # Latest batch wins; Gemini emits tool calls as one batch per
+            # tool_call message, so this overwrite is intentional.
+            self._pending_function_calls = function_calls_llm
+            logger.debug(
+                f"{self}: deferring {len(function_calls_llm)} function call(s) "
+                "until bot turn ends"
+            )
+            return
+        await super()._run_or_defer_function_calls(function_calls_llm)
+
+    # ------------------------------------------------------------------
+    # State-transition side effects
+    # ------------------------------------------------------------------
+
+    async def _set_bot_is_responding(self, responding: bool):
+        """Update the responding flag and drain deferred work when a turn ends."""
+        # Capture the previous value first: super() overwrites it.
+        was_responding = self._bot_is_responding
+        await super()._set_bot_is_responding(responding)
+        if was_responding and not responding:
+            # Deferred tool calls run before any queued reconnect.
+            await self._run_pending_function_calls()
+            if self._reconnect_pending:
+                self._reconnect_pending = False
+                await self._reconnect()
+
+    async def _run_pending_function_calls(self):
+        """Run any function calls deferred during the bot's last turn."""
+        if not self._pending_function_calls:
+            return
+        # Swap the queue out before awaiting so a second drain (this method is
+        # called from two places) cannot run the same batch twice.
+        fcs = self._pending_function_calls
+        self._pending_function_calls = []
+        logger.debug(
+            f"{self}: executing {len(fcs)} deferred function call(s) "
+            "after bot turn ended"
+        )
+        await self.run_function_calls(fcs)
+
+    # ------------------------------------------------------------------
+    # Frame handling: mute, TTSSpeakFrame, BotStoppedSpeakingFrame flush
+    # ------------------------------------------------------------------
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Intercept Dograh control frames; hand everything else to upstream.
+
+        Mute frames update local state and are pushed on unchanged.
+        ``TTSSpeakFrame`` is consumed as the greeting trigger and never
+        forwarded. ``BotStoppedSpeakingFrame`` flushes deferred tool calls and
+        then continues to the upstream handler.
+        """
+        if isinstance(frame, UserMuteStartedFrame):
+            self._user_is_muted = True
+            await self.push_frame(frame, direction)
+            return
+        if isinstance(frame, UserMuteStoppedFrame):
+            self._user_is_muted = False
+            await self.push_frame(frame, direction)
+            return
+        if isinstance(frame, TTSSpeakFrame):
+            # Greeting trigger: the engine queues a TTSSpeakFrame to start the
+            # bot's first turn after node setup. Gemini Live renders its own
+            # audio, so we don't pass the frame through — we re-enter
+            # _handle_context to kick off the initial response.
+            if not self._handled_initial_context:
+                await self._handle_context(self._context)
+            else:
+                logger.warning(
+                    f"{self}: TTSSpeakFrame after initial context already "
+                    "handled — Gemini Live owns audio generation, ignoring"
+                )
+            return
+        if isinstance(frame, BotStoppedSpeakingFrame):
+            # Belt-and-suspenders: the main drain happens in
+            # _set_bot_is_responding(False), but if Gemini delays turn_complete
+            # past the audible end of the turn, flushing here ensures pending
+            # function calls fire promptly.
+            await self._run_pending_function_calls()
+            # Fall through to super for the actual push.
+        await super().process_frame(frame, direction)
+
+    async def _send_user_audio(self, frame):
+        """Forward user audio upstream unless the user is currently muted."""
+        if self._user_is_muted:
+            return
+        await super()._send_user_audio(frame)
+
+    # ------------------------------------------------------------------
+    # Context lifecycle: Dograh pre-populates self._context via the engine,
+    # so upstream's "first arrival === self._context is None" check doesn't
+    # work. We gate on _handled_initial_context instead and skip the
+    # init-instruction reconciliation (Dograh updates system_instruction at
+    # runtime via _update_settings, not via init).
+    # ------------------------------------------------------------------
+
+    async def _handle_context(self, context: LLMContext):
+        """Start the initial response, or process tool results on later calls.
+
+        Args:
+            context: The context to adopt. On the initial trigger a ``None``
+                context is rejected with a warning and the initial-context
+                guard stays unset, so a later trigger can still start the
+                first response.
+        """
+        if not self._handled_initial_context:
+            if context is None:
+                # Same guard as the OpenAI Realtime subclass: consuming the
+                # one-shot flag without a context would make every later
+                # greeting trigger a no-op.
+                logger.warning(
+                    f"{self}: received initial context trigger before context was set"
+                )
+                return
+            self._handled_initial_context = True
+            self._context = context
+            await self._create_initial_response()
+        else:
+            self._context = context
+            await self._process_completed_function_calls(send_new_results=True)
+
+    # ------------------------------------------------------------------
+    # Session lifecycle: drop upstream's automatic reconnect-seed and
+    # initial-context-seed paths. The TTSSpeakFrame trigger and the
+    # function-call-result LLMContextFrame are the only paths that should
+    # kick off bot turns in the Dograh flow.
+    # ------------------------------------------------------------------
+
+    @traced_gemini_live(operation="llm_setup")
+    async def _handle_session_ready(self, session):
+        """Record the live session and fulfil any response queued before it was ready."""
+        logger.debug(
+            f"In _handle_session_ready self._run_llm_when_session_ready: {self._run_llm_when_session_ready}"
+        )
+        self._session = session
+        self._ready_for_realtime_input = True
+        if self._run_llm_when_session_ready:
+            # Context arrived before session was ready — fulfil the queued
+            # initial response now.
+            self._run_llm_when_session_ready = False
+            await self._create_initial_response()
+        await self._drain_pending_tool_results()
+        # Otherwise: no automatic seed. Reconnect after a session-resumption
+        # update relies on the server-side restored state; reconnects without
+        # a handle (e.g. node transitions before any handle was issued) are
+        # followed by a function-call-result LLMContextFrame which feeds the
+        # updated-context branch in _handle_context.
+
+    # ------------------------------------------------------------------
+    # Transcription: broadcast (so downstream voicemail detector and
+    # logs buffer both see it) and set finalized= for turn-boundary
+    # semantics.
+    # ------------------------------------------------------------------
+
+    async def _handle_user_started_speaking(self, frame):
+        """Run the upstream handler, then cancel any pending finalize marker."""
+        await super()._handle_user_started_speaking(frame)
+        # A new VAD start invalidates any pending finalize from a prior stop
+        # that hasn't been paired with a transcription yet.
+        self._finalize_pending = False
+
+    async def _handle_user_stopped_speaking(self, frame):
+        """Run the upstream handler, then mark the next transcription as final."""
+        await super()._handle_user_stopped_speaking(frame)
+        self._finalize_pending = True
+
+    async def _push_user_transcription(self, text: str, result=None):
+        """Broadcast a ``TranscriptionFrame`` for ``text``.
+
+        ``finalized`` is taken from the pending-finalize marker, which is
+        consumed (reset to False) by this call.
+        """
+        await self._handle_user_transcription(text, True, self._settings.language)
+        finalized = self._finalize_pending
+        self._finalize_pending = False
+        await self.broadcast_frame(
+            TranscriptionFrame,
+            text=text,
+            user_id="",
+            timestamp=time_now_iso8601(),
+            result=result,
+            finalized=finalized,
+        )
--- a/api/services/pipecat/realtime/gemini_live_vertex.py
+++ b/api/services/pipecat/realtime/gemini_live_vertex.py
@ -0,0 +1,42 @@
+"""Dograh subclass of pipecat's Gemini Live Vertex AI LLM service.
+
+Diamond inheritance: combines the Dograh engine-integration overrides from
+:class:`DograhGeminiLiveLLMService` with the Vertex-specific tweaks from
+upstream's :class:`GeminiLiveVertexLLMService` (no history config,
+``NON_BLOCKING`` tools disabled, service-account credentials).
+
+MRO::
+
+    DograhGeminiLiveVertexLLMService
+      -> DograhGeminiLiveLLMService
+      -> GeminiLiveVertexLLMService
+      -> GeminiLiveLLMService
+      -> LLMService
+      -> ...
+"""
+
+from api.services.pipecat.realtime.gemini_live import DograhGeminiLiveLLMService
+from pipecat.services.google.gemini_live.vertex.llm import (
+    GeminiLiveVertexLLMService,
+)
+
+
+class DograhGeminiLiveVertexLLMService(
+    DograhGeminiLiveLLMService, GeminiLiveVertexLLMService
+):
+    """Gemini Live on Vertex AI with the Dograh engine overrides applied.
+
+    The body is intentionally empty: all behavior comes from the two bases.
+    ``DograhGeminiLiveLLMService`` is listed first so its overrides precede
+    ``GeminiLiveVertexLLMService`` in the MRO.
+    """
+
+
+# Guard against MRO regressions: a future refactor that flips inheritance
+# order or breaks the diamond would silently bypass the Dograh overrides.
+# The check runs once, when this module is imported.
+# NOTE(review): ``assert`` statements are stripped when Python runs with
+# ``-O``, so this guard is inactive in optimized mode.
+_mro = DograhGeminiLiveVertexLLMService.__mro__
+assert _mro[1] is DograhGeminiLiveLLMService, (
+    f"Expected DograhGeminiLiveLLMService at MRO[1], got {_mro[1]}"
+)
+assert _mro[2] is GeminiLiveVertexLLMService, (
+    f"Expected GeminiLiveVertexLLMService at MRO[2], got {_mro[2]}"
+)
+# Drop the temporary so it does not linger in the module namespace.
+del _mro
--- a/api/services/pipecat/realtime/openai_realtime.py
+++ b/api/services/pipecat/realtime/openai_realtime.py
@ -0,0 +1,177 @@
+"""Dograh subclass of pipecat's OpenAI Realtime LLM service.
+
+Layers Dograh engine integration quirks onto upstream-pristine
+:class:`OpenAIRealtimeLLMService`. Substantially smaller than the Gemini
+subclass because OpenAI Realtime supports runtime ``session.update`` for
+both ``system_instruction`` and tools — no deferred-connect or reconnect
+machinery is needed.
+
+Adds:
+
+- **User-mute audio gating** via ``UserMuteStarted/StoppedFrame``.
+- **TTSSpeakFrame as initial-response trigger** so the engine's greeting
+  flow kicks off the bot's first response.
+- **Function-call deferral.** Tool calls that complete while the bot is
+  speaking are queued and run once ``BotStoppedSpeakingFrame`` arrives,
+  matching the Gemini subclass.
+- **finalized=True on TranscriptionFrame** for parity with the Gemini
+  service (every OpenAI transcription via the ``completed`` event is
+  final by construction).
+"""
+
+import json
+
+from loguru import logger
+
+from pipecat.frames.frames import (
+    BotStartedSpeakingFrame,
+    BotStoppedSpeakingFrame,
+    Frame,
+    TranscriptionFrame,
+    TTSSpeakFrame,
+    UserMuteStartedFrame,
+    UserMuteStoppedFrame,
+)
+from pipecat.processors.aggregators.llm_context import LLMContext
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.llm_service import FunctionCallFromLLM
+from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
+from pipecat.transcriptions.language import Language
+from pipecat.utils.time import time_now_iso8601
+
+
+class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
+    """OpenAI Realtime with Dograh engine integration quirks. See module docstring."""
+
+    def __init__(self, **kwargs):
+        """Forward ``kwargs`` to the upstream service and reset Dograh state."""
+        super().__init__(**kwargs)
+        # Audio is not forwarded to OpenAI while this is True.
+        self._user_is_muted: bool = False
+        # Dograh pre-populates self._context via the engine before the first
+        # LLMContextFrame arrives, so upstream's "first arrival means
+        # self._context is None" check no longer works.
+        self._handled_initial_context: bool = False
+        # Track bot speech locally so tool calls can be deferred until the bot
+        # has finished speaking, matching Dograh's Gemini Live behavior.
+        self._bot_is_speaking: bool = False
+        self._deferred_function_calls: list[FunctionCallFromLLM] = []
+
+    # ------------------------------------------------------------------
+    # Frame handling: mute, TTSSpeakFrame as greeting trigger
+    # ------------------------------------------------------------------
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Intercept Dograh control frames; hand everything else to upstream.
+
+        Mute frames update local state and are pushed on unchanged.
+        ``TTSSpeakFrame`` is consumed as the greeting trigger and never
+        forwarded. Bot speaking frames update the local speaking flag (and
+        flush deferred tool calls on stop) before reaching the upstream
+        handler.
+        """
+        if isinstance(frame, UserMuteStartedFrame):
+            self._user_is_muted = True
+            await self.push_frame(frame, direction)
+            return
+        if isinstance(frame, UserMuteStoppedFrame):
+            self._user_is_muted = False
+            await self.push_frame(frame, direction)
+            return
+        if isinstance(frame, TTSSpeakFrame):
+            # Greeting trigger: the engine queues a TTSSpeakFrame after node
+            # setup. OpenAI Realtime renders its own audio, so we don't pass
+            # the frame to TTS. Route through _handle_context so the initial
+            # response and later tool-result turns share the same context
+            # lifecycle even when Dograh has already pre-populated self._context.
+            if not self._handled_initial_context:
+                await self._handle_context(self._context)
+            else:
+                logger.warning(
+                    f"{self}: TTSSpeakFrame after initial context already "
+                    "handled — OpenAI Realtime owns audio generation, ignoring"
+                )
+            # Don't forward the frame; the audio path is owned by the realtime
+            # service itself.
+            return
+        if isinstance(frame, BotStartedSpeakingFrame):
+            self._bot_is_speaking = True
+        elif isinstance(frame, BotStoppedSpeakingFrame):
+            self._bot_is_speaking = False
+            await self._run_pending_function_calls()
+        await super().process_frame(frame, direction)
+
+    async def _handle_context(self, context: LLMContext):
+        """Start the initial response, or process tool results on later calls.
+
+        Args:
+            context: The context to adopt. On the initial trigger a ``None``
+                context is rejected with a warning and the initial-context
+                guard stays unset.
+        """
+        if not self._handled_initial_context:
+            if context is None:
+                logger.warning(
+                    f"{self}: received initial context trigger before context was set"
+                )
+                return
+            self._handled_initial_context = True
+            self._context = context
+            await self._create_response()
+        else:
+            self._context = context
+            await self._process_completed_function_calls(send_new_results=True)
+
+    async def _send_user_audio(self, frame):
+        """Forward user audio upstream unless the user is currently muted."""
+        if self._user_is_muted:
+            return
+        await super()._send_user_audio(frame)
+
+    async def _run_pending_function_calls(self):
+        """Run tool calls that were deferred while the bot was speaking."""
+        if not self._deferred_function_calls:
+            return
+        # Swap the queue out before awaiting so calls deferred during
+        # execution land in a fresh list instead of being lost or re-run.
+        function_calls = self._deferred_function_calls
+        self._deferred_function_calls = []
+        logger.debug(
+            f"{self}: executing {len(function_calls)} deferred function call(s) "
+            "after bot turn ended"
+        )
+        await self.run_function_calls(function_calls)
+
+    async def _handle_evt_function_call_arguments_done(self, evt):
+        """Process or defer tool calls until the bot finishes speaking.
+
+        Args:
+            evt: Event carrying ``call_id`` and the JSON-encoded ``arguments``
+                string of a completed function call.
+
+        Any exception raised while parsing or running the call is logged with
+        its traceback and suppressed, so it does not propagate to the caller.
+        """
+        try:
+            args = json.loads(evt.arguments)
+
+            function_call_item = self._pending_function_calls.get(evt.call_id)
+            if function_call_item:
+                del self._pending_function_calls[evt.call_id]
+
+                function_calls = [
+                    FunctionCallFromLLM(
+                        context=self._context,
+                        tool_call_id=evt.call_id,
+                        function_name=function_call_item.name,
+                        arguments=args,
+                    )
+                ]
+
+                if self._bot_is_speaking:
+                    self._deferred_function_calls.extend(function_calls)
+                    logger.debug(
+                        f"{self}: deferring function call {function_call_item.name} "
+                        "until bot stops speaking"
+                    )
+                else:
+                    await self.run_function_calls(function_calls)
+                    logger.debug(f"Processed function call: {function_call_item.name}")
+            else:
+                logger.warning(
+                    f"No tracked function call found for call_id: {evt.call_id}"
+                )
+                logger.warning(
+                    f"Available pending calls: {list(self._pending_function_calls.keys())}"
+                )
+
+        except Exception as e:
+            # logger.exception keeps the traceback; the previous logger.error
+            # recorded only str(e), which hid where tool execution failed.
+            logger.exception(f"Failed to process function call arguments: {e}")
+
+    # ------------------------------------------------------------------
+    # Transcription: broadcast with finalized=True for parity with the
+    # Gemini service (consumers that check `finalized` should see True
+    # for every completed-transcription event from OpenAI).
+    # ------------------------------------------------------------------
+
+    async def handle_evt_input_audio_transcription_completed(self, evt):
+        """Broadcast a finalized ``TranscriptionFrame`` for a completed transcript.
+
+        Args:
+            evt: Event providing ``item_id`` and ``transcript``; it is also
+                attached to the frame as ``result``.
+        """
+        await self._call_event_handler(
+            "on_conversation_item_updated", evt.item_id, None
+        )
+        await self.broadcast_frame(
+            TranscriptionFrame,
+            text=evt.transcript,
+            user_id="",
+            timestamp=time_now_iso8601(),
+            result=evt,
+            finalized=True,
+        )
+        # NOTE(review): the language is hard-coded to English here, while the
+        # Gemini subclass passes self._settings.language — confirm intended.
+        await self._handle_user_transcription(evt.transcript, True, Language.EN)
--- a/api/services/pipecat/run_pipeline.py
+++ b/api/services/pipecat/run_pipeline.py
@ -86,6 +86,43 @@ from pipecat.utils.run_context import set_current_org_id, set_current_run_id
 ensure_tracing()


+def _create_realtime_user_turn_config(provider: str):
+    """Build the user-turn strategies and local VAD for a realtime provider.
+
+    Args:
+        provider: Realtime provider identifier, compared against
+            ``ServiceProviders`` values.
+
+    Returns:
+        A ``(user_turn_strategies, vad_analyzer)`` tuple. ``vad_analyzer`` is
+        ``None`` for OpenAI Realtime and a ``SileroVADAnalyzer`` otherwise.
+    """
+    gemini_live_providers = {
+        ServiceProviders.GOOGLE_REALTIME.value,
+        ServiceProviders.GOOGLE_VERTEX_REALTIME.value,
+    }
+    uses_gemini_live = provider in gemini_live_providers
+
+    if not uses_gemini_live and provider == ServiceProviders.OPENAI_REALTIME.value:
+        # OpenAI Realtime already emits speaking-state frames and interruption
+        # events from the provider, so the aggregator follows those external
+        # signals and no local VAD is returned.
+        external_strategies = UserTurnStrategies(
+            start=[ExternalUserTurnStartStrategy()],
+            stop=[ExternalUserTurnStopStrategy()],
+        )
+        return external_strategies, None
+
+    if uses_gemini_live:
+        # Let Gemini Live own barge-in via its server-side VAD, but keep local
+        # Silero VAD for early user-turn start and speaking-state tracking.
+        start_strategy = VADUserTurnStartStrategy(enable_interruptions=False)
+    else:
+        # Any other provider gets the default VAD-driven start strategy.
+        start_strategy = VADUserTurnStartStrategy()
+
+    vad_strategies = UserTurnStrategies(
+        start=[start_strategy],
+        stop=[SpeechTimeoutUserTurnStopStrategy()],
+    )
+    return vad_strategies, SileroVADAnalyzer(params=VADParams(stop_secs=0.2))
+
+
 async def run_pipeline_telephony(
    websocket,
    *,
@ -138,6 +175,20 @@ async def run_pipeline_telephony(
            "telephony_configuration_id"
        )

+    # Resolve effective user config here so the transport can tune its
+    # bot-stopped-speaking fallback based on is_realtime; pass the resolved
+    # values into _run_pipeline so it doesn't fetch them again.
+    from api.services.configuration.resolve import resolve_effective_config
+
+    user_config = await db_client.get_user_configurations(user_id)
+    run_configs = (
+        (workflow_run.definition.workflow_configurations or {}) if workflow_run else {}
+    )
+    user_config = resolve_effective_config(
+        user_config, run_configs.get("model_overrides")
+    )
+    is_realtime = bool(user_config.is_realtime and user_config.realtime is not None)
+
    spec = telephony_registry.get(provider_name)
    audio_config = create_audio_config(provider_name)

@ -148,6 +199,7 @@ async def run_pipeline_telephony(
        workflow.organization_id,
        ambient_noise_config=ambient_noise_config,
        telephony_configuration_id=telephony_configuration_id,
+        is_realtime=is_realtime,
        **transport_kwargs,
    )

@ -158,6 +210,8 @@ async def run_pipeline_telephony(
            workflow_run_id,
            user_id,
            audio_config=audio_config,
+            workflow_run=workflow_run,
+            resolved_user_config=user_config,
        )
    except Exception as e:
        logger.error(
@ -198,11 +252,27 @@ async def run_pipeline_smallwebrtc(
    # Create audio configuration for WebRTC
    audio_config = create_audio_config(WorkflowRunMode.SMALLWEBRTC.value)

+    # Resolve workflow_run + effective user_config here so the transport can
+    # tune its bot-stopped-speaking fallback based on is_realtime. _run_pipeline
+    # reuses these via kwargs so we don't fetch twice.
+    from api.services.configuration.resolve import resolve_effective_config
+
+    workflow_run = await db_client.get_workflow_run(workflow_run_id, user_id)
+    user_config = await db_client.get_user_configurations(user_id)
+    run_configs = (
+        (workflow_run.definition.workflow_configurations or {}) if workflow_run else {}
+    )
+    user_config = resolve_effective_config(
+        user_config, run_configs.get("model_overrides")
+    )
+    is_realtime = bool(user_config.is_realtime and user_config.realtime is not None)
+
    transport = await create_webrtc_transport(
        webrtc_connection,
        workflow_run_id,
        audio_config,
        ambient_noise_config,
+        is_realtime=is_realtime,
    )
    await _run_pipeline(
        transport,
@ -212,6 +282,8 @@ async def run_pipeline_smallwebrtc(
        call_context_vars=call_context_vars,
        audio_config=audio_config,
        user_provider_id=user_provider_id,
+        workflow_run=workflow_run,
+        resolved_user_config=user_config,
    )


@ -223,6 +295,8 @@ async def _run_pipeline(
    call_context_vars: dict = {},
    audio_config: AudioConfig = None,
    user_provider_id: str | None = None,
+    workflow_run=None,
+    resolved_user_config=None,
 ) -> None:
    """
    Run the pipeline with the given transport and configuration
@ -232,9 +306,12 @@ async def _run_pipeline(
        workflow_id: The ID of the workflow
        workflow_run_id: The ID of the workflow run
        user_id: The ID of the user
-        mode: The mode of the pipeline (twilio or smallwebrtc)
+        workflow_run: Pre-fetched workflow run row. Fetched here if None.
+        resolved_user_config: User configuration with model_overrides already
+            applied. Fetched and resolved here if None.
    """
-    workflow_run = await db_client.get_workflow_run(workflow_run_id, user_id)
+    if workflow_run is None:
+        workflow_run = await db_client.get_workflow_run(workflow_run_id, user_id)

    # If the workflow run is already completed, we don't need to run it again
    if workflow_run.is_completed:
@ -246,9 +323,6 @@ async def _run_pipeline(
    if call_context_vars:
        merged_call_context_vars = {**merged_call_context_vars, **call_context_vars}

-    # Get user configuration
-    user_config = await db_client.get_user_configurations(user_id)
-
    # Get workflow for metadata (name, organization_id, call_disposition_codes)
    workflow = await db_client.get_workflow(workflow_id, user_id)
    if not workflow:
@ -286,11 +360,17 @@ async def _run_pipeline(
                    term.strip() for term in dictionary.split(",") if term.strip()
                ]

-    # Resolve model overrides from the version onto global user config
-    from api.services.configuration.resolve import resolve_effective_config
+    # Resolve model overrides from the version onto global user config (skip
+    # when the caller already resolved it).
+    if resolved_user_config is None:
+        from api.services.configuration.resolve import resolve_effective_config

-    model_overrides = run_configs.get("model_overrides")
-    user_config = resolve_effective_config(user_config, model_overrides)
+        user_config = await db_client.get_user_configurations(user_id)
+        user_config = resolve_effective_config(
+            user_config, run_configs.get("model_overrides")
+        )
+    else:
+        user_config = resolved_user_config

    # Detect realtime mode (speech-to-speech services like OpenAI Realtime, Gemini Live)
    is_realtime = user_config.is_realtime and user_config.realtime is not None
@ -453,23 +533,20 @@ async def _run_pipeline(
        correct_aggregation_callback=engine.create_aggregation_correction_callback(),
    )

+    user_mute_strategies = [
+        MuteUntilFirstBotCompleteUserMuteStrategy(),
+        FunctionCallUserMuteStrategy(),
+        CallbackUserMuteStrategy(should_mute_callback=engine.should_mute_user),
+    ]
+    user_vad_analyzer = SileroVADAnalyzer(params=VADParams(stop_secs=0.2))
+
    # Configure turn strategies based on STT provider, model, and workflow configuration
    if is_realtime:
-        # Realtime services do server-side turn detection for response generation,
-        # but we still need a client-side stop strategy so the user aggregator emits
-        # UserStoppedSpeakingFrame. Without it, downstream consumers (e.g. voicemail
-        # detector) and Gemini Live's _finalize_pending flag never see a turn end.
-        user_turn_strategies = UserTurnStrategies(
-            start=[VADUserTurnStartStrategy()],
-            stop=[SpeechTimeoutUserTurnStopStrategy()],
+        # Realtime services still need user-turn tracking even when the model
+        # itself owns speech generation and interruption behavior.
+        user_turn_strategies, user_vad_analyzer = _create_realtime_user_turn_config(
+            user_config.realtime.provider
        )
-
-        # Lets not start the pipeline as muted for Realtime
-        # - CallbackUserMuteStrategy: mutes based on engine's _mute_pipeline state
-        user_mute_strategies = [
-            FunctionCallUserMuteStrategy(),
-            CallbackUserMuteStrategy(should_mute_callback=engine.should_mute_user),
-        ]
    else:
        # Deepgram Flux uses external turn detection (VAD + External start/stop)
        # Other models use configurable turn detection strategy
@ -510,18 +587,11 @@ async def _run_pipeline(
                stop=[SpeechTimeoutUserTurnStopStrategy()],
            )

-        # - CallbackUserMuteStrategy: mutes based on engine's _mute_pipeline state
-        user_mute_strategies = [
-            MuteUntilFirstBotCompleteUserMuteStrategy(),
-            FunctionCallUserMuteStrategy(),
-            CallbackUserMuteStrategy(should_mute_callback=engine.should_mute_user),
-        ]
-
    user_params = LLMUserAggregatorParams(
        user_turn_strategies=user_turn_strategies,
        user_mute_strategies=user_mute_strategies,
        user_idle_timeout=max_user_idle_timeout,
-        vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
+        vad_analyzer=user_vad_analyzer,
    )
    context_aggregator = LLMContextAggregatorPair(
        context, assistant_params=assistant_params, user_params=user_params
@ -562,15 +632,14 @@ async def _run_pipeline(
    )
    engine.set_fetch_recording_audio(fetch_audio)

-    # Voicemail detection works in both modes. In realtime mode the detector sits
-    # after the realtime LLM and consumes the TranscriptionFrames it broadcasts;
-    # the LLM gate / TTS gate are not used (the realtime LLM responds to audio
-    # directly, not LLMContextFrames), so on detection we rely on
-    # end_call_with_reason to drop the call.
    voicemail_config = (workflow.workflow_configurations or {}).get(
        "voicemail_detection", {}
    )
-    if voicemail_config.get("enabled", False):
+    if is_realtime and voicemail_config.get("enabled", False):
+        logger.info(
+            f"Disabling voicemail detection for realtime workflow run {workflow_run_id}"
+        )
+    if voicemail_config.get("enabled", False) and not is_realtime:
        logger.info(f"Voicemail detection enabled for workflow run {workflow_run_id}")
        # Create a separate LLM instance for the voicemail sub-pipeline
        # (can't share with main pipeline as it would mess up frame linking)
--- a/api/services/pipecat/service_factory.py
+++ b/api/services/pipecat/service_factory.py
@ -493,6 +493,9 @@ def create_realtime_llm_service(user_config, audio_config: "AudioConfig"):
    )

    if provider == ServiceProviders.OPENAI_REALTIME.value:
+        from api.services.pipecat.realtime.openai_realtime import (
+            DograhOpenAIRealtimeLLMService,
+        )
        from pipecat.services.openai.realtime.events import (
            AudioConfiguration,
            AudioInput,
@ -500,11 +503,10 @@ def create_realtime_llm_service(user_config, audio_config: "AudioConfig"):
            InputAudioTranscription,
            SessionProperties,
        )
-        from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService

-        return OpenAIRealtimeLLMService(
+        return DograhOpenAIRealtimeLLMService(
            api_key=api_key,
-            settings=OpenAIRealtimeLLMService.Settings(
+            settings=DograhOpenAIRealtimeLLMService.Settings(
                model=model,
                session_properties=SessionProperties(
                    audio=AudioConfiguration(
@ -519,7 +521,9 @@ def create_realtime_llm_service(user_config, audio_config: "AudioConfig"):
            ),
        )
    elif provider == ServiceProviders.GOOGLE_REALTIME.value:
-        from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
+        from api.services.pipecat.realtime.gemini_live import (
+            DograhGeminiLiveLLMService,
+        )

        # Gemini Live enables input/output audio transcription by default
        # in its _connect() method — no need to configure it explicitly.
@ -529,9 +533,30 @@ def create_realtime_llm_service(user_config, audio_config: "AudioConfig"):
        }
        if language:
            settings_kwargs["language"] = language
-        return GeminiLiveLLMService(
+        return DograhGeminiLiveLLMService(
            api_key=api_key,
-            settings=GeminiLiveLLMService.Settings(**settings_kwargs),
+            settings=DograhGeminiLiveLLMService.Settings(**settings_kwargs),
+        )
+    elif provider == ServiceProviders.GOOGLE_VERTEX_REALTIME.value:
+        from api.services.pipecat.realtime.gemini_live_vertex import (
+            DograhGeminiLiveVertexLLMService,
+        )
+
+        project_id = getattr(realtime_config, "project_id", None)
+        location = getattr(realtime_config, "location", None) or "us-east4"
+        credentials = getattr(realtime_config, "credentials", None)
+
+        settings_kwargs = {
+            "model": model,
+            "voice": voice or "Charon",
+        }
+        if language:
+            settings_kwargs["language"] = language
+        return DograhGeminiLiveVertexLLMService(
+            credentials=credentials,
+            project_id=project_id,
+            location=location,
+            settings=DograhGeminiLiveVertexLLMService.Settings(**settings_kwargs),
        )
    else:
        raise HTTPException(
--- a/api/services/pipecat/transport_params.py
+++ b/api/services/pipecat/transport_params.py
@ -0,0 +1,25 @@
+"""Shared helpers for tuning pipecat ``TransportParams`` per run mode.
+
+These live outside ``transport_setup.py`` (which is non-telephony only) so
+that both the WebRTC factory there and the telephony provider factories
+under ``api/services/telephony/providers/<name>/transport.py`` can call
+into the same place.
+"""
+
+# Realtime (speech-to-speech) LLMs don't emit ``TTSStoppedFrame``, so the
+# bot-stopped-speaking signal relies on the output-queue-drained fallback.
+# The default 3s tail leaves a long gap before the assistant aggregator
+# closes its turn; 0.5s keeps the conversation snappy without cutting into
+# the bot's own audio (audio chunks arrive far more frequently than this).
+REALTIME_BOT_VAD_STOP_SECS = 0.5
+
+
+def realtime_param_overrides(is_realtime: bool) -> dict:
+    """Build the kwargs to splat into ``TransportParams`` for this run mode.
+
+    Non-realtime runs get an empty dict. Realtime runs currently override only
+    ``bot_vad_stop_secs``; add new realtime knobs here to keep transports thin.
+    """
+    if is_realtime:
+        return {"bot_vad_stop_secs": REALTIME_BOT_VAD_STOP_SECS}
+    return {}
--- a/api/services/pipecat/transport_setup.py
+++ b/api/services/pipecat/transport_setup.py
@ -6,6 +6,7 @@ This module hosts only the shared, non-telephony transports (WebRTC, internal/Lo

 from api.services.pipecat.audio_config import AudioConfig
 from api.services.pipecat.audio_mixer import build_audio_out_mixer
+from api.services.pipecat.transport_params import realtime_param_overrides
 from pipecat.transports.base_transport import TransportParams
 from pipecat.transports.smallwebrtc.connection import SmallWebRTCConnection
 from pipecat.transports.smallwebrtc.transport import SmallWebRTCTransport
@ -16,6 +17,7 @@ async def create_webrtc_transport(
    workflow_run_id: int,
    audio_config: AudioConfig,
    ambient_noise_config: dict | None = None,
+    is_realtime: bool = False,
 ):
    """Create a transport for WebRTC connections."""
    mixer = await build_audio_out_mixer(
@ -30,6 +32,7 @@ async def create_webrtc_transport(
            audio_in_sample_rate=audio_config.transport_in_sample_rate,
            audio_out_sample_rate=audio_config.transport_out_sample_rate,
            audio_out_mixer=mixer,
+            **realtime_param_overrides(is_realtime),
        ),
    )