feat: add smart turn v3

2026-07-22 11:51:04 +02:00 · 2025-12-10 09:07:03 +07:00 · 2025-12-10 09:07:03 +07:00 · 4640f69f9b
commit 4640f69f9b
parent 94b7d7e2f2
4 changed files with 4 additions and 37 deletions
--- a/api/services/pipecat/transport_setup.py
+++ b/api/services/pipecat/transport_setup.py
@ -7,9 +7,6 @@ from api.db import db_client
 from api.enums import OrganizationConfigurationKey
 from api.services.looptalk.internal_transport import InternalTransport
 from api.services.pipecat.audio_config import AudioConfig
-from api.services.smart_turn.websocket_smart_turn import (
-    WebSocketSmartTurnAnalyzer,
-)
 from api.services.telephony.stasis_rtp_connection import StasisRTPConnection
 from api.services.telephony.stasis_rtp_serializer import StasisRTPFrameSerializer
 from api.services.telephony.stasis_rtp_transport import (
@ -20,6 +17,7 @@ from pipecat.audio.filters.rnnoise_filter import RNNoiseFilter
 from pipecat.audio.mixers.silence_mixer import SilenceAudioMixer
 from pipecat.audio.mixers.soundfile_mixer import SoundfileMixer
 from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
+from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
 from pipecat.audio.vad.silero import SileroVADAnalyzer, VADParams
 from pipecat.serializers.plivo import PlivoFrameSerializer
 from pipecat.serializers.twilio import TwilioFrameSerializer
@ -45,30 +43,7 @@ def create_turn_analyzer(workflow_run_id: int, audio_config: AudioConfig):
        audio_config: Audio configuration containing pipeline sample rate
    """
    if ENABLE_SMART_TURN:
-        service_url = os.getenv(
-            "SMART_TURN_WS_SERVICE_ENDPOINT", "ws://localhost:8010/ws"
-        )
-
-        # Prepare optional authentication headers for Smart Turn service
-        secret_key = os.getenv("SMART_TURN_HTTP_SERVICE_KEY")
-        headers = {"X-API-Key": secret_key} if secret_key else None
-
-        return WebSocketSmartTurnAnalyzer(
-            url=service_url,
-            headers=headers,
-            sample_rate=audio_config.pipeline_sample_rate,
-            params=SmartTurnParams(
-                stop_secs=1.5,  # send turn complete if silent for stop_secs seconds
-                pre_speech_ms=0,  # send speech segments before speech was detected by VAD
-                max_duration_secs=5,  # max duration of speech to be sent to the end of turn analyzer
-                # we don't want to _clear except when we have end of turn prediction as 1 from last run
-                # else if we have speaking -> queit -> trigger end of turn -> clear() and then
-                # we have speak -> queit, we may end up sending a very small segment of speech
-                # to end of turn model, which is not good
-                use_only_last_vad_segment=False,
-            ),
-            service_context=workflow_run_id,
-        )
+        return LocalSmartTurnAnalyzerV3(params=SmartTurnParams())

    return None

--- a/api/services/workflow/pipecat_engine_callbacks.py
+++ b/api/services/workflow/pipecat_engine_callbacks.py
@ -67,14 +67,6 @@ def create_user_idle_callback(engine: "PipecatEngine"):
    ) -> bool:
        logger.debug(f"Handling user_idle, attempt: {retry_count}")

-        # Check if we're on a StartNode - if yes, directly disconnect
-        if engine._current_node and engine._current_node.is_start:
-            logger.debug("User idle on StartNode - disconnecting immediately")
-            await engine.send_end_task_frame(
-                EndTaskReason.USER_IDLE_MAX_DURATION_EXCEEDED.value
-            )
-            return False
-
        if retry_count == 1:
            # Simulate an LLM generation, so that we can have the LLM context
            # updated with the new message