make turn detection configurable

2026-06-16 08:25:18 +02:00 · 2026-02-05 13:09:11 +05:30 · 2026-02-05 13:09:11 +05:30 · 8f67e48d45
commit 8f67e48d45
parent 2d4a7b49b0
5 changed files with 115 additions and 111 deletions
--- a/api/Dockerfile
+++ b/api/Dockerfile
@ -13,6 +13,11 @@ RUN apt-get update && apt-get install -y \
 # Copy and install requirements
 COPY api/requirements.txt .

+# Install CPU-only PyTorch FIRST to prevent CUDA/NVIDIA dependencies
+# This satisfies torch dependency before other packages try to pull GPU version
+RUN pip install --user --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
+    rm -rf /root/.cache/pip
+
 # Install dependencies to user directory for easy copying
 RUN pip install --user --no-cache-dir -r requirements.txt && \
    # Clean up pip cache after installation
--- a/api/services/pipecat/run_pipeline.py
+++ b/api/services/pipecat/run_pipeline.py
@ -44,6 +44,7 @@ from api.services.telephony.stasis_rtp_connection import StasisRTPConnection
 from api.services.workflow.dto import ReactFlowDTO
 from api.services.workflow.pipecat_engine import PipecatEngine
 from api.services.workflow.workflow import WorkflowGraph
+from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
 from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
 from pipecat.extensions.voicemail.voicemail_detector import VoicemailDetector
 from pipecat.pipeline.base_task import PipelineTaskParams
@ -66,6 +67,7 @@ from pipecat.turns.user_start.vad_user_turn_start_strategy import (
 )
 from pipecat.turns.user_stop import (
    ExternalUserTurnStopStrategy,
+    TranscriptionUserTurnStopStrategy,
    TurnAnalyzerUserTurnStopStrategy,
 )
 from pipecat.turns.user_turn_strategies import UserTurnStrategies
@ -453,6 +455,8 @@ async def _run_pipeline(
    # Extract configurations from workflow configurations
    max_call_duration_seconds = 300  # Default 5 minutes
    max_user_idle_timeout = 10.0  # Default 10 seconds
+    smart_turn_stop_secs = 2.0  # Default 2 seconds for incomplete turn timeout
+    turn_stop_strategy = "transcription"  # Default to transcription-based detection
    keyterms = None  # Dictionary words for STT boosting

    if workflow.workflow_configurations:
@ -468,6 +472,16 @@ async def _run_pipeline(
                "max_user_idle_timeout"
            ]

+        # Use workflow-specific smart turn stop timeout if provided
+        if "smart_turn_stop_secs" in workflow.workflow_configurations:
+            smart_turn_stop_secs = workflow.workflow_configurations[
+                "smart_turn_stop_secs"
+            ]
+
+        # Use workflow-specific turn stop strategy if provided
+        if "turn_stop_strategy" in workflow.workflow_configurations:
+            turn_stop_strategy = workflow.workflow_configurations["turn_stop_strategy"]
+
        # Extract dictionary words and convert to keyterms list
        if "dictionary" in workflow.workflow_configurations:
            dictionary = workflow.workflow_configurations["dictionary"]
@ -551,9 +565,9 @@ async def _run_pipeline(
        correct_aggregation_callback=engine.create_aggregation_correction_callback(),
    )

-    # Configure turn strategies based on STT provider and model
+    # Configure turn strategies based on STT provider, model, and workflow configuration
    # Deepgram Flux uses external turn detection (VAD + External start/stop)
-    # Other models use transcription-based turn detection with smart turn analyzer
+    # Other models use configurable turn detection strategy
    is_deepgram_flux = (
        user_config.stt.provider == ServiceProviders.DEEPGRAM.value
        and user_config.stt.model == "flux-general-en"
@ -564,15 +578,23 @@ async def _run_pipeline(
            start=[VADUserTurnStartStrategy(), TranscriptionUserTurnStartStrategy()],
            stop=[ExternalUserTurnStopStrategy()],
        )
-    else:
+    elif turn_stop_strategy == "turn_analyzer":
+        # Smart Turn Analyzer: best for longer responses with natural pauses
+        smart_turn_params = SmartTurnParams(stop_secs=smart_turn_stop_secs)
        user_turn_strategies = UserTurnStrategies(
            start=[VADUserTurnStartStrategy(), TranscriptionUserTurnStartStrategy()],
            stop=[
                TurnAnalyzerUserTurnStopStrategy(
-                    turn_analyzer=LocalSmartTurnAnalyzerV3()
+                    turn_analyzer=LocalSmartTurnAnalyzerV3(params=smart_turn_params)
                )
            ],
        )
+    else:
+        # Transcription-based (default): best for short 1-2 word responses
+        user_turn_strategies = UserTurnStrategies(
+            start=[VADUserTurnStartStrategy(), TranscriptionUserTurnStartStrategy()],
+            stop=[TranscriptionUserTurnStopStrategy()],
+        )

    # Create user mute strategies
    # - CallbackUserMuteStrategy: mutes based on engine's _mute_pipeline state
--- a/api/tasks/knowledge_base_processing.py
+++ b/api/tasks/knowledge_base_processing.py
@ -125,15 +125,11 @@ async def process_knowledge_base_document(
        embeddings_api_key = None
        embeddings_model = None
        if document.created_by:
-            user_config = await db_client.get_user_configurations(
-                document.created_by
-            )
+            user_config = await db_client.get_user_configurations(document.created_by)
            if user_config.embeddings:
                embeddings_api_key = user_config.embeddings.api_key
                embeddings_model = user_config.embeddings.model
-                logger.info(
-                    f"Using user embeddings config: model={embeddings_model}"
-                )
+                logger.info(f"Using user embeddings config: model={embeddings_model}")

        # Check if API key is configured
        if not embeddings_api_key: