Merge branch 'main' into feat/call-tags

2026-06-16 08:25:18 +02:00 · 2026-02-16 13:07:23 +05:30 · 2026-02-16 13:07:23 +05:30 · ea0967fd9c
commit ea0967fd9c
parent 15809e03a4 5d14d17ceb
41 changed files with 480 additions and 317 deletions
--- a/api/services/configuration/registry.py
+++ b/api/services/configuration/registry.py
@ -278,7 +278,48 @@ class DograhTTSService(BaseTTSConfiguration):


 SARVAM_TTS_MODELS = ["bulbul:v2", "bulbul:v3"]
-SARVAM_VOICES = ["anushka", "manisha", "vidya", "arya", "abhilash", "karun", "hitesh"]
+SARVAM_V2_VOICES = ["anushka", "manisha", "vidya", "arya", "abhilash", "karun", "hitesh"]
+SARVAM_V3_VOICES = [
+    "shubh",
+    "aditya",
+    "ritu",
+    "priya",
+    "neha",
+    "rahul",
+    "pooja",
+    "rohan",
+    "simran",
+    "kavya",
+    "amit",
+    "dev",
+    "ishita",
+    "shreya",
+    "ratan",
+    "varun",
+    "manan",
+    "sumit",
+    "roopa",
+    "kabir",
+    "aayan",
+    "ashutosh",
+    "advait",
+    "amelia",
+    "sophia",
+    "anand",
+    "tanya",
+    "tarun",
+    "sunny",
+    "mani",
+    "gokul",
+    "vijay",
+    "shruti",
+    "suhani",
+    "mohit",
+    "kavitha",
+    "rehan",
+    "soham",
+    "rupali",
+]
 SARVAM_LANGUAGES = [
    "bn-IN",
    "en-IN",
@ -301,7 +342,16 @@ class SarvamTTSConfiguration(BaseTTSConfiguration):
    model: str = Field(
        default="bulbul:v2", json_schema_extra={"examples": SARVAM_TTS_MODELS}
    )
-    voice: str = Field(default="anushka", json_schema_extra={"examples": SARVAM_VOICES})
+    voice: str = Field(
+        default="anushka",
+        json_schema_extra={
+            "examples": SARVAM_V2_VOICES,
+            "model_options": {
+                "bulbul:v2": SARVAM_V2_VOICES,
+                "bulbul:v3": SARVAM_V3_VOICES,
+            },
+        },
+    )
    language: str = Field(
        default="hi-IN", json_schema_extra={"examples": SARVAM_LANGUAGES}
    )
@ -322,39 +372,89 @@ TTSConfig = Annotated[
 ###################################################### STT ########################################################################


-DEEPGRAM_STT_MODELS = ["nova-2", "nova-3-general", "flux-general-en"]
+DEEPGRAM_STT_MODELS = ["nova-3-general", "flux-general-en"]
 DEEPGRAM_LANGUAGES = [
    "multi",
+    "ar",
+    "ar-AE",
+    "ar-SA",
+    "ar-QA",
+    "ar-KW",
+    "ar-SY",
+    "ar-LB",
+    "ar-PS",
+    "ar-JO",
+    "ar-EG",
+    "ar-SD",
+    "ar-TD",
+    "ar-MA",
+    "ar-DZ",
+    "ar-TN",
+    "ar-IQ",
+    "ar-IR",
+    "be",
+    "bn",
+    "bs",
+    "bg",
+    "ca",
+    "cs",
+    "da",
+    "da-DK",
+    "de",
+    "de-CH",
+    "el",
    "en",
    "en-US",
-    "en-GB",
    "en-AU",
+    "en-GB",
    "en-IN",
+    "en-NZ",
    "es",
    "es-419",
+    "et",
+    "fa",
+    "fi",
    "fr",
    "fr-CA",
-    "de",
+    "he",
+    "hi",
+    "hr",
+    "hu",
+    "id",
    "it",
+    "ja",
+    "kn",
+    "ko",
+    "ko-KR",
+    "lt",
+    "lv",
+    "mk",
+    "mr",
+    "ms",
+    "nl",
+    "nl-BE",
+    "no",
+    "pl",
    "pt",
    "pt-BR",
-    "nl",
-    "hi",
-    "ja",
-    "ko",
-    "zh-CN",
-    "zh-TW",
+    "pt-PT",
+    "ro",
    "ru",
-    "pl",
+    "sk",
+    "sl",
+    "sr",
+    "sv",
+    "sv-SE",
+    "ta",
+    "te",
+    "th",
+    "tl",
    "tr",
    "uk",
+    "ur",
    "vi",
-    "sv",
-    "da",
-    "no",
-    "fi",
-    "id",
-    "th",
+    "zh-CN",
+    "zh-TW",
 ]


@ -365,7 +465,14 @@ class DeepgramSTTConfiguration(BaseSTTConfiguration):
        default="nova-3-general", json_schema_extra={"examples": DEEPGRAM_STT_MODELS}
    )
    language: str = Field(
-        default="multi", json_schema_extra={"examples": DEEPGRAM_LANGUAGES}
+        default="multi",
+        json_schema_extra={
+            "examples": DEEPGRAM_LANGUAGES,
+            "model_options": {
+                "nova-3-general": DEEPGRAM_LANGUAGES,
+                "flux-general-en": ["en"],
+            },
+        },
    )
    api_key: str

@ -390,39 +497,7 @@ class OpenAISTTConfiguration(BaseSTTConfiguration):

 # Dograh STT Service
 DOGRAH_STT_MODELS = ["default"]
-DOGRAH_STT_LANGUAGES = [
-    "multi",
-    "en",
-    "en-US",
-    "en-GB",
-    "en-AU",
-    "en-IN",
-    "es",
-    "es-419",
-    "fr",
-    "fr-CA",
-    "de",
-    "it",
-    "pt",
-    "pt-BR",
-    "nl",
-    "hi",
-    "ja",
-    "ko",
-    "zh-CN",
-    "zh-TW",
-    "ru",
-    "pl",
-    "tr",
-    "uk",
-    "vi",
-    "sv",
-    "da",
-    "no",
-    "fi",
-    "id",
-    "th",
-]
+DOGRAH_STT_LANGUAGES = DEEPGRAM_LANGUAGES


@register_stt
--- a/api/services/looptalk/orchestrator.py
+++ b/api/services/looptalk/orchestrator.py
@ -14,7 +14,7 @@ from api.services.looptalk.internal_transport import (
 )
 from api.services.pipecat.transport_setup import create_internal_transport
 from pipecat.pipeline.task import PipelineTask
-from pipecat.utils.context import set_current_run_id
+from pipecat.utils.run_context import set_current_run_id

 from .core.pipeline_builder import LoopTalkPipelineBuilder
 from .core.recording_manager import RecordingManager
--- a/api/services/pipecat/pipeline_builder.py
+++ b/api/services/pipecat/pipeline_builder.py
@ -10,7 +10,7 @@ from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.llm_context import LLMContext
 from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor
-from pipecat.utils.context import turn_var
+from pipecat.utils.run_context import turn_var


 def create_pipeline_components(audio_config: AudioConfig):
--- a/api/services/pipecat/run_pipeline.py
+++ b/api/services/pipecat/run_pipeline.py
@ -62,6 +62,7 @@ from pipecat.turns.user_mute import (
    MuteUntilFirstBotCompleteUserMuteStrategy,
 )
 from pipecat.turns.user_start import (
+    ExternalUserTurnStartStrategy,
    TranscriptionUserTurnStartStrategy,
 )
 from pipecat.turns.user_start.vad_user_turn_start_strategy import (
@ -69,12 +70,12 @@ from pipecat.turns.user_start.vad_user_turn_start_strategy import (
 )
 from pipecat.turns.user_stop import (
    ExternalUserTurnStopStrategy,
-    TranscriptionUserTurnStopStrategy,
+    SpeechTimeoutUserTurnStopStrategy,
    TurnAnalyzerUserTurnStopStrategy,
 )
 from pipecat.turns.user_turn_strategies import UserTurnStrategies
-from pipecat.utils.context import set_current_run_id
 from pipecat.utils.enums import EndTaskReason
+from pipecat.utils.run_context import set_current_run_id
 from pipecat.utils.tracing.context_registry import ContextProviderRegistry

 # Setup tracing if enabled
@ -265,7 +266,6 @@ async def run_pipeline_vobiz(
 async def run_pipeline_cloudonix(
    websocket_client: WebSocket,
    stream_sid: str,
-    call_sid: str,
    workflow_id: int,
    workflow_run_id: int,
    user_id: int,
@ -274,10 +274,15 @@ async def run_pipeline_cloudonix(
    logger.debug(
        f"Running pipeline for Cloudonix connection with workflow_id: {workflow_id} and workflow_run_id: {workflow_run_id}"
    )
-    set_current_run_id(workflow_run_id)
+
+    workflow_run = await db_client.get_workflow_run_by_id(workflow_run_id)
+    call_id = workflow_run.gathered_context.get("call_id")
+    if not call_id:
+        logger.warning("call_id not found in gathered_context")
+        raise Exception()

    # Store call ID in cost_info for later cost calculation (provider-agnostic)
-    cost_info = {"call_id": call_sid}
+    cost_info = {"call_id": call_id}
    await db_client.update_workflow_run(workflow_run_id, cost_info=cost_info)

    # Get workflow to extract all pipeline configurations
@ -292,26 +297,18 @@ async def run_pipeline_cloudonix(
                "ambient_noise_configuration"
            ]

-    # Retrieve session_token from workflow_run gathered_context
-    workflow_run = await db_client.get_workflow_run(workflow_run_id)
-    session_token = None
-    if workflow_run and workflow_run.gathered_context:
-        session_token = workflow_run.gathered_context.get("session_token")
-        logger.debug(f"Retrieved session_token from workflow_run: {session_token}")
-
    # Create audio configuration for Cloudonix
    audio_config = create_audio_config(WorkflowRunMode.CLOUDONIX.value)

    transport = await create_cloudonix_transport(
        websocket_client,
+        call_id,
        stream_sid,
-        call_sid,
        workflow_run_id,
        audio_config,
        workflow.organization_id,
        vad_config,
        ambient_noise_config,
-        session_token,
    )
    await _run_pipeline(
        transport,
@ -580,7 +577,10 @@ async def _run_pipeline(

    if is_deepgram_flux:
        user_turn_strategies = UserTurnStrategies(
-            start=[VADUserTurnStartStrategy(), TranscriptionUserTurnStartStrategy()],
+            start=[
+                VADUserTurnStartStrategy(),
+                ExternalUserTurnStartStrategy(enable_interruptions=True),
+            ],
            stop=[ExternalUserTurnStopStrategy()],
        )
    elif turn_stop_strategy == "turn_analyzer":
@ -598,7 +598,7 @@ async def _run_pipeline(
        # Default: transcription-based turn start with speech-timeout turn stop; best for short 1-2 word responses
        user_turn_strategies = UserTurnStrategies(
            start=[VADUserTurnStartStrategy(), TranscriptionUserTurnStartStrategy()],
-            stop=[TranscriptionUserTurnStopStrategy()],
+            stop=[SpeechTimeoutUserTurnStopStrategy()],
        )

    # Create user mute strategies
--- a/api/services/pipecat/service_factory.py
+++ b/api/services/pipecat/service_factory.py
@ -30,7 +30,9 @@ if TYPE_CHECKING:
    from api.services.pipecat.audio_config import AudioConfig


-def create_stt_service(user_config, audio_config: "AudioConfig", keyterms: list[str] | None = None):
+def create_stt_service(
+    user_config, audio_config: "AudioConfig", keyterms: list[str] | None = None
+):
    """Create and return appropriate STT service based on user configuration

    Args:
@ -53,7 +55,7 @@ def create_stt_service(user_config, audio_config: "AudioConfig", keyterms: list[
                    keyterm=keyterms or [],
                ),
                should_interrupt=False,  # Let UserAggregator take care of sending InterruptionFrame
-                sample_rate=audio_config.transport_in_sample_rate
+                sample_rate=audio_config.transport_in_sample_rate,
            )

        # Models other than flux
@ -64,21 +66,24 @@ def create_stt_service(user_config, audio_config: "AudioConfig", keyterms: list[
            profanity_filter=False,
            endpointing=100,
            model=user_config.stt.model,
-            keyterm=keyterms or []
+            keyterm=keyterms or [],
        )
        logger.debug(f"Using DeepGram Model - {user_config.stt.model}")
        return DeepgramSTTService(
            live_options=live_options,
            api_key=user_config.stt.api_key,
            should_interrupt=False,  # Let UserAggregator take care of sending InterruptionFrame
-            sample_rate=audio_config.transport_in_sample_rate
+            sample_rate=audio_config.transport_in_sample_rate,
        )
    elif user_config.stt.provider == ServiceProviders.OPENAI.value:
        return OpenAISTTService(
            api_key=user_config.stt.api_key, model=user_config.stt.model
        )
    elif user_config.stt.provider == ServiceProviders.CARTESIA.value:
-        return CartesiaSTTService(api_key=user_config.stt.api_key, sample_rate=audio_config.transport_in_sample_rate)
+        return CartesiaSTTService(
+            api_key=user_config.stt.api_key,
+            sample_rate=audio_config.transport_in_sample_rate,
+        )
    elif user_config.stt.provider == ServiceProviders.DOGRAH.value:
        base_url = MPS_API_URL.replace("http://", "ws://").replace("https://", "wss://")
        language = getattr(user_config.stt, "language", None) or "multi"
@ -88,7 +93,7 @@ def create_stt_service(user_config, audio_config: "AudioConfig", keyterms: list[
            model=user_config.stt.model,
            language=language,
            keyterms=keyterms,
-            sample_rate=audio_config.transport_in_sample_rate
+            sample_rate=audio_config.transport_in_sample_rate,
        )
    elif user_config.stt.provider == ServiceProviders.SARVAM.value:
        # Map Sarvam language code to pipecat Language enum
@ -112,7 +117,7 @@ def create_stt_service(user_config, audio_config: "AudioConfig", keyterms: list[
            api_key=user_config.stt.api_key,
            model=user_config.stt.model,
            params=SarvamSTTService.InputParams(language=pipecat_language),
-            sample_rate=audio_config.transport_in_sample_rate
+            sample_rate=audio_config.transport_in_sample_rate,
        )
    elif user_config.stt.provider == ServiceProviders.SPEECHMATICS.value:
        from pipecat.services.speechmatics.stt import (
@ -138,7 +143,7 @@ def create_stt_service(user_config, audio_config: "AudioConfig", keyterms: list[
                operating_point=operating_point,
                additional_vocab=additional_vocab,
            ),
-            sample_rate=audio_config.transport_in_sample_rate
+            sample_rate=audio_config.transport_in_sample_rate,
        )
    else:
        raise HTTPException(
--- a/api/services/pipecat/transport_setup.py
+++ b/api/services/pipecat/transport_setup.py
@ -94,14 +94,13 @@ async def create_twilio_transport(

 async def create_cloudonix_transport(
    websocket_client: WebSocket,
+    call_id: str,
    stream_sid: str,
-    call_sid: str,
    workflow_run_id: int,
    audio_config: AudioConfig,
    organization_id: int,
    vad_config: dict | None = None,
    ambient_noise_config: dict | None = None,
-    session_token: str | None = None,
 ):
    """Create a transport for Cloudonix connections"""

@ -125,11 +124,10 @@ async def create_cloudonix_transport(
    from pipecat.serializers.cloudonix import CloudonixFrameSerializer

    serializer = CloudonixFrameSerializer(
+        call_id=call_id,
        stream_sid=stream_sid,
-        call_sid=call_sid,
        domain_id=domain_id,
        bearer_token=bearer_token,
-        session_token=session_token,
    )

    return FastAPIWebsocketTransport(
--- a/api/services/pipecat/turn_context.py
+++ b/api/services/pipecat/turn_context.py
@ -8,7 +8,7 @@ propagate through asyncio.create_task() calls.
 import asyncio
 from typing import Dict, Optional

-from pipecat.utils.context import turn_var
+from pipecat.utils.run_context import turn_var


 class TurnContextManager:
--- a/api/services/telephony/providers/cloudonix_provider.py
+++ b/api/services/telephony/providers/cloudonix_provider.py
@ -395,10 +395,6 @@ class CloudonixProvider(TelephonyProvider):
                await websocket.close(code=4400, reason="Expected connected event")
                return

-            logger.debug(
-                f"Cloudonix WebSocket connected for workflow_run {workflow_run_id}"
-            )
-
            # Wait for "start" event with stream details
            start_msg = await websocket.receive_text()
            logger.debug(f"Received start message: {start_msg}")
@ -418,9 +414,14 @@ class CloudonixProvider(TelephonyProvider):
                await websocket.close(code=4400, reason="Missing stream identifiers")
                return

+            logger.debug(
+                f"Cloudonix WebSocket connected for workflow_run {workflow_run_id} "
+                f"stream_sid: {stream_sid} call_sid: {call_sid}"
+            )
+
            # Run the Cloudonix pipeline
            await run_pipeline_cloudonix(
-                websocket, stream_sid, call_sid, workflow_id, workflow_run_id, user_id
+                websocket, stream_sid, workflow_id, workflow_run_id, user_id
            )

        except Exception as e:
--- a/api/services/telephony/providers/twilio_provider.py
+++ b/api/services/telephony/providers/twilio_provider.py
@ -110,7 +110,7 @@ class TwilioProvider(TelephonyProvider):
                return CallInitiationResult(
                    call_id=response_data["sid"],
                    status=response_data.get("status", "queued"),
-                    provider_metadata={},  # Twilio doesn't need to persist extra data
+                    provider_metadata={"call_id": response_data["sid"]},
                    raw_response=response_data,
                )

--- a/api/services/telephony/providers/vobiz_provider.py
+++ b/api/services/telephony/providers/vobiz_provider.py
@ -150,7 +150,7 @@ class VobizProvider(TelephonyProvider):
                return CallInitiationResult(
                    call_id=call_id,
                    status="queued",  # Vobiz returns "message": "call fired"
-                    provider_metadata={},
+                    provider_metadata={"call_id": call_id},
                    raw_response=response_data,
                )

--- a/api/services/telephony/providers/vonage_provider.py
+++ b/api/services/telephony/providers/vonage_provider.py
@ -138,10 +138,8 @@ class VonageProvider(TelephonyProvider):
                    call_id=response_data["uuid"],
                    status=response_data.get("status", "started"),
                    provider_metadata={
-                        "call_uuid": response_data[
-                            "uuid"
-                        ]  # Vonage needs UUID persisted for WebSocket
-                    },
+                        "call_uuid": response_data["uuid"]
+                    },  # Vonage needs UUID persisted for WebSocket
                    raw_response=response_data,
                )

--- a/api/services/telephony/worker_event_subscriber.py
+++ b/api/services/telephony/worker_event_subscriber.py
@ -23,7 +23,7 @@ from api.services.telephony.stasis_event_protocol import (
    parse_event,
 )
 from api.services.telephony.stasis_rtp_connection import StasisRTPConnection
-from pipecat.utils.context import set_current_run_id
+from pipecat.utils.run_context import set_current_run_id


 class WorkerEventSubscriber:
--- a/api/services/workflow/pipecat_engine_callbacks.py
+++ b/api/services/workflow/pipecat_engine_callbacks.py
@ -116,6 +116,10 @@ def create_aggregation_correction_callback(engine: "PipecatEngine"):
        if corrupted in ref or len(alnum_ref) < len(alnum_corr) or len(alnum_corr) < 10:
            return corrupted

+        logger.debug(
+            f"In correct_corrupted_aggregation: ref: {ref} corrupted: {corrupted}"
+        )
+
        # 2) Find where in `ref` we should start aligning.
        #    We take the first N (N=10) characters of `corrupted`
        #    and look for all their occurrences in `ref`.