Merge branch 'main' into feat/call-tags

This commit is contained in:
Abhishek Kumar 2026-02-16 13:07:23 +05:30
commit ea0967fd9c
41 changed files with 480 additions and 317 deletions

View file

@ -278,7 +278,48 @@ class DograhTTSService(BaseTTSConfiguration):
SARVAM_TTS_MODELS = ["bulbul:v2", "bulbul:v3"]
SARVAM_VOICES = ["anushka", "manisha", "vidya", "arya", "abhilash", "karun", "hitesh"]
SARVAM_V2_VOICES = ["anushka", "manisha", "vidya", "arya", "abhilash", "karun", "hitesh"]
SARVAM_V3_VOICES = [
"shubh",
"aditya",
"ritu",
"priya",
"neha",
"rahul",
"pooja",
"rohan",
"simran",
"kavya",
"amit",
"dev",
"ishita",
"shreya",
"ratan",
"varun",
"manan",
"sumit",
"roopa",
"kabir",
"aayan",
"ashutosh",
"advait",
"amelia",
"sophia",
"anand",
"tanya",
"tarun",
"sunny",
"mani",
"gokul",
"vijay",
"shruti",
"suhani",
"mohit",
"kavitha",
"rehan",
"soham",
"rupali",
]
SARVAM_LANGUAGES = [
"bn-IN",
"en-IN",
@ -301,7 +342,16 @@ class SarvamTTSConfiguration(BaseTTSConfiguration):
model: str = Field(
default="bulbul:v2", json_schema_extra={"examples": SARVAM_TTS_MODELS}
)
voice: str = Field(default="anushka", json_schema_extra={"examples": SARVAM_VOICES})
voice: str = Field(
default="anushka",
json_schema_extra={
"examples": SARVAM_V2_VOICES,
"model_options": {
"bulbul:v2": SARVAM_V2_VOICES,
"bulbul:v3": SARVAM_V3_VOICES,
},
},
)
language: str = Field(
default="hi-IN", json_schema_extra={"examples": SARVAM_LANGUAGES}
)
@ -322,39 +372,89 @@ TTSConfig = Annotated[
###################################################### STT ########################################################################
DEEPGRAM_STT_MODELS = ["nova-2", "nova-3-general", "flux-general-en"]
DEEPGRAM_STT_MODELS = ["nova-3-general", "flux-general-en"]
DEEPGRAM_LANGUAGES = [
"multi",
"ar",
"ar-AE",
"ar-SA",
"ar-QA",
"ar-KW",
"ar-SY",
"ar-LB",
"ar-PS",
"ar-JO",
"ar-EG",
"ar-SD",
"ar-TD",
"ar-MA",
"ar-DZ",
"ar-TN",
"ar-IQ",
"ar-IR",
"be",
"bn",
"bs",
"bg",
"ca",
"cs",
"da",
"da-DK",
"de",
"de-CH",
"el",
"en",
"en-US",
"en-GB",
"en-AU",
"en-GB",
"en-IN",
"en-NZ",
"es",
"es-419",
"et",
"fa",
"fi",
"fr",
"fr-CA",
"de",
"he",
"hi",
"hr",
"hu",
"id",
"it",
"ja",
"kn",
"ko",
"ko-KR",
"lt",
"lv",
"mk",
"mr",
"ms",
"nl",
"nl-BE",
"no",
"pl",
"pt",
"pt-BR",
"nl",
"hi",
"ja",
"ko",
"zh-CN",
"zh-TW",
"pt-PT",
"ro",
"ru",
"pl",
"sk",
"sl",
"sr",
"sv",
"sv-SE",
"ta",
"te",
"th",
"tl",
"tr",
"uk",
"ur",
"vi",
"sv",
"da",
"no",
"fi",
"id",
"th",
"zh-CN",
"zh-TW",
]
@ -365,7 +465,14 @@ class DeepgramSTTConfiguration(BaseSTTConfiguration):
default="nova-3-general", json_schema_extra={"examples": DEEPGRAM_STT_MODELS}
)
language: str = Field(
default="multi", json_schema_extra={"examples": DEEPGRAM_LANGUAGES}
default="multi",
json_schema_extra={
"examples": DEEPGRAM_LANGUAGES,
"model_options": {
"nova-3-general": DEEPGRAM_LANGUAGES,
"flux-general-en": ["en"],
},
},
)
api_key: str
@ -390,39 +497,7 @@ class OpenAISTTConfiguration(BaseSTTConfiguration):
# Dograh STT Service
DOGRAH_STT_MODELS = ["default"]
DOGRAH_STT_LANGUAGES = [
"multi",
"en",
"en-US",
"en-GB",
"en-AU",
"en-IN",
"es",
"es-419",
"fr",
"fr-CA",
"de",
"it",
"pt",
"pt-BR",
"nl",
"hi",
"ja",
"ko",
"zh-CN",
"zh-TW",
"ru",
"pl",
"tr",
"uk",
"vi",
"sv",
"da",
"no",
"fi",
"id",
"th",
]
DOGRAH_STT_LANGUAGES = DEEPGRAM_LANGUAGES
@register_stt

View file

@ -14,7 +14,7 @@ from api.services.looptalk.internal_transport import (
)
from api.services.pipecat.transport_setup import create_internal_transport
from pipecat.pipeline.task import PipelineTask
from pipecat.utils.context import set_current_run_id
from pipecat.utils.run_context import set_current_run_id
from .core.pipeline_builder import LoopTalkPipelineBuilder
from .core.recording_manager import RecordingManager

View file

@ -10,7 +10,7 @@ from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor
from pipecat.utils.context import turn_var
from pipecat.utils.run_context import turn_var
def create_pipeline_components(audio_config: AudioConfig):

View file

@ -62,6 +62,7 @@ from pipecat.turns.user_mute import (
MuteUntilFirstBotCompleteUserMuteStrategy,
)
from pipecat.turns.user_start import (
ExternalUserTurnStartStrategy,
TranscriptionUserTurnStartStrategy,
)
from pipecat.turns.user_start.vad_user_turn_start_strategy import (
@ -69,12 +70,12 @@ from pipecat.turns.user_start.vad_user_turn_start_strategy import (
)
from pipecat.turns.user_stop import (
ExternalUserTurnStopStrategy,
TranscriptionUserTurnStopStrategy,
SpeechTimeoutUserTurnStopStrategy,
TurnAnalyzerUserTurnStopStrategy,
)
from pipecat.turns.user_turn_strategies import UserTurnStrategies
from pipecat.utils.context import set_current_run_id
from pipecat.utils.enums import EndTaskReason
from pipecat.utils.run_context import set_current_run_id
from pipecat.utils.tracing.context_registry import ContextProviderRegistry
# Setup tracing if enabled
@ -265,7 +266,6 @@ async def run_pipeline_vobiz(
async def run_pipeline_cloudonix(
websocket_client: WebSocket,
stream_sid: str,
call_sid: str,
workflow_id: int,
workflow_run_id: int,
user_id: int,
@ -274,10 +274,15 @@ async def run_pipeline_cloudonix(
logger.debug(
f"Running pipeline for Cloudonix connection with workflow_id: {workflow_id} and workflow_run_id: {workflow_run_id}"
)
set_current_run_id(workflow_run_id)
workflow_run = await db_client.get_workflow_run_by_id(workflow_run_id)
call_id = workflow_run.gathered_context.get("call_id")
if not call_id:
logger.warning("call_id not found in gathered_context")
raise Exception()
# Store call ID in cost_info for later cost calculation (provider-agnostic)
cost_info = {"call_id": call_sid}
cost_info = {"call_id": call_id}
await db_client.update_workflow_run(workflow_run_id, cost_info=cost_info)
# Get workflow to extract all pipeline configurations
@ -292,26 +297,18 @@ async def run_pipeline_cloudonix(
"ambient_noise_configuration"
]
# Retrieve session_token from workflow_run gathered_context
workflow_run = await db_client.get_workflow_run(workflow_run_id)
session_token = None
if workflow_run and workflow_run.gathered_context:
session_token = workflow_run.gathered_context.get("session_token")
logger.debug(f"Retrieved session_token from workflow_run: {session_token}")
# Create audio configuration for Cloudonix
audio_config = create_audio_config(WorkflowRunMode.CLOUDONIX.value)
transport = await create_cloudonix_transport(
websocket_client,
call_id,
stream_sid,
call_sid,
workflow_run_id,
audio_config,
workflow.organization_id,
vad_config,
ambient_noise_config,
session_token,
)
await _run_pipeline(
transport,
@ -580,7 +577,10 @@ async def _run_pipeline(
if is_deepgram_flux:
user_turn_strategies = UserTurnStrategies(
start=[VADUserTurnStartStrategy(), TranscriptionUserTurnStartStrategy()],
start=[
VADUserTurnStartStrategy(),
ExternalUserTurnStartStrategy(enable_interruptions=True),
],
stop=[ExternalUserTurnStopStrategy()],
)
elif turn_stop_strategy == "turn_analyzer":
@ -598,7 +598,7 @@ async def _run_pipeline(
# Transcription-based (default): best for short 1-2 word responses
user_turn_strategies = UserTurnStrategies(
start=[VADUserTurnStartStrategy(), TranscriptionUserTurnStartStrategy()],
stop=[TranscriptionUserTurnStopStrategy()],
stop=[SpeechTimeoutUserTurnStopStrategy()],
)
# Create user mute strategies

View file

@ -30,7 +30,9 @@ if TYPE_CHECKING:
from api.services.pipecat.audio_config import AudioConfig
def create_stt_service(user_config, audio_config: "AudioConfig", keyterms: list[str] | None = None):
def create_stt_service(
user_config, audio_config: "AudioConfig", keyterms: list[str] | None = None
):
"""Create and return appropriate STT service based on user configuration
Args:
@ -53,7 +55,7 @@ def create_stt_service(user_config, audio_config: "AudioConfig", keyterms: list[
keyterm=keyterms or [],
),
should_interrupt=False, # Let UserAggregator take care of sending InterruptionFrame
sample_rate=audio_config.transport_in_sample_rate
sample_rate=audio_config.transport_in_sample_rate,
)
# Other models than flux
@ -64,21 +66,24 @@ def create_stt_service(user_config, audio_config: "AudioConfig", keyterms: list[
profanity_filter=False,
endpointing=100,
model=user_config.stt.model,
keyterm=keyterms or []
keyterm=keyterms or [],
)
logger.debug(f"Using DeepGram Model - {user_config.stt.model}")
return DeepgramSTTService(
live_options=live_options,
api_key=user_config.stt.api_key,
should_interrupt=False, # Let UserAggregator take care of sending InterruptionFrame
sample_rate=audio_config.transport_in_sample_rate
sample_rate=audio_config.transport_in_sample_rate,
)
elif user_config.stt.provider == ServiceProviders.OPENAI.value:
return OpenAISTTService(
api_key=user_config.stt.api_key, model=user_config.stt.model
)
elif user_config.stt.provider == ServiceProviders.CARTESIA.value:
return CartesiaSTTService(api_key=user_config.stt.api_key, sample_rate=audio_config.transport_in_sample_rate)
return CartesiaSTTService(
api_key=user_config.stt.api_key,
sample_rate=audio_config.transport_in_sample_rate,
)
elif user_config.stt.provider == ServiceProviders.DOGRAH.value:
base_url = MPS_API_URL.replace("http://", "ws://").replace("https://", "wss://")
language = getattr(user_config.stt, "language", None) or "multi"
@ -88,7 +93,7 @@ def create_stt_service(user_config, audio_config: "AudioConfig", keyterms: list[
model=user_config.stt.model,
language=language,
keyterms=keyterms,
sample_rate=audio_config.transport_in_sample_rate
sample_rate=audio_config.transport_in_sample_rate,
)
elif user_config.stt.provider == ServiceProviders.SARVAM.value:
# Map Sarvam language code to pipecat Language enum
@ -112,7 +117,7 @@ def create_stt_service(user_config, audio_config: "AudioConfig", keyterms: list[
api_key=user_config.stt.api_key,
model=user_config.stt.model,
params=SarvamSTTService.InputParams(language=pipecat_language),
sample_rate=audio_config.transport_in_sample_rate
sample_rate=audio_config.transport_in_sample_rate,
)
elif user_config.stt.provider == ServiceProviders.SPEECHMATICS.value:
from pipecat.services.speechmatics.stt import (
@ -138,7 +143,7 @@ def create_stt_service(user_config, audio_config: "AudioConfig", keyterms: list[
operating_point=operating_point,
additional_vocab=additional_vocab,
),
sample_rate=audio_config.transport_in_sample_rate
sample_rate=audio_config.transport_in_sample_rate,
)
else:
raise HTTPException(

View file

@ -94,14 +94,13 @@ async def create_twilio_transport(
async def create_cloudonix_transport(
websocket_client: WebSocket,
call_id: str,
stream_sid: str,
call_sid: str,
workflow_run_id: int,
audio_config: AudioConfig,
organization_id: int,
vad_config: dict | None = None,
ambient_noise_config: dict | None = None,
session_token: str | None = None,
):
"""Create a transport for Cloudonix connections"""
@ -125,11 +124,10 @@ async def create_cloudonix_transport(
from pipecat.serializers.cloudonix import CloudonixFrameSerializer
serializer = CloudonixFrameSerializer(
call_id=call_id,
stream_sid=stream_sid,
call_sid=call_sid,
domain_id=domain_id,
bearer_token=bearer_token,
session_token=session_token,
)
return FastAPIWebsocketTransport(

View file

@ -8,7 +8,7 @@ propagate through asyncio.create_task() calls.
import asyncio
from typing import Dict, Optional
from pipecat.utils.context import turn_var
from pipecat.utils.run_context import turn_var
class TurnContextManager:

View file

@ -395,10 +395,6 @@ class CloudonixProvider(TelephonyProvider):
await websocket.close(code=4400, reason="Expected connected event")
return
logger.debug(
f"Cloudonix WebSocket connected for workflow_run {workflow_run_id}"
)
# Wait for "start" event with stream details
start_msg = await websocket.receive_text()
logger.debug(f"Received start message: {start_msg}")
@ -418,9 +414,14 @@ class CloudonixProvider(TelephonyProvider):
await websocket.close(code=4400, reason="Missing stream identifiers")
return
logger.debug(
f"Cloudonix WebSocket connected for workflow_run {workflow_run_id} "
f"stream_sid: {stream_sid} call_sid: {call_sid}"
)
# Run the Cloudonix pipeline
await run_pipeline_cloudonix(
websocket, stream_sid, call_sid, workflow_id, workflow_run_id, user_id
websocket, stream_sid, workflow_id, workflow_run_id, user_id
)
except Exception as e:

View file

@ -110,7 +110,7 @@ class TwilioProvider(TelephonyProvider):
return CallInitiationResult(
call_id=response_data["sid"],
status=response_data.get("status", "queued"),
provider_metadata={}, # Twilio doesn't need to persist extra data
provider_metadata={"call_id": response_data["sid"]},
raw_response=response_data,
)

View file

@ -150,7 +150,7 @@ class VobizProvider(TelephonyProvider):
return CallInitiationResult(
call_id=call_id,
status="queued", # Vobiz returns "message": "call fired"
provider_metadata={},
provider_metadata={"call_id": call_id},
raw_response=response_data,
)

View file

@ -138,10 +138,8 @@ class VonageProvider(TelephonyProvider):
call_id=response_data["uuid"],
status=response_data.get("status", "started"),
provider_metadata={
"call_uuid": response_data[
"uuid"
] # Vonage needs UUID persisted for WebSocket
},
"call_uuid": response_data["uuid"]
}, # Vonage needs UUID persisted for WebSocket
raw_response=response_data,
)

View file

@ -23,7 +23,7 @@ from api.services.telephony.stasis_event_protocol import (
parse_event,
)
from api.services.telephony.stasis_rtp_connection import StasisRTPConnection
from pipecat.utils.context import set_current_run_id
from pipecat.utils.run_context import set_current_run_id
class WorkerEventSubscriber:

View file

@ -116,6 +116,10 @@ def create_aggregation_correction_callback(engine: "PipecatEngine"):
if corrupted in ref or len(alnum_ref) < len(alnum_corr) or len(alnum_corr) < 10:
return corrupted
logger.debug(
f"In correct_corrupted_aggregation: ref: {ref} corrupted: {corrupted}"
)
# 2) Find where in `ref` we should start aligning.
# We take the first N (N=10) characters of `corrupted`
# and look for all their occurrences in `ref`.