From 7a102026fbe90bed89455919ed1f9912cbab634b Mon Sep 17 00:00:00 2001 From: Abhishek Kumar Date: Mon, 9 Feb 2026 16:25:09 +0530 Subject: [PATCH] fix: send sample rate to STT services --- api/services/looptalk/core/pipeline_builder.py | 2 +- api/services/pipecat/audio_config.py | 8 ++++---- api/services/pipecat/run_pipeline.py | 2 +- api/services/pipecat/service_factory.py | 11 ++++++++--- api/services/pipecat/transport_setup.py | 2 +- pipecat | 2 +- 6 files changed, 16 insertions(+), 11 deletions(-) diff --git a/api/services/looptalk/core/pipeline_builder.py b/api/services/looptalk/core/pipeline_builder.py index 1426937..bc7b186 100644 --- a/api/services/looptalk/core/pipeline_builder.py +++ b/api/services/looptalk/core/pipeline_builder.py @@ -91,7 +91,7 @@ class LoopTalkPipelineBuilder: logger.info(f"Using {len(keyterms)} keyterms for STT: {keyterms}") # Create services - stt = create_stt_service(user_config, keyterms=keyterms) + stt = create_stt_service(user_config, audio_config, keyterms=keyterms) llm = create_llm_service(user_config) tts = create_tts_service(user_config, audio_config) diff --git a/api/services/pipecat/audio_config.py b/api/services/pipecat/audio_config.py index 829a8be..6bb0e8c 100644 --- a/api/services/pipecat/audio_config.py +++ b/api/services/pipecat/audio_config.py @@ -104,7 +104,7 @@ def create_audio_config(transport_type: str) -> AudioConfig: transport_out_sample_rate=8000, vad_sample_rate=8000, # Use matching VAD rate pipeline_sample_rate=8000, # Keep at 8kHz to avoid resampling - buffer_size_seconds=1.0, + buffer_size_seconds=5.0, ) elif transport_type == WorkflowRunMode.VONAGE.value: # Vonage uses 16kHz Linear PCM @@ -113,7 +113,7 @@ def create_audio_config(transport_type: str) -> AudioConfig: transport_out_sample_rate=16000, vad_sample_rate=16000, # Use matching VAD rate pipeline_sample_rate=16000, # Keep at 16kHz to avoid resampling - buffer_size_seconds=1.0, + buffer_size_seconds=5.0, ) elif transport_type in [ 
WorkflowRunMode.WEBRTC.value, @@ -126,7 +126,7 @@ def create_audio_config(transport_type: str) -> AudioConfig: transport_out_sample_rate=16000, # Transport will resample to 24kHz vad_sample_rate=16000, # VAD native rate pipeline_sample_rate=16000, # Keep pipeline at 16kHz - buffer_size_seconds=1.0, + buffer_size_seconds=5.0, ) else: # Default configuration @@ -138,5 +138,5 @@ def create_audio_config(transport_type: str) -> AudioConfig: transport_out_sample_rate=16000, vad_sample_rate=16000, pipeline_sample_rate=16000, - buffer_size_seconds=1.0, + buffer_size_seconds=5.0, ) diff --git a/api/services/pipecat/run_pipeline.py b/api/services/pipecat/run_pipeline.py index 1f6356a..484d511 100644 --- a/api/services/pipecat/run_pipeline.py +++ b/api/services/pipecat/run_pipeline.py @@ -494,7 +494,7 @@ async def _run_pipeline( ] # Create services based on user configuration - stt = create_stt_service(user_config, keyterms=keyterms) + stt = create_stt_service(user_config, audio_config, keyterms=keyterms) tts = create_tts_service(user_config, audio_config) llm = create_llm_service(user_config) diff --git a/api/services/pipecat/service_factory.py b/api/services/pipecat/service_factory.py index 129aa00..4fb3b67 100644 --- a/api/services/pipecat/service_factory.py +++ b/api/services/pipecat/service_factory.py @@ -30,7 +30,7 @@ if TYPE_CHECKING: from api.services.pipecat.audio_config import AudioConfig -def create_stt_service(user_config, keyterms: list[str] | None = None): +def create_stt_service(user_config, audio_config: "AudioConfig", keyterms: list[str] | None = None): """Create and return appropriate STT service based on user configuration Args: @@ -53,6 +53,7 @@ def create_stt_service(user_config, keyterms: list[str] | None = None): keyterm=keyterms or [], ), should_interrupt=False, # Let UserAggregator take care of sending InterruptionFrame + sample_rate=audio_config.transport_in_sample_rate ) # Other models than flux @@ -63,20 +64,21 @@ def 
create_stt_service(user_config, keyterms: list[str] | None = None): profanity_filter=False, endpointing=100, model=user_config.stt.model, - keyterm=keyterms or [], + keyterm=keyterms or [] ) logger.debug(f"Using DeepGram Model - {user_config.stt.model}") return DeepgramSTTService( live_options=live_options, api_key=user_config.stt.api_key, should_interrupt=False, # Let UserAggregator take care of sending InterruptionFrame + sample_rate=audio_config.transport_in_sample_rate ) elif user_config.stt.provider == ServiceProviders.OPENAI.value: return OpenAISTTService( api_key=user_config.stt.api_key, model=user_config.stt.model ) elif user_config.stt.provider == ServiceProviders.CARTESIA.value: - return CartesiaSTTService(api_key=user_config.stt.api_key) + return CartesiaSTTService(api_key=user_config.stt.api_key, sample_rate=audio_config.transport_in_sample_rate) elif user_config.stt.provider == ServiceProviders.DOGRAH.value: base_url = MPS_API_URL.replace("http://", "ws://").replace("https://", "wss://") language = getattr(user_config.stt, "language", None) or "multi" @@ -86,6 +88,7 @@ def create_stt_service(user_config, keyterms: list[str] | None = None): model=user_config.stt.model, language=language, keyterms=keyterms, + sample_rate=audio_config.transport_in_sample_rate ) elif user_config.stt.provider == ServiceProviders.SARVAM.value: # Map Sarvam language code to pipecat Language enum @@ -109,6 +112,7 @@ def create_stt_service(user_config, keyterms: list[str] | None = None): api_key=user_config.stt.api_key, model=user_config.stt.model, params=SarvamSTTService.InputParams(language=pipecat_language), + sample_rate=audio_config.transport_in_sample_rate ) elif user_config.stt.provider == ServiceProviders.SPEECHMATICS.value: from pipecat.services.speechmatics.stt import ( @@ -134,6 +138,7 @@ def create_stt_service(user_config, keyterms: list[str] | None = None): operating_point=operating_point, additional_vocab=additional_vocab, ), + 
sample_rate=audio_config.transport_in_sample_rate ) else: raise HTTPException( diff --git a/api/services/pipecat/transport_setup.py b/api/services/pipecat/transport_setup.py index 5c2752b..0290062 100644 --- a/api/services/pipecat/transport_setup.py +++ b/api/services/pipecat/transport_setup.py @@ -368,7 +368,7 @@ def create_stasis_transport( audio_out_enabled=True, audio_out_sample_rate=audio_config.transport_out_sample_rate, audio_in_sample_rate=audio_config.transport_in_sample_rate, - audio_out_10ms_chunks=2, # Send 20ms packets + # audio_out_10ms_chunks=2, # ToDo: Check if we can't support 40 ms packets? audio_out_mixer=( SoundfileMixer( sound_files={ diff --git a/pipecat b/pipecat index 5313e8c..d67983b 160000 --- a/pipecat +++ b/pipecat @@ -1 +1 @@ -Subproject commit 5313e8cd94443f220cc56c10cc2fc2aa98d8b6ba +Subproject commit d67983b3b165f945a93e5ce594f47781a96bff9b