diff --git a/api/services/configuration/registry.py b/api/services/configuration/registry.py index f433981..4e828b2 100644 --- a/api/services/configuration/registry.py +++ b/api/services/configuration/registry.py @@ -332,6 +332,7 @@ class CartesiaTTSConfiguration(BaseTTSConfiguration): default="sonic-3", json_schema_extra={"examples": CARTESIA_TTS_MODELS} ) voice: str = Field(default="3faa81ae-d3d8-4ab1-9e44-e50e46d33c30") + speed: float = Field(default=1.0, ge=0.6, le=1.5, description="Speed of the voice") SARVAM_TTS_MODELS = ["bulbul:v2", "bulbul:v3"] diff --git a/api/services/pipecat/service_factory.py b/api/services/pipecat/service_factory.py index 0ffd44e..caf3234 100644 --- a/api/services/pipecat/service_factory.py +++ b/api/services/pipecat/service_factory.py @@ -8,7 +8,7 @@ from api.services.configuration.registry import ServiceProviders from pipecat.services.aws.llm import AWSBedrockLLMService, AWSBedrockLLMSettings from pipecat.services.azure.llm import AzureLLMService, AzureLLMSettings from pipecat.services.cartesia.stt import CartesiaSTTService -from pipecat.services.cartesia.tts import CartesiaTTSService, CartesiaTTSSettings +from pipecat.services.cartesia.tts import CartesiaTTSService, CartesiaTTSSettings, GenerationConfig from pipecat.services.deepgram.flux.stt import ( DeepgramFluxSTTService, DeepgramFluxSTTSettings, @@ -211,11 +211,14 @@ def create_tts_service(user_config, audio_config: "AudioConfig"): silence_time_s=1.0, ) elif user_config.tts.provider == ServiceProviders.CARTESIA.value: + speed = getattr(user_config.tts, "speed", None) + generation_config = GenerationConfig(speed=speed) if speed and speed != 1.0 else None return CartesiaTTSService( api_key=user_config.tts.api_key, settings=CartesiaTTSSettings( voice=user_config.tts.voice, model=user_config.tts.model, + **({"generation_config": generation_config} if generation_config else {}), ), text_filters=[xml_function_tag_filter], silence_time_s=1.0,