feat: add voice selectors in elevenlabs (#88)

2026-06-25 08:48:13 +02:00 · 2025-12-25 15:05:53 +05:30 · 2025-12-25 15:05:53 +05:30 · 45c5b7c304
commit 45c5b7c304
parent 480e8a5f60
22 changed files with 978 additions and 166 deletions
--- a/api/Dockerfile
+++ b/api/Dockerfile
@ -20,7 +20,7 @@ RUN pip install --user --no-cache-dir -r requirements.txt && \

 # Copy and install pipecat from local submodule
 COPY pipecat /tmp/pipecat
-RUN pip install --user --no-cache-dir '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,soundfile,silero,webrtc,local-smart-turn-v3]' && \
+RUN pip install --user --no-cache-dir '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,local-smart-turn-v3]' && \
    # Clean up pip cache and temporary pipecat directory
    rm -rf /root/.cache/pip /tmp/pipecat

--- a/api/db/filters.py
+++ b/api/db/filters.py
@ -111,9 +111,7 @@ def apply_workflow_run_filters(
                    # (subscript [] only works in PostgreSQL 14+)
                    filter_conditions.append(
                        cast(WorkflowRunModel.gathered_context, JSONB)
-                        .op("->>")(
-                            "mapped_call_disposition"
-                        )
+                        .op("->>")("mapped_call_disposition")
                        .in_(codes)
                    )

@ -147,9 +145,7 @@ def apply_workflow_run_filters(
                    # Use ->> operator for compatibility with all PostgreSQL versions
                    filter_conditions.append(
                        cast(WorkflowRunModel.initial_context, JSONB)
-                        .op("->>")(
-                            "phone"
-                        )
+                        .op("->>")("phone")
                        .contains(phone)
                    )

@ -178,13 +174,9 @@ def apply_workflow_run_filters(
                        "total_cost_usd"
                    )
                    if min_val is not None:
-                        filter_conditions.append(
-                            cast(cost_text, Integer) >= min_val
-                        )
+                        filter_conditions.append(cast(cost_text, Integer) >= min_val)
                    if max_val is not None:
-                        filter_conditions.append(
-                            cast(cost_text, Integer) <= max_val
-                        )
+                        filter_conditions.append(cast(cost_text, Integer) <= max_val)

    if filter_conditions:
        base_query = base_query.where(and_(*filter_conditions))
--- a/api/routes/user.py
+++ b/api/routes/user.py
@ -1,7 +1,8 @@
 from datetime import datetime, timedelta
-from typing import List, Optional, TypedDict, Union
+from typing import List, Literal, Optional, TypedDict, Union

 from fastapi import APIRouter, Depends, HTTPException, Query
+from loguru import logger
 from pydantic import BaseModel

 from api.db import db_client
@ -17,6 +18,7 @@ from api.services.configuration.defaults import DEFAULT_SERVICE_PROVIDERS
 from api.services.configuration.masking import mask_user_config
 from api.services.configuration.merge import merge_user_configurations
 from api.services.configuration.registry import REGISTRY, ServiceType
+from api.services.mps_service_key_client import mps_service_key_client

 router = APIRouter(prefix="/user")

@ -274,3 +276,46 @@ async def reactivate_api_key(
        raise HTTPException(status_code=500, detail="Failed to reactivate API key")

    return {"success": True, "message": "API key reactivated successfully"}
+
+
+# Voice Configuration Endpoints
+TTSProvider = Literal["elevenlabs", "deepgram", "sarvam", "cartesia", "dograh"]
+
+
+class VoiceInfo(BaseModel):
+    voice_id: str
+    name: str
+    description: Optional[str] = None
+    accent: Optional[str] = None
+    gender: Optional[str] = None
+    language: Optional[str] = None
+    preview_url: Optional[str] = None
+
+
+class VoicesResponse(BaseModel):
+    provider: str
+    voices: List[VoiceInfo]
+
+
+@router.get("/configurations/voices/{provider}")
+async def get_voices(
+    provider: TTSProvider,
+    user: UserModel = Depends(get_user),
+) -> VoicesResponse:
+    """Get available voices for a TTS provider."""
+    try:
+        result = await mps_service_key_client.get_voices(
+            provider=provider,
+            organization_id=user.selected_organization_id,
+            created_by=user.provider_id,
+        )
+        return VoicesResponse(
+            provider=result.get("provider", provider),
+            voices=[VoiceInfo(**voice) for voice in result.get("voices", [])],
+        )
+    except Exception as e:
+        logger.error(f"Failed to fetch voices for {provider}: {e}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to fetch voices for {provider}",
+        )
--- a/api/services/auth/depends.py
+++ b/api/services/auth/depends.py
@ -11,10 +11,8 @@ from api.db.models import UserModel
 from api.schemas.user_configuration import UserConfiguration
 from api.services.auth.stack_auth import stackauth
 from api.services.configuration.registry import (
-    DograhLLMModel,
    DograhSTTModel,
    DograhTTSModel,
-    DograhVoice,
    ServiceProviders,
 )

@ -244,13 +242,13 @@ async def create_user_configuration_with_mps_key(
                    "llm": {
                        "provider": ServiceProviders.DOGRAH.value,
                        "api_key": service_key,
-                        "model": DograhLLMModel.DEFAULT.value,  # Default model
+                        "model": "default",  # Default model
                    },
                    "tts": {
                        "provider": ServiceProviders.DOGRAH.value,
                        "api_key": service_key,
                        "model": DograhTTSModel.DEFAULT.value,  # Default model
-                        "voice": DograhVoice.DEFAULT.value,  # Default voice
+                        "voice": "default",  # Default voice
                    },
                    "stt": {
                        "provider": ServiceProviders.DOGRAH.value,
--- a/api/services/configuration/check_validity.py
+++ b/api/services/configuration/check_validity.py
@ -38,6 +38,7 @@ class UserConfigurationValidator:
            ServiceProviders.AZURE.value: self._check_azure_api_key,
            ServiceProviders.CARTESIA.value: self._check_cartesia_api_key,
            ServiceProviders.DOGRAH.value: self._check_dograh_api_key,
+            ServiceProviders.SARVAM.value: self._check_sarvam_api_key,
        }

    async def validate(self, configuration: UserConfiguration) -> APIKeyStatusResponse:
@ -134,20 +135,5 @@ class UserConfigurationValidator:
    def _check_dograh_api_key(self, model: str, api_key: str) -> bool:
        return True

-    # def _check_neuphonic_api_key(self, model: str, api_key: str) -> bool:
-    #     if not Neuphonic:
-    #         self._provider_api_key_validity_status[model] = False
-    #         return self._provider_api_key_validity_status[model]
-
-    #     if model in self._provider_api_key_validity_status:
-    #         return self._provider_api_key_validity_status[model]
-
-    #     client = Neuphonic(api_key=api_key)
-    #     try:
-    #         response = client.voices.list()  # get's all available voices
-    #         voices = response.data["voices"]
-    #         self._provider_api_key_validity_status[model] = True
-    #     except Exception:
-    #         self._provider_api_key_validity_status[model] = False
-
-    #     return self._provider_api_key_validity_status[model]
+    def _check_sarvam_api_key(self, model: str, api_key: str) -> bool:
+        return True
--- a/api/services/configuration/registry.py
+++ b/api/services/configuration/registry.py
@ -20,6 +20,7 @@ class ServiceProviders(str, Enum):
    GOOGLE = "google"
    AZURE = "azure"
    DOGRAH = "dograh"
+    SARVAM = "sarvam"


 class BaseServiceConfiguration(BaseModel):
@ -31,6 +32,7 @@ class BaseServiceConfiguration(BaseModel):
        ServiceProviders.GOOGLE,
        ServiceProviders.AZURE,
        ServiceProviders.DOGRAH,
+        # ServiceProviders.SARVAM,
    ]
    api_key: str

@ -92,82 +94,56 @@ def register_stt(cls: Type[BaseSTTConfiguration]):

 ###################################################### LLM ########################################################################

-
-class OpenAIModel(str, Enum):
-    GPT3_5_TURBO = "gpt-3.5-turbo"
-    GPT4_1 = "gpt-4.1"
-    GPT4_1_MINI = "gpt-4.1-mini"
-    GPT4_1_NANO = "gpt-4.1-nano"
-    GPT5 = "gpt-5"
-    GPT5_MINI = "gpt-5-mini"
-    GPT5_NANO = "gpt-5-nano"
+# Suggested models for each provider (used for UI dropdown)
+OPENAI_MODELS = ["gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "gpt-5", "gpt-5-mini", "gpt-5-nano", "gpt-3.5-turbo"]
+GOOGLE_MODELS = ["gemini-2.0-flash", "gemini-2.0-flash-lite", "gemini-2.5-flash", "gemini-2.5-flash-lite"]
+GROQ_MODELS = [
+    "llama-3.3-70b-versatile",
+    "deepseek-r1-distill-llama-70b",
+    "qwen-qwq-32b",
+    "meta-llama/llama-4-scout-17b-16e-instruct",
+    "meta-llama/llama-4-maverick-17b-128e-instruct",
+    "gemma2-9b-it",
+    "llama-3.1-8b-instant",
+    "openai/gpt-oss-120b",
+]
+AZURE_MODELS = ["gpt-4.1-mini"]
+DOGRAH_LLM_MODELS = ["default", "accurate", "fast", "lite", "zen", "zen_lite"]


@register_llm
 class OpenAILLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
-    model: OpenAIModel = OpenAIModel.GPT4_1
+    model: str = Field(default="gpt-4.1", json_schema_extra={"examples": OPENAI_MODELS})
    api_key: str


-class GoogleModel(str, Enum):
-    GEMINI_2_0_FLASH = "gemini-2.0-flash"
-    GEMINI_2_0_FLASH_LITE = "gemini-2.0-flash-lite"
-    GEMINI_2_5_FLASH = "gemini-2.5-flash"
-    GEMINI_2_5_FLASH_LITE = "gemini-2.5-flash-lite"
-
-
@register_llm
 class GoogleLLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE
-    model: GoogleModel = GoogleModel.GEMINI_2_0_FLASH
+    model: str = Field(default="gemini-2.0-flash", json_schema_extra={"examples": GOOGLE_MODELS})
    api_key: str


-class GroqModel(str, Enum):
-    LLAMA_3_3_70B = "llama-3.3-70b-versatile"
-    DEEPSEEK_R1_DISTILL_LLAMA_70B = "deepseek-r1-distill-llama-70b"
-    QUEN_QWQ_32B = "qwen-qwq-32b"
-    LLAMA_4_SCOUT_17B_16E_INSTRUCT = "meta-llama/llama-4-scout-17b-16e-instruct"
-    LLAMA_4_MAVERICK_17B_128E_INSTRUCT = "meta-llama/llama-4-maverick-17b-128e-instruct"
-    GEMMA2_9B_IT = "gemma2-9b-it"
-    LLAMA_3_1_8B_INSTANT = "llama-3.1-8b-instant"
-    OPENAI_GPT_OSS_120B = "openai/gpt-oss-120b"
-
-
@register_llm
 class GroqLLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.GROQ] = ServiceProviders.GROQ
-    model: GroqModel = GroqModel.LLAMA_3_3_70B
+    model: str = Field(default="llama-3.3-70b-versatile", json_schema_extra={"examples": GROQ_MODELS})
    api_key: str


-class AzureModel(str, Enum):
-    GPT4_1_MINI = "gpt-4.1-mini"
-
-
@register_llm
 class AzureLLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.AZURE] = ServiceProviders.AZURE
-    model: AzureModel = AzureModel.GPT4_1_MINI
+    model: str = Field(default="gpt-4.1-mini", json_schema_extra={"examples": AZURE_MODELS})
    api_key: str
    endpoint: str


-# Dograh LLM Service
-class DograhLLMModel(str, Enum):
-    DEFAULT = "default"
-    ACCURATE = "accurate"
-    FAST = "fast"
-    LITE = "lite"
-    ZEN = "zen"
-    ZEN_LITE = "zen_lite"
-
-
@register_llm
 class DograhLLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
-    model: DograhLLMModel = DograhLLMModel.DEFAULT
+    model: str = Field(default="default", json_schema_extra={"examples": DOGRAH_LLM_MODELS})
    api_key: str


@ -185,15 +161,10 @@ LLMConfig = Annotated[
 ###################################################### TTS ########################################################################


-class DeepgramVoice(str, Enum):
-    HELENA = "aura-2-helena-en"
-    THALIA = "aura-2-thalia-en"
-
-
@register_tts
 class DeepgramTTSConfiguration(BaseServiceConfiguration):
    provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
-    voice: DeepgramVoice = DeepgramVoice.HELENA
+    voice: str = "aura-2-helena-en"
    api_key: str

    @computed_field
@ -210,42 +181,6 @@ class DeepgramTTSConfiguration(BaseServiceConfiguration):
            return "aura-2"


-class ElevenlabsVoice(str, Enum):
-    ALEXANDRA = "Alexandra - 3dzJXoCYueSQiptQ6euE"
-    AMY = "Amy - oGn4Ha2pe2vSJkmIJgLQ"
-    ANGELA = "Angela - FUfBrNit0NNZAwb58KWH"
-    ARIA = "Aria - 9BWtsMINqrJLrRacOk9x"
-    CHELSEA = "Chelsea - NHRgOEwqx5WZNClv5sat"
-    CHRISTINA = "Christina - X03mvPuTfprif8QBAVeJ"
-    CLARA = "Clara - ZIlrSGI4jZqobxRKprJz"
-    CLYDE = "Clyde - 2EiwWnXFnvU5JabPnv8n"
-    DAVE = "Dave - CYw3kZ02Hs0563khs1Fj"
-    DOMI = "Domi - AZnzlk1XvdvUeBnXmlld"
-    DREW = "Drew - 29vD33N1CtxCmqQRPOHJ"
-    ELENA = "Elena_German - iFJwt4O7E3aafIpJFfcu"
-    EVE = "Eve - BZgkqPqms7Kj9ulSkVzn"
-    FIN = "Fin - D38z5RcWu1voky8WS1ja"
-    HOPE_BESTIE = "Hope_Bestie - uYXf8XasLslADfZ2MB4u"
-    HOPE_NATURAL = "Hope_Natural - OYTbf65OHHFELVut7v2H"
-    JARNATHAN = "Jarnathan - c6SfcYrb2t09NHXiT80T"
-    JENNA = "Jenna - C2BkQxlGNzBn7WD2bqfR"
-    JESSICA = "Jessica - cgSgspJ2msm6clMCkdW9"
-    JOHANNA = "Johanna_German - YYDsZT3K2y6tv7X1aj6N"
-    JUNIPER = "Juniper - aMSt68OGf4xUZAnLpTU8"
-    LAUREN = "Lauren - 3liN8q8YoeB9Hk6AboKe"
-    LINA = "Lina - oWjuL7HSoaEJRMDMP3HD"
-    MONIKA = "Monika_Hindi_8 - 2bNrEsM0omyhLiEyOwqY"
-    NEHA = "Neha_Hindi - QTKSa2Iyv0yoxvXY2V8a"
-    OLIVIA = "Olivia - 1rviaVF7GGGkTU36HNpz"
-    PAUL = "Paul - 5Q0t7uMcjvnagumLfvZi"
-    RACHEL = "Rachel - 21m00Tcm4TlvDq8ikWAM"
-    ROGER = "Roger - CwhRBWXzGAHq8TQ4Fs17"
-    SAMI_REAL = "Sami_Real - O4cGUVdAocn0z4EpQ9yF"
-    SARAH = "Sarah - EXAVITQu4vr4xnSDxMaL"
-    SIA = "Sia_Hindi_10 - ryIIztHPLYSJ74ueXxnO"
-    ZARA = "Zara - MmQVkVZnQ0dUbfWzcW6f"
-
-
 class ElevenlabsModel(str, Enum):
    FLASH_2 = "eleven_flash_v2_5"

@ -253,16 +188,12 @@ class ElevenlabsModel(str, Enum):
@register_tts
 class ElevenlabsTTSConfiguration(BaseServiceConfiguration):
    provider: Literal[ServiceProviders.ELEVENLABS] = ServiceProviders.ELEVENLABS
-    voice: ElevenlabsVoice = ElevenlabsVoice.RACHEL
+    voice: str = "21m00Tcm4TlvDq8ikWAM"  # Rachel voice ID
    speed: float = Field(default=1.0, ge=0.1, le=2.0, description="Speed of the voice")
    model: ElevenlabsModel = ElevenlabsModel.FLASH_2
    api_key: str


-class OpenAIVoice(str, Enum):
-    ALLY = "alloy"
-
-
 class OpenAITTSModel(str, Enum):
    GPT_4o_MINI = "gpt-4o-mini-tts"

@ -271,29 +202,10 @@ class OpenAITTSModel(str, Enum):
 class OpenAITTSService(BaseTTSConfiguration):
    provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
    model: OpenAITTSModel = OpenAITTSModel.GPT_4o_MINI
-    voice: OpenAIVoice = OpenAIVoice.ALLY
+    voice: str = "alloy"
    api_key: str


-# class NeuphonicVoice(str, Enum):
-#     EMILY = "Emily - fc854436-2dac-4d21-aa69-ae17b54e98eb"
-
-
-# @register_tts
-# class NeuphonicTTSService(BaseTTSConfiguration):
-#     provider: Literal[ServiceProviders.NEUPHONIC] = ServiceProviders.NEUPHONIC
-#     voice: NeuphonicVoice = NeuphonicVoice.EMILY
-#     model: str = "NA"
-#     api_key: str
-
-
-# Dograh TTS Service
-class DograhVoice(str, Enum):
-    DEFAULT = "default"
-    JOEY = "joey"
-    RACHEL = "rachel"
-
-
 class DograhTTSModel(str, Enum):
    DEFAULT = "default"

@ -302,16 +214,58 @@ class DograhTTSModel(str, Enum):
 class DograhTTSService(BaseTTSConfiguration):
    provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
    model: DograhTTSModel = DograhTTSModel.DEFAULT
-    voice: DograhVoice = DograhVoice.DEFAULT
+    voice: str = "default"
    api_key: str


+class SarvamTTSModel(str, Enum):
+    BULBUL_V2 = "bulbul:v2"
+    BULBUL_V3 = "bulbul:v3"
+
+
+class SarvamVoice(str, Enum):
+    # Female voices
+    ANUSHKA = "anushka"
+    MANISHA = "manisha"
+    VIDYA = "vidya"
+    ARYA = "arya"
+    # Male voices
+    ABHILASH = "abhilash"
+    KARUN = "karun"
+    HITESH = "hitesh"
+
+
+class SarvamLanguage(str, Enum):
+    BENGALI = "bn-IN"
+    ENGLISH_INDIA = "en-IN"
+    GUJARATI = "gu-IN"
+    HINDI = "hi-IN"
+    KANNADA = "kn-IN"
+    MALAYALAM = "ml-IN"
+    MARATHI = "mr-IN"
+    ODIA = "od-IN"
+    PUNJABI = "pa-IN"
+    TAMIL = "ta-IN"
+    TELUGU = "te-IN"
+    ASSAMESE = "as-IN"
+
+
+# @register_tts
+# class SarvamTTSConfiguration(BaseTTSConfiguration):
+#     provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
+#     model: SarvamTTSModel = SarvamTTSModel.BULBUL_V2
+#     voice: SarvamVoice = SarvamVoice.ANUSHKA
+#     language: SarvamLanguage = SarvamLanguage.HINDI
+#     api_key: str
+
+
 TTSConfig = Annotated[
    Union[
        DeepgramTTSConfiguration,
        OpenAITTSService,
        ElevenlabsTTSConfiguration,
        DograhTTSService,
+        # SarvamTTSConfiguration,
    ],
    Field(discriminator="provider"),
 ]
@ -323,10 +277,45 @@ class DeepgramSTTModel(str, Enum):
    NOVA_3_GENERAL = "nova-3-general"


+class DeepgramLanguage(str, Enum):
+    MULTI = "multi"
+    ENGLISH = "en"
+    ENGLISH_US = "en-US"
+    ENGLISH_GB = "en-GB"
+    ENGLISH_AU = "en-AU"
+    ENGLISH_IN = "en-IN"
+    SPANISH = "es"
+    SPANISH_LATAM = "es-419"
+    FRENCH = "fr"
+    FRENCH_CA = "fr-CA"
+    GERMAN = "de"
+    ITALIAN = "it"
+    PORTUGUESE = "pt"
+    PORTUGUESE_BR = "pt-BR"
+    DUTCH = "nl"
+    HINDI = "hi"
+    JAPANESE = "ja"
+    KOREAN = "ko"
+    CHINESE_SIMPLIFIED = "zh-CN"
+    CHINESE_TRADITIONAL = "zh-TW"
+    RUSSIAN = "ru"
+    POLISH = "pl"
+    TURKISH = "tr"
+    UKRAINIAN = "uk"
+    VIETNAMESE = "vi"
+    SWEDISH = "sv"
+    DANISH = "da"
+    NORWEGIAN = "no"
+    FINNISH = "fi"
+    INDONESIAN = "id"
+    THAI = "th"
+
+
@register_stt
 class DeepgramSTTConfiguration(BaseSTTConfiguration):
    provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
    model: DeepgramSTTModel = DeepgramSTTModel.NOVA_3_GENERAL
+    language: DeepgramLanguage = DeepgramLanguage.MULTI
    api_key: str


@ -359,8 +348,27 @@ class DograhSTTService(BaseSTTConfiguration):
    api_key: str


+# Sarvam STT Service
+class SarvamSTTModel(str, Enum):
+    SAARIKA_V2_5 = "saarika:v2.5"
+    SAARAS_V2 = "saaras:v2"  # STT-Translate model (auto-detects language)
+
+
+# @register_stt
+# class SarvamSTTConfiguration(BaseSTTConfiguration):
+#     provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
+#     model: SarvamSTTModel = SarvamSTTModel.SAARIKA_V2_5
+#     language: SarvamLanguage = SarvamLanguage.HINDI
+#     api_key: str
+
+
 STTConfig = Annotated[
-    Union[DeepgramSTTConfiguration, OpenAISTTConfiguration, DograhSTTService],
+    Union[
+        DeepgramSTTConfiguration,
+        OpenAISTTConfiguration,
+        DograhSTTService,
+        # SarvamSTTConfiguration,
+    ],
    Field(discriminator="provider"),
 ]

--- a/api/services/mps_service_key_client.py
+++ b/api/services/mps_service_key_client.py
@ -285,6 +285,44 @@ class MPSServiceKeyClient:
                    response=response,
                )

+    async def get_voices(
+        self,
+        provider: str,
+        organization_id: Optional[int] = None,
+        created_by: Optional[str] = None,
+    ) -> dict:
+        """
+        Get available voices for a TTS provider from MPS.
+
+        Args:
+            provider: TTS provider name (elevenlabs, deepgram, sarvam, cartesia, dograh)
+            organization_id: Organization ID (for authenticated mode)
+            created_by: User provider ID (for OSS mode)
+
+        Returns:
+            Dictionary containing provider name and list of voices
+
+        Raises:
+            httpx.HTTPStatusError: If MPS responds with a non-200 status code
+        """
+        async with httpx.AsyncClient(timeout=self.timeout) as client:
+            response = await client.get(
+                f"{self.base_url}/api/v1/voice-proxy/{provider}/voices",
+                headers=self._get_headers(organization_id, created_by),
+            )
+
+            if response.status_code == 200:
+                return response.json()
+            else:
+                logger.error(
+                    f"Failed to get voices for {provider}: {response.status_code} - {response.text}"
+                )
+                raise httpx.HTTPStatusError(
+                    f"Failed to get voices: {response.text}",
+                    request=response.request,
+                    response=response,
+                )
+
    async def call_workflow_api(
        self,
        call_type: str,
--- a/api/services/pipecat/service_factory.py
+++ b/api/services/pipecat/service_factory.py
@ -17,6 +17,9 @@ from pipecat.services.groq.llm import GroqLLMService
 from pipecat.services.openai.llm import OpenAILLMService
 from pipecat.services.openai.stt import OpenAISTTService
 from pipecat.services.openai.tts import OpenAITTSService
+from pipecat.services.sarvam.stt import SarvamSTTService
+from pipecat.services.sarvam.tts import SarvamTTSService
+from pipecat.transcriptions.language import Language
 from pipecat.utils.text.xml_function_tag_filter import XMLFunctionTagFilter

 if TYPE_CHECKING:
@ -26,8 +29,13 @@ if TYPE_CHECKING:
 def create_stt_service(user_config):
    """Create and return appropriate STT service based on user configuration"""
    if user_config.stt.provider == ServiceProviders.DEEPGRAM.value:
+        # Use language from user config, defaulting to "multi" for multilingual support
+        language = getattr(user_config.stt, "language", None)
+        language_value = (
+            language.value if hasattr(language, "value") else (language or "multi")
+        )
        live_options = LiveOptions(
-            language="multi", profanity_filter=False, endpointing=100
+            language=language_value, profanity_filter=False, endpointing=100
        )
        return DeepgramSTTService(
            live_options=live_options,
@ -53,6 +61,32 @@ def create_stt_service(user_config):
            model=user_config.stt.model.value,
            audio_passthrough=False,  # Disable passthrough since audio is buffered separately
        )
+    elif user_config.stt.provider == ServiceProviders.SARVAM.value:
+        # Map Sarvam language code to pipecat Language enum
+        language_mapping = {
+            "bn-IN": Language.BN_IN,
+            "gu-IN": Language.GU_IN,
+            "hi-IN": Language.HI_IN,
+            "kn-IN": Language.KN_IN,
+            "ml-IN": Language.ML_IN,
+            "mr-IN": Language.MR_IN,
+            "ta-IN": Language.TA_IN,
+            "te-IN": Language.TE_IN,
+            "pa-IN": Language.PA_IN,
+            "od-IN": Language.OR_IN,
+            "en-IN": Language.EN_IN,
+            "as-IN": Language.AS_IN,
+        }
+        language = getattr(user_config.stt, "language", None)
+        language_value = language.value if hasattr(language, "value") else language
+        pipecat_language = language_mapping.get(language_value, Language.HI_IN)
+
+        return SarvamSTTService(
+            api_key=user_config.stt.api_key,
+            model=user_config.stt.model.value,
+            params=SarvamSTTService.InputParams(language=pipecat_language),
+            audio_passthrough=False,
+        )
    else:
        raise HTTPException(
            status_code=400, detail=f"Invalid STT provider {user_config.stt.provider}"
@ -81,7 +115,12 @@ def create_tts_service(user_config, audio_config: "AudioConfig"):
            text_filters=[xml_function_tag_filter],
        )
    elif user_config.tts.provider == ServiceProviders.ELEVENLABS.value:
-        voice_id = user_config.tts.voice.split(" - ")[1]
+        # Backward compatible with older configuration "Name - voice_id"
+        try:
+            voice_id = user_config.tts.voice.split(" - ")[1]
+        except IndexError:
+            voice_id = user_config.tts.voice
+
        return ElevenLabsTTSService(
            reconnect_on_error=False,
            api_key=user_config.tts.api_key,
@ -103,6 +142,35 @@ def create_tts_service(user_config, audio_config: "AudioConfig"):
            voice=user_config.tts.voice.value,
            text_filters=[xml_function_tag_filter],
        )
+    elif user_config.tts.provider == ServiceProviders.SARVAM.value:
+        # Map Sarvam language code to pipecat Language enum for TTS
+        language_mapping = {
+            "bn-IN": Language.BN,
+            "en-IN": Language.EN,
+            "gu-IN": Language.GU,
+            "hi-IN": Language.HI,
+            "kn-IN": Language.KN,
+            "ml-IN": Language.ML,
+            "mr-IN": Language.MR,
+            "od-IN": Language.OR,
+            "pa-IN": Language.PA,
+            "ta-IN": Language.TA,
+            "te-IN": Language.TE,
+        }
+        language = getattr(user_config.tts, "language", None)
+        language_value = language.value if hasattr(language, "value") else language
+        pipecat_language = language_mapping.get(language_value, Language.HI)
+
+        voice = getattr(user_config.tts, "voice", None)
+        voice_value = voice.value if hasattr(voice, "value") else (voice or "anushka")
+
+        return SarvamTTSService(
+            api_key=user_config.tts.api_key,
+            model=user_config.tts.model.value,
+            voice_id=voice_value,
+            params=SarvamTTSService.InputParams(language=pipecat_language),
+            text_filters=[xml_function_tag_filter],
+        )
    else:
        raise HTTPException(
            status_code=400, detail=f"Invalid TTS provider {user_config.tts.provider}"
--- a/api/tests/test_configuration_masking_merge.py
+++ b/api/tests/test_configuration_masking_merge.py
@ -5,7 +5,6 @@ from api.schemas.user_configuration import UserConfiguration
 from api.services.configuration.masking import is_mask_of, mask_key, mask_user_config
 from api.services.configuration.merge import merge_user_configurations
 from api.services.configuration.registry import (
-    GroqModel,
    OpenAILLMService,
 )

@ -70,7 +69,7 @@ def test_merge_drops_old_key_when_provider_changes():
    incoming_partial = {
        "llm": {
            "provider": "groq",
-            "model": GroqModel.LLAMA_3_3_70B,
+            "model": "llama-3.3-70b-versatile",
            # api_key intentionally absent – should NOT inherit old key
        }
    }