feat: add voice selectors in elevenlabs (#88)

2026-07-22 11:51:04 +02:00 · 2025-12-25 15:05:53 +05:30 · 2025-12-25 15:05:53 +05:30 · 45c5b7c304
commit 45c5b7c304
parent 480e8a5f60
22 changed files with 978 additions and 166 deletions
--- a/api/Dockerfile
+++ b/api/Dockerfile
@ -20,7 +20,7 @@ RUN pip install --user --no-cache-dir -r requirements.txt && \

 # Copy and install pipecat from local submodule
 COPY pipecat /tmp/pipecat
-RUN pip install --user --no-cache-dir '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,soundfile,silero,webrtc,local-smart-turn-v3]' && \
+RUN pip install --user --no-cache-dir '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,local-smart-turn-v3]' && \
    # Clean up pip cache and temporary pipecat directory
    rm -rf /root/.cache/pip /tmp/pipecat

--- a/api/db/filters.py
+++ b/api/db/filters.py
@ -111,9 +111,7 @@ def apply_workflow_run_filters(
                    # (subscript [] only works in PostgreSQL 14+)
                    filter_conditions.append(
                        cast(WorkflowRunModel.gathered_context, JSONB)
-                        .op("->>")(
-                            "mapped_call_disposition"
-                        )
+                        .op("->>")("mapped_call_disposition")
                        .in_(codes)
                    )

@ -147,9 +145,7 @@ def apply_workflow_run_filters(
                    # Use ->> operator for compatibility with all PostgreSQL versions
                    filter_conditions.append(
                        cast(WorkflowRunModel.initial_context, JSONB)
-                        .op("->>")(
-                            "phone"
-                        )
+                        .op("->>")("phone")
                        .contains(phone)
                    )

@ -178,13 +174,9 @@ def apply_workflow_run_filters(
                        "total_cost_usd"
                    )
                    if min_val is not None:
-                        filter_conditions.append(
-                            cast(cost_text, Integer) >= min_val
-                        )
+                        filter_conditions.append(cast(cost_text, Integer) >= min_val)
                    if max_val is not None:
-                        filter_conditions.append(
-                            cast(cost_text, Integer) <= max_val
-                        )
+                        filter_conditions.append(cast(cost_text, Integer) <= max_val)

    if filter_conditions:
        base_query = base_query.where(and_(*filter_conditions))
--- a/api/routes/user.py
+++ b/api/routes/user.py
@ -1,7 +1,8 @@
 from datetime import datetime, timedelta
-from typing import List, Optional, TypedDict, Union
+from typing import List, Literal, Optional, TypedDict, Union

 from fastapi import APIRouter, Depends, HTTPException, Query
+from loguru import logger
 from pydantic import BaseModel

 from api.db import db_client
@ -17,6 +18,7 @@ from api.services.configuration.defaults import DEFAULT_SERVICE_PROVIDERS
 from api.services.configuration.masking import mask_user_config
 from api.services.configuration.merge import merge_user_configurations
 from api.services.configuration.registry import REGISTRY, ServiceType
+from api.services.mps_service_key_client import mps_service_key_client

 router = APIRouter(prefix="/user")

@ -274,3 +276,46 @@ async def reactivate_api_key(
        raise HTTPException(status_code=500, detail="Failed to reactivate API key")

    return {"success": True, "message": "API key reactivated successfully"}
+
+
+# Voice Configuration Endpoints
+TTSProvider = Literal["elevenlabs", "deepgram", "sarvam", "cartesia", "dograh"]
+
+
+class VoiceInfo(BaseModel):  # one voice entry; only voice_id and name are required
+    voice_id: str
+    name: str
+    description: Optional[str] = None
+    accent: Optional[str] = None
+    gender: Optional[str] = None
+    language: Optional[str] = None
+    preview_url: Optional[str] = None
+
+
+class VoicesResponse(BaseModel):  # response body returned by get_voices
+    provider: str
+    voices: List[VoiceInfo]
+
+
+@router.get("/configurations/voices/{provider}")
+async def get_voices(
+    provider: TTSProvider,
+    user: UserModel = Depends(get_user),
+) -> VoicesResponse:
+    """List a TTS provider's voices via MPS; any failure is logged and returned as HTTP 500."""
+    try:
+        result = await mps_service_key_client.get_voices(
+            provider=provider,
+            organization_id=user.selected_organization_id,
+            created_by=user.provider_id,
+        )
+        return VoicesResponse(
+            provider=result.get("provider", provider),
+            voices=[VoiceInfo(**voice) for voice in result.get("voices", [])],
+        )
+    except Exception as e:  # also catches VoiceInfo validation errors, not only MPS failures
+        logger.error(f"Failed to fetch voices for {provider}: {e}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to fetch voices for {provider}",
+        )
--- a/api/services/auth/depends.py
+++ b/api/services/auth/depends.py
@ -11,10 +11,8 @@ from api.db.models import UserModel
 from api.schemas.user_configuration import UserConfiguration
 from api.services.auth.stack_auth import stackauth
 from api.services.configuration.registry import (
-    DograhLLMModel,
    DograhSTTModel,
    DograhTTSModel,
-    DograhVoice,
    ServiceProviders,
 )

@ -244,13 +242,13 @@ async def create_user_configuration_with_mps_key(
                    "llm": {
                        "provider": ServiceProviders.DOGRAH.value,
                        "api_key": service_key,
-                        "model": DograhLLMModel.DEFAULT.value,  # Default model
+                        "model": "default",  # Default model
                    },
                    "tts": {
                        "provider": ServiceProviders.DOGRAH.value,
                        "api_key": service_key,
                        "model": DograhTTSModel.DEFAULT.value,  # Default model
-                        "voice": DograhVoice.DEFAULT.value,  # Default voice
+                        "voice": "default",  # Default voice
                    },
                    "stt": {
                        "provider": ServiceProviders.DOGRAH.value,
--- a/api/services/configuration/check_validity.py
+++ b/api/services/configuration/check_validity.py
@ -38,6 +38,7 @@ class UserConfigurationValidator:
            ServiceProviders.AZURE.value: self._check_azure_api_key,
            ServiceProviders.CARTESIA.value: self._check_cartesia_api_key,
            ServiceProviders.DOGRAH.value: self._check_dograh_api_key,
+            ServiceProviders.SARVAM.value: self._check_sarvam_api_key,
        }

    async def validate(self, configuration: UserConfiguration) -> APIKeyStatusResponse:
@ -134,20 +135,5 @@ class UserConfigurationValidator:
    def _check_dograh_api_key(self, model: str, api_key: str) -> bool:
        return True

-    # def _check_neuphonic_api_key(self, model: str, api_key: str) -> bool:
-    #     if not Neuphonic:
-    #         self._provider_api_key_validity_status[model] = False
-    #         return self._provider_api_key_validity_status[model]
-
-    #     if model in self._provider_api_key_validity_status:
-    #         return self._provider_api_key_validity_status[model]
-
-    #     client = Neuphonic(api_key=api_key)
-    #     try:
-    #         response = client.voices.list()  # get's all available voices
-    #         voices = response.data["voices"]
-    #         self._provider_api_key_validity_status[model] = True
-    #     except Exception:
-    #         self._provider_api_key_validity_status[model] = False
-
-    #     return self._provider_api_key_validity_status[model]
+    def _check_sarvam_api_key(self, model: str, api_key: str) -> bool:
+        return True  # no validation is performed: every Sarvam key is reported as valid
--- a/api/services/configuration/registry.py
+++ b/api/services/configuration/registry.py
@ -20,6 +20,7 @@ class ServiceProviders(str, Enum):
    GOOGLE = "google"
    AZURE = "azure"
    DOGRAH = "dograh"
+    SARVAM = "sarvam"


 class BaseServiceConfiguration(BaseModel):
@ -31,6 +32,7 @@ class BaseServiceConfiguration(BaseModel):
        ServiceProviders.GOOGLE,
        ServiceProviders.AZURE,
        ServiceProviders.DOGRAH,
+        # ServiceProviders.SARVAM,
    ]
    api_key: str

@ -92,82 +94,56 @@ def register_stt(cls: Type[BaseSTTConfiguration]):

 ###################################################### LLM ########################################################################

-
-class OpenAIModel(str, Enum):
-    GPT3_5_TURBO = "gpt-3.5-turbo"
-    GPT4_1 = "gpt-4.1"
-    GPT4_1_MINI = "gpt-4.1-mini"
-    GPT4_1_NANO = "gpt-4.1-nano"
-    GPT5 = "gpt-5"
-    GPT5_MINI = "gpt-5-mini"
-    GPT5_NANO = "gpt-5-nano"
+# Suggested models for each provider (used for UI dropdown)
+OPENAI_MODELS = ["gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "gpt-5", "gpt-5-mini", "gpt-5-nano", "gpt-3.5-turbo"]
+GOOGLE_MODELS = ["gemini-2.0-flash", "gemini-2.0-flash-lite", "gemini-2.5-flash", "gemini-2.5-flash-lite"]
+GROQ_MODELS = [
+    "llama-3.3-70b-versatile",
+    "deepseek-r1-distill-llama-70b",
+    "qwen-qwq-32b",
+    "meta-llama/llama-4-scout-17b-16e-instruct",
+    "meta-llama/llama-4-maverick-17b-128e-instruct",
+    "gemma2-9b-it",
+    "llama-3.1-8b-instant",
+    "openai/gpt-oss-120b",
+]
+AZURE_MODELS = ["gpt-4.1-mini"]
+DOGRAH_LLM_MODELS = ["default", "accurate", "fast", "lite", "zen", "zen_lite"]


@register_llm
 class OpenAILLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
-    model: OpenAIModel = OpenAIModel.GPT4_1
+    model: str = Field(default="gpt-4.1", json_schema_extra={"examples": OPENAI_MODELS})
    api_key: str


-class GoogleModel(str, Enum):
-    GEMINI_2_0_FLASH = "gemini-2.0-flash"
-    GEMINI_2_0_FLASH_LITE = "gemini-2.0-flash-lite"
-    GEMINI_2_5_FLASH = "gemini-2.5-flash"
-    GEMINI_2_5_FLASH_LITE = "gemini-2.5-flash-lite"
-
-
@register_llm
 class GoogleLLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE
-    model: GoogleModel = GoogleModel.GEMINI_2_0_FLASH
+    model: str = Field(default="gemini-2.0-flash", json_schema_extra={"examples": GOOGLE_MODELS})
    api_key: str


-class GroqModel(str, Enum):
-    LLAMA_3_3_70B = "llama-3.3-70b-versatile"
-    DEEPSEEK_R1_DISTILL_LLAMA_70B = "deepseek-r1-distill-llama-70b"
-    QUEN_QWQ_32B = "qwen-qwq-32b"
-    LLAMA_4_SCOUT_17B_16E_INSTRUCT = "meta-llama/llama-4-scout-17b-16e-instruct"
-    LLAMA_4_MAVERICK_17B_128E_INSTRUCT = "meta-llama/llama-4-maverick-17b-128e-instruct"
-    GEMMA2_9B_IT = "gemma2-9b-it"
-    LLAMA_3_1_8B_INSTANT = "llama-3.1-8b-instant"
-    OPENAI_GPT_OSS_120B = "openai/gpt-oss-120b"
-
-
@register_llm
 class GroqLLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.GROQ] = ServiceProviders.GROQ
-    model: GroqModel = GroqModel.LLAMA_3_3_70B
+    model: str = Field(default="llama-3.3-70b-versatile", json_schema_extra={"examples": GROQ_MODELS})
    api_key: str


-class AzureModel(str, Enum):
-    GPT4_1_MINI = "gpt-4.1-mini"
-
-
@register_llm
 class AzureLLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.AZURE] = ServiceProviders.AZURE
-    model: AzureModel = AzureModel.GPT4_1_MINI
+    model: str = Field(default="gpt-4.1-mini", json_schema_extra={"examples": AZURE_MODELS})
    api_key: str
    endpoint: str


-# Dograh LLM Service
-class DograhLLMModel(str, Enum):
-    DEFAULT = "default"
-    ACCURATE = "accurate"
-    FAST = "fast"
-    LITE = "lite"
-    ZEN = "zen"
-    ZEN_LITE = "zen_lite"
-
-
@register_llm
 class DograhLLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
-    model: DograhLLMModel = DograhLLMModel.DEFAULT
+    model: str = Field(default="default", json_schema_extra={"examples": DOGRAH_LLM_MODELS})
    api_key: str


@ -185,15 +161,10 @@ LLMConfig = Annotated[
 ###################################################### TTS ########################################################################


-class DeepgramVoice(str, Enum):
-    HELENA = "aura-2-helena-en"
-    THALIA = "aura-2-thalia-en"
-
-
@register_tts
 class DeepgramTTSConfiguration(BaseServiceConfiguration):
    provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
-    voice: DeepgramVoice = DeepgramVoice.HELENA
+    voice: str = "aura-2-helena-en"
    api_key: str

    @computed_field
@ -210,42 +181,6 @@ class DeepgramTTSConfiguration(BaseServiceConfiguration):
            return "aura-2"


-class ElevenlabsVoice(str, Enum):
-    ALEXANDRA = "Alexandra - 3dzJXoCYueSQiptQ6euE"
-    AMY = "Amy - oGn4Ha2pe2vSJkmIJgLQ"
-    ANGELA = "Angela - FUfBrNit0NNZAwb58KWH"
-    ARIA = "Aria - 9BWtsMINqrJLrRacOk9x"
-    CHELSEA = "Chelsea - NHRgOEwqx5WZNClv5sat"
-    CHRISTINA = "Christina - X03mvPuTfprif8QBAVeJ"
-    CLARA = "Clara - ZIlrSGI4jZqobxRKprJz"
-    CLYDE = "Clyde - 2EiwWnXFnvU5JabPnv8n"
-    DAVE = "Dave - CYw3kZ02Hs0563khs1Fj"
-    DOMI = "Domi - AZnzlk1XvdvUeBnXmlld"
-    DREW = "Drew - 29vD33N1CtxCmqQRPOHJ"
-    ELENA = "Elena_German - iFJwt4O7E3aafIpJFfcu"
-    EVE = "Eve - BZgkqPqms7Kj9ulSkVzn"
-    FIN = "Fin - D38z5RcWu1voky8WS1ja"
-    HOPE_BESTIE = "Hope_Bestie - uYXf8XasLslADfZ2MB4u"
-    HOPE_NATURAL = "Hope_Natural - OYTbf65OHHFELVut7v2H"
-    JARNATHAN = "Jarnathan - c6SfcYrb2t09NHXiT80T"
-    JENNA = "Jenna - C2BkQxlGNzBn7WD2bqfR"
-    JESSICA = "Jessica - cgSgspJ2msm6clMCkdW9"
-    JOHANNA = "Johanna_German - YYDsZT3K2y6tv7X1aj6N"
-    JUNIPER = "Juniper - aMSt68OGf4xUZAnLpTU8"
-    LAUREN = "Lauren - 3liN8q8YoeB9Hk6AboKe"
-    LINA = "Lina - oWjuL7HSoaEJRMDMP3HD"
-    MONIKA = "Monika_Hindi_8 - 2bNrEsM0omyhLiEyOwqY"
-    NEHA = "Neha_Hindi - QTKSa2Iyv0yoxvXY2V8a"
-    OLIVIA = "Olivia - 1rviaVF7GGGkTU36HNpz"
-    PAUL = "Paul - 5Q0t7uMcjvnagumLfvZi"
-    RACHEL = "Rachel - 21m00Tcm4TlvDq8ikWAM"
-    ROGER = "Roger - CwhRBWXzGAHq8TQ4Fs17"
-    SAMI_REAL = "Sami_Real - O4cGUVdAocn0z4EpQ9yF"
-    SARAH = "Sarah - EXAVITQu4vr4xnSDxMaL"
-    SIA = "Sia_Hindi_10 - ryIIztHPLYSJ74ueXxnO"
-    ZARA = "Zara - MmQVkVZnQ0dUbfWzcW6f"
-
-
 class ElevenlabsModel(str, Enum):
    FLASH_2 = "eleven_flash_v2_5"

@ -253,16 +188,12 @@ class ElevenlabsModel(str, Enum):
@register_tts
 class ElevenlabsTTSConfiguration(BaseServiceConfiguration):
    provider: Literal[ServiceProviders.ELEVENLABS] = ServiceProviders.ELEVENLABS
-    voice: ElevenlabsVoice = ElevenlabsVoice.RACHEL
+    voice: str = "21m00Tcm4TlvDq8ikWAM"  # Rachel voice ID
    speed: float = Field(default=1.0, ge=0.1, le=2.0, description="Speed of the voice")
    model: ElevenlabsModel = ElevenlabsModel.FLASH_2
    api_key: str


-class OpenAIVoice(str, Enum):
-    ALLY = "alloy"
-
-
 class OpenAITTSModel(str, Enum):
    GPT_4o_MINI = "gpt-4o-mini-tts"

@ -271,29 +202,10 @@ class OpenAITTSModel(str, Enum):
 class OpenAITTSService(BaseTTSConfiguration):
    provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
    model: OpenAITTSModel = OpenAITTSModel.GPT_4o_MINI
-    voice: OpenAIVoice = OpenAIVoice.ALLY
+    voice: str = "alloy"
    api_key: str


-# class NeuphonicVoice(str, Enum):
-#     EMILY = "Emily - fc854436-2dac-4d21-aa69-ae17b54e98eb"
-
-
-# @register_tts
-# class NeuphonicTTSService(BaseTTSConfiguration):
-#     provider: Literal[ServiceProviders.NEUPHONIC] = ServiceProviders.NEUPHONIC
-#     voice: NeuphonicVoice = NeuphonicVoice.EMILY
-#     model: str = "NA"
-#     api_key: str
-
-
-# Dograh TTS Service
-class DograhVoice(str, Enum):
-    DEFAULT = "default"
-    JOEY = "joey"
-    RACHEL = "rachel"
-
-
 class DograhTTSModel(str, Enum):
    DEFAULT = "default"

@ -302,16 +214,58 @@ class DograhTTSModel(str, Enum):
 class DograhTTSService(BaseTTSConfiguration):
    provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
    model: DograhTTSModel = DograhTTSModel.DEFAULT
-    voice: DograhVoice = DograhVoice.DEFAULT
+    voice: str = "default"
    api_key: str


+class SarvamTTSModel(str, Enum):
+    BULBUL_V2 = "bulbul:v2"
+    BULBUL_V3 = "bulbul:v3"
+
+
+class SarvamVoice(str, Enum):
+    # Female voices
+    ANUSHKA = "anushka"
+    MANISHA = "manisha"
+    VIDYA = "vidya"
+    ARYA = "arya"
+    # Male voices
+    ABHILASH = "abhilash"
+    KARUN = "karun"
+    HITESH = "hitesh"
+
+
+class SarvamLanguage(str, Enum):
+    BENGALI = "bn-IN"
+    ENGLISH_INDIA = "en-IN"
+    GUJARATI = "gu-IN"
+    HINDI = "hi-IN"
+    KANNADA = "kn-IN"
+    MALAYALAM = "ml-IN"
+    MARATHI = "mr-IN"
+    ODIA = "od-IN"
+    PUNJABI = "pa-IN"
+    TAMIL = "ta-IN"
+    TELUGU = "te-IN"
+    ASSAMESE = "as-IN"
+
+
+# @register_tts
+# class SarvamTTSConfiguration(BaseTTSConfiguration):
+#     provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
+#     model: SarvamTTSModel = SarvamTTSModel.BULBUL_V2
+#     voice: SarvamVoice = SarvamVoice.ANUSHKA
+#     language: SarvamLanguage = SarvamLanguage.HINDI
+#     api_key: str
+
+
 TTSConfig = Annotated[
    Union[
        DeepgramTTSConfiguration,
        OpenAITTSService,
        ElevenlabsTTSConfiguration,
        DograhTTSService,
+        # SarvamTTSConfiguration,
    ],
    Field(discriminator="provider"),
 ]
@ -323,10 +277,45 @@ class DeepgramSTTModel(str, Enum):
    NOVA_3_GENERAL = "nova-3-general"


+class DeepgramLanguage(str, Enum):
+    MULTI = "multi"
+    ENGLISH = "en"
+    ENGLISH_US = "en-US"
+    ENGLISH_GB = "en-GB"
+    ENGLISH_AU = "en-AU"
+    ENGLISH_IN = "en-IN"
+    SPANISH = "es"
+    SPANISH_LATAM = "es-419"
+    FRENCH = "fr"
+    FRENCH_CA = "fr-CA"
+    GERMAN = "de"
+    ITALIAN = "it"
+    PORTUGUESE = "pt"
+    PORTUGUESE_BR = "pt-BR"
+    DUTCH = "nl"
+    HINDI = "hi"
+    JAPANESE = "ja"
+    KOREAN = "ko"
+    CHINESE_SIMPLIFIED = "zh-CN"
+    CHINESE_TRADITIONAL = "zh-TW"
+    RUSSIAN = "ru"
+    POLISH = "pl"
+    TURKISH = "tr"
+    UKRAINIAN = "uk"
+    VIETNAMESE = "vi"
+    SWEDISH = "sv"
+    DANISH = "da"
+    NORWEGIAN = "no"
+    FINNISH = "fi"
+    INDONESIAN = "id"
+    THAI = "th"
+
+
@register_stt
 class DeepgramSTTConfiguration(BaseSTTConfiguration):
    provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
    model: DeepgramSTTModel = DeepgramSTTModel.NOVA_3_GENERAL
+    language: DeepgramLanguage = DeepgramLanguage.MULTI
    api_key: str


@ -359,8 +348,27 @@ class DograhSTTService(BaseSTTConfiguration):
    api_key: str


+# Sarvam STT Service
+class SarvamSTTModel(str, Enum):
+    SAARIKA_V2_5 = "saarika:v2.5"
+    SAARAS_V2 = "saaras:v2"  # STT-Translate model (auto-detects language)
+
+
+# @register_stt
+# class SarvamSTTConfiguration(BaseSTTConfiguration):
+#     provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
+#     model: SarvamSTTModel = SarvamSTTModel.SAARIKA_V2_5
+#     language: SarvamLanguage = SarvamLanguage.HINDI
+#     api_key: str
+
+
 STTConfig = Annotated[
-    Union[DeepgramSTTConfiguration, OpenAISTTConfiguration, DograhSTTService],
+    Union[
+        DeepgramSTTConfiguration,
+        OpenAISTTConfiguration,
+        DograhSTTService,
+        # SarvamSTTConfiguration,
+    ],
    Field(discriminator="provider"),
 ]

--- a/api/services/mps_service_key_client.py
+++ b/api/services/mps_service_key_client.py
@ -285,6 +285,44 @@ class MPSServiceKeyClient:
                    response=response,
                )

+    async def get_voices(
+        self,
+        provider: str,
+        organization_id: Optional[int] = None,
+        created_by: Optional[str] = None,
+    ) -> dict:
+        """
+        Get available voices for a TTS provider from MPS.
+
+        Args:
+            provider: TTS provider name, placed in the request URL path (e.g. elevenlabs, deepgram, sarvam, cartesia)
+            organization_id: Organization ID (for authenticated mode)
+            created_by: User provider ID (for OSS mode)
+
+        Returns:
+            Parsed JSON body of the 200 response (expected to carry the provider name and a list of voices)
+
+        Raises:
+            httpx.HTTPStatusError: If MPS responds with any status code other than 200
+        """
+        async with httpx.AsyncClient(timeout=self.timeout) as client:
+            response = await client.get(
+                f"{self.base_url}/api/v1/voice-proxy/{provider}/voices",
+                headers=self._get_headers(organization_id, created_by),
+            )
+
+            if response.status_code == 200:
+                return response.json()
+            else:
+                logger.error(
+                    f"Failed to get voices for {provider}: {response.status_code} - {response.text}"
+                )
+                raise httpx.HTTPStatusError(
+                    f"Failed to get voices: {response.text}",
+                    request=response.request,
+                    response=response,
+                )
+
    async def call_workflow_api(
        self,
        call_type: str,
--- a/api/services/pipecat/service_factory.py
+++ b/api/services/pipecat/service_factory.py
@ -17,6 +17,9 @@ from pipecat.services.groq.llm import GroqLLMService
 from pipecat.services.openai.llm import OpenAILLMService
 from pipecat.services.openai.stt import OpenAISTTService
 from pipecat.services.openai.tts import OpenAITTSService
+from pipecat.services.sarvam.stt import SarvamSTTService
+from pipecat.services.sarvam.tts import SarvamTTSService
+from pipecat.transcriptions.language import Language
 from pipecat.utils.text.xml_function_tag_filter import XMLFunctionTagFilter

 if TYPE_CHECKING:
@ -26,8 +29,13 @@ if TYPE_CHECKING:
 def create_stt_service(user_config):
    """Create and return appropriate STT service based on user configuration"""
    if user_config.stt.provider == ServiceProviders.DEEPGRAM.value:
+        # Use language from user config, defaulting to "multi" for multilingual support
+        language = getattr(user_config.stt, "language", None)
+        language_value = (
+            language.value if hasattr(language, "value") else (language or "multi")
+        )
        live_options = LiveOptions(
-            language="multi", profanity_filter=False, endpointing=100
+            language=language_value, profanity_filter=False, endpointing=100
        )
        return DeepgramSTTService(
            live_options=live_options,
@ -53,6 +61,32 @@ def create_stt_service(user_config):
            model=user_config.stt.model.value,
            audio_passthrough=False,  # Disable passthrough since audio is buffered separately
        )
+    elif user_config.stt.provider == ServiceProviders.SARVAM.value:
+        # Map Sarvam language code to pipecat Language enum
+        language_mapping = {
+            "bn-IN": Language.BN_IN,
+            "gu-IN": Language.GU_IN,
+            "hi-IN": Language.HI_IN,
+            "kn-IN": Language.KN_IN,
+            "ml-IN": Language.ML_IN,
+            "mr-IN": Language.MR_IN,
+            "ta-IN": Language.TA_IN,
+            "te-IN": Language.TE_IN,
+            "pa-IN": Language.PA_IN,
+            "od-IN": Language.OR_IN,
+            "en-IN": Language.EN_IN,
+            "as-IN": Language.AS_IN,
+        }
+        language = getattr(user_config.stt, "language", None)
+        language_value = language.value if hasattr(language, "value") else language
+        pipecat_language = language_mapping.get(language_value, Language.HI_IN)
+
+        return SarvamSTTService(
+            api_key=user_config.stt.api_key,
+            model=user_config.stt.model.value,
+            params=SarvamSTTService.InputParams(language=pipecat_language),
+            audio_passthrough=False,
+        )
    else:
        raise HTTPException(
            status_code=400, detail=f"Invalid STT provider {user_config.stt.provider}"
@ -81,7 +115,12 @@ def create_tts_service(user_config, audio_config: "AudioConfig"):
            text_filters=[xml_function_tag_filter],
        )
    elif user_config.tts.provider == ServiceProviders.ELEVENLABS.value:
-        voice_id = user_config.tts.voice.split(" - ")[1]
+        # Backward compatible with older configuration "Name - voice_id"
+        try:
+            voice_id = user_config.tts.voice.split(" - ")[1]
+        except IndexError:
+            voice_id = user_config.tts.voice
+
        return ElevenLabsTTSService(
            reconnect_on_error=False,
            api_key=user_config.tts.api_key,
@ -103,6 +142,35 @@ def create_tts_service(user_config, audio_config: "AudioConfig"):
            voice=user_config.tts.voice.value,
            text_filters=[xml_function_tag_filter],
        )
+    elif user_config.tts.provider == ServiceProviders.SARVAM.value:
+        # Map Sarvam language code to pipecat Language enum for TTS
+        language_mapping = {
+            "bn-IN": Language.BN,
+            "en-IN": Language.EN,
+            "gu-IN": Language.GU,
+            "hi-IN": Language.HI,
+            "kn-IN": Language.KN,
+            "ml-IN": Language.ML,
+            "mr-IN": Language.MR,
+            "od-IN": Language.OR,
+            "pa-IN": Language.PA,
+            "ta-IN": Language.TA,
+            "te-IN": Language.TE,
+        }
+        language = getattr(user_config.tts, "language", None)
+        language_value = language.value if hasattr(language, "value") else language
+        pipecat_language = language_mapping.get(language_value, Language.HI)
+
+        voice = getattr(user_config.tts, "voice", None)
+        voice_value = voice.value if hasattr(voice, "value") else (voice or "anushka")
+
+        return SarvamTTSService(
+            api_key=user_config.tts.api_key,
+            model=user_config.tts.model.value,
+            voice_id=voice_value,
+            params=SarvamTTSService.InputParams(language=pipecat_language),
+            text_filters=[xml_function_tag_filter],
+        )
    else:
        raise HTTPException(
            status_code=400, detail=f"Invalid TTS provider {user_config.tts.provider}"
--- a/api/tests/test_configuration_masking_merge.py
+++ b/api/tests/test_configuration_masking_merge.py
@ -5,7 +5,6 @@ from api.schemas.user_configuration import UserConfiguration
 from api.services.configuration.masking import is_mask_of, mask_key, mask_user_config
 from api.services.configuration.merge import merge_user_configurations
 from api.services.configuration.registry import (
-    GroqModel,
    OpenAILLMService,
 )

@ -70,7 +69,7 @@ def test_merge_drops_old_key_when_provider_changes():
    incoming_partial = {
        "llm": {
            "provider": "groq",
-            "model": GroqModel.LLAMA_3_3_70B,
+            "model": "llama-3.3-70b-versatile",
            # api_key intentionally absent – should NOT inherit old key
        }
    }
--- a/docs/configurations/inference-providers.mdx
+++ b/docs/configurations/inference-providers.mdx
@ -1,18 +1,17 @@
 ---
-title: "Inference Provider"
-description: "Dograh ships with its own inferencing engine, which is hosted at https://services.dograh.com. The inference service provides LLM, TTS and STT services. In this document you can see how you can configure the inferencing engine to your favourite provider, like OpenAI, Gemini etc."
+title: "Model Configurations"
+description: "Voice Agents need AI models to work: an LLM (Large Language Model), TTS (Voice) and STT (Transcriber). You can use any of your favourite providers with the Dograh Platform to run your Voice Agent."
 ---

-## Configure Inference Provider
+## Configure Models
+Dograh Platform ships with its own models by default. When you sign up on https://app.dograh.com or set up the platform on your own self-hosted infrastructure, you get some Dograh model credits by default.

-You can go to `https://app.dograh.com/model-configurations` if you are on hosted version of Dograh or go to `http://localhost:3010/model-configurations` if you are running Dograh locally.
+If you wish to change the models to a provider of your own choice, you can go to `https://app.dograh.com/model-configurations` if you are on the hosted version of Dograh or go to `http://localhost:3010/model-configurations` if you are running Dograh locally.

 You can see the configuration for the inference provider in the following screenshot.

 ![Model Configuration](../images/service-configuration.png)

-You can select the provider from the dropdown and configure the API key, model, etc. You can see [API Keys](api-keys) documentation for instructions on how to create Service Keys to be used in Model Configuration.
+You can select the provider from the dropdown and configure the API key, model, etc. For Dograh, see the [Service Keys](api-keys) documentation for instructions on how to create Service Keys to be used in Model Configuration.

-## Next Steps
-
-You can see how to configure the telephony provider in [Telephony Integrations](/telephony/twilio).
+The following pages document in depth the various AI models that you can configure.
--- a/docs/configurations/llm.mdx
+++ b/docs/configurations/llm.mdx
@ -0,0 +1,12 @@
+---
+title: "LLM"
+description: "Voice Agents use LLM (Large Language Models), which are trained to understand the conversational context, and respond to users."
+---
+
+You can currently use OpenAI, Google, Groq, Azure and Dograh LLMs in the LLM configuration. Some models are provided by default for you to choose from the dropdown.
+
+![Select Models from DropDown](../images/models_dropdown.png)
+
+If you don't find a model in the dropdown, you can always add a model manually.
+
+![Add Model Manually](../images/add_model_manually.png)
--- a/docs/configurations/transcriber.mdx
+++ b/docs/configurations/transcriber.mdx
@ -0,0 +1,8 @@
+---
+title: "Transcriber"
+description: "Voice Agents use STT (Speech to Text) to transcribe what the user speaks. The transcribed text goes into an LLM to generate the response that gets played out to the user."
+---
+
+The Dograh platform ships with Deepgram, Cartesia, OpenAI and Dograh transcribers by default. Refer to the provider's documentation to decide which language to select for your language requirements.
+
+Example: Deepgram has their language support documentation at https://developers.deepgram.com/docs/models-languages-overview#nova-3
--- a/docs/configurations/voice.mdx
+++ b/docs/configurations/voice.mdx
@ -0,0 +1,10 @@
+---
+title: "Voice"
+description: "Voice Agents use TTS (Text to Speech) to turn the text that LLMs generate during the course of a conversation into audio. This is the audio that the end user having the conversation listens to."
+---
+
+The Dograh platform ships with Elevenlabs, Deepgram, OpenAI and Dograh TTS engines by default. Some voices from these providers are available out of the box. You can refer to the provider's API documentation to select a voice ID that's most relevant for your language requirement.
+
+If you don't find your favourite voice, you can always add the voice ID manually.
+
+![Add Voice Manually](../images/add_tts_manually.png)
--- a/docs/docs.json
+++ b/docs/docs.json
@ -30,6 +30,9 @@
            "group": "Configurations",
            "pages": [
              "configurations/inference-providers",
+              "configurations/llm",
+              "configurations/voice",
+              "configurations/transcriber",
              "configurations/api-keys"
            ]
          },
--- a/docs/images/add_model_manually.png
+++ b/docs/images/add_model_manually.png
--- a/docs/images/add_tts_manually.png
+++ b/docs/images/add_tts_manually.png
--- a/docs/images/models_dropdown.png
+++ b/docs/images/models_dropdown.png
--- a/ui/src/app/api-keys/page.tsx
+++ b/ui/src/app/api-keys/page.tsx
@ -460,7 +460,7 @@ export default function APIKeysPage() {
                                            {showServiceArchived ? 'Hide' : 'Show'} Archived
                                        </Button>
                                    )}
-                                    {canCreateServiceKey && (
+                                    {canCreateServiceKey ? (
                                        <Button
                                            onClick={() => setIsCreateServiceDialogOpen(true)}
                                            size="sm"
@ -468,6 +468,10 @@ export default function APIKeysPage() {
                                            <Plus className="w-4 h-4 mr-2" />
                                            Create Service Key
                                        </Button>
+                                    ) : (
+                                        <span className="text">
+                                            To generate additional service keys, <a href="https://app.dograh.com" target="_blank" rel="noopener noreferrer" className="text-primary hover:underline">Sign up on app.dograh.com</a>
+                                        </span>
                                    )}
                                </div>
                            </div>
--- a/ui/src/client/sdk.gen.ts
+++ b/ui/src/client/sdk.gen.ts
--- a/ui/src/client/types.gen.ts
+++ b/ui/src/client/types.gen.ts
@ -650,6 +650,21 @@ export type VobizConfigurationResponse = {
    from_numbers: Array<string>;
 };

+export type VoiceInfo = {
+    voice_id: string;
+    name: string;
+    description?: string | null;
+    accent?: string | null;
+    gender?: string | null;
+    language?: string | null;
+    preview_url?: string | null;
+};
+
+export type VoicesResponse = {
+    provider: string;
+    voices: Array<VoiceInfo>;
+};
+
 /**
 * Request schema for Vonage configuration.
 */
@ -1828,6 +1843,40 @@ export type ReactivateApiKeyApiV1UserApiKeysApiKeyIdReactivatePutResponses = {

 export type ReactivateApiKeyApiV1UserApiKeysApiKeyIdReactivatePutResponse = ReactivateApiKeyApiV1UserApiKeysApiKeyIdReactivatePutResponses[keyof ReactivateApiKeyApiV1UserApiKeysApiKeyIdReactivatePutResponses];

+export type GetVoicesApiV1UserConfigurationsVoicesProviderGetData = {
+    body?: never;
+    headers?: {
+        authorization?: string | null;
+    };
+    path: {
+        provider: 'elevenlabs' | 'deepgram' | 'sarvam' | 'cartesia' | 'dograh';
+    };
+    query?: never;
+    url: '/api/v1/user/configurations/voices/{provider}';
+};
+
+export type GetVoicesApiV1UserConfigurationsVoicesProviderGetErrors = {
+    /**
+     * Not found
+     */
+    404: unknown;
+    /**
+     * Validation Error
+     */
+    422: HttpValidationError;
+};
+
+export type GetVoicesApiV1UserConfigurationsVoicesProviderGetError = GetVoicesApiV1UserConfigurationsVoicesProviderGetErrors[keyof GetVoicesApiV1UserConfigurationsVoicesProviderGetErrors];
+
+export type GetVoicesApiV1UserConfigurationsVoicesProviderGetResponses = {
+    /**
+     * Successful Response
+     */
+    200: VoicesResponse;
+};
+
+export type GetVoicesApiV1UserConfigurationsVoicesProviderGetResponse = GetVoicesApiV1UserConfigurationsVoicesProviderGetResponses[keyof GetVoicesApiV1UserConfigurationsVoicesProviderGetResponses];
+
 export type CreateCampaignApiV1CampaignCreatePostData = {
    body: CreateCampaignRequest;
    headers?: {
--- a/ui/src/components/ServiceConfiguration.tsx
+++ b/ui/src/components/ServiceConfiguration.tsx
@ -6,10 +6,12 @@ import { useForm } from "react-hook-form";
 import { getDefaultConfigurationsApiV1UserConfigurationsDefaultsGet } from '@/client/sdk.gen';
 import { Button } from "@/components/ui/button";
 import { Card, CardContent } from "@/components/ui/card";
+import { Checkbox } from "@/components/ui/checkbox";
 import { Input } from "@/components/ui/input";
 import { Label } from "@/components/ui/label";
 import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select";
 import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
+import { VoiceSelector } from "@/components/VoiceSelector";
 import { useUserConfig } from "@/context/UserConfigContext";

 type ServiceSegment = "llm" | "tts" | "stt";
@ -18,6 +20,7 @@ interface SchemaProperty {
    type?: string;
    default?: string | number | boolean;
    enum?: string[];
+    examples?: string[];
    $ref?: string;
    description?: string;
    format?: string;
@ -40,6 +43,65 @@ const TAB_CONFIG: { key: ServiceSegment; label: string }[] = [
    { key: "stt", label: "Transcriber" },
 ];

+// Display names for language codes (Deepgram + Sarvam).
+// Looked up by `getDisplayName` when an enum-backed `language` field is rendered;
+// a code that is missing from this table is shown as the raw code instead.
+const LANGUAGE_DISPLAY_NAMES: Record<string, string> = {
+    // Deepgram languages
+    "multi": "Multilingual (Auto-detect)",
+    "en": "English",
+    "en-US": "English (US)",
+    "en-GB": "English (UK)",
+    "en-AU": "English (Australia)",
+    "en-IN": "English (India)",
+    "es": "Spanish",
+    "es-419": "Spanish (Latin America)",
+    "fr": "French",
+    "fr-CA": "French (Canada)",
+    "de": "German",
+    "it": "Italian",
+    "pt": "Portuguese",
+    "pt-BR": "Portuguese (Brazil)",
+    "nl": "Dutch",
+    "hi": "Hindi",
+    "ja": "Japanese",
+    "ko": "Korean",
+    "zh-CN": "Chinese (Simplified)",
+    "zh-TW": "Chinese (Traditional)",
+    "ru": "Russian",
+    "pl": "Polish",
+    "tr": "Turkish",
+    "uk": "Ukrainian",
+    "vi": "Vietnamese",
+    "sv": "Swedish",
+    "da": "Danish",
+    "no": "Norwegian",
+    "fi": "Finnish",
+    "id": "Indonesian",
+    "th": "Thai",
+    // Sarvam Indian languages
+    "bn-IN": "Bengali",
+    "gu-IN": "Gujarati",
+    "hi-IN": "Hindi",
+    "kn-IN": "Kannada",
+    "ml-IN": "Malayalam",
+    "mr-IN": "Marathi",
+    "od-IN": "Odia",
+    "pa-IN": "Punjabi",
+    "ta-IN": "Tamil",
+    "te-IN": "Telugu",
+    "as-IN": "Assamese",
+};
+
+// Display names for Sarvam voices.
+// Looked up by `getDisplayName` when an enum-backed `voice` field is rendered;
+// a voice id that is missing from this table is shown as the raw id instead.
+const VOICE_DISPLAY_NAMES: Record<string, string> = {
+    "anushka": "Anushka (Female)",
+    "manisha": "Manisha (Female)",
+    "vidya": "Vidya (Female)",
+    "arya": "Arya (Female)",
+    "abhilash": "Abhilash (Male)",
+    "karun": "Karun (Male)",
+    "hitesh": "Hitesh (Male)",
+};
+
 export default function ServiceConfiguration() {
    const [apiError, setApiError] = useState<string | null>(null);
    const [isSaving, setIsSaving] = useState(false);
@ -54,6 +116,8 @@ export default function ServiceConfiguration() {
        tts: "",
        stt: ""
    });
+    const [isManualModelInput, setIsManualModelInput] = useState(false);
+    const [hasCheckedManualMode, setHasCheckedManualMode] = useState(false);

    const {
        register,
@ -119,6 +183,29 @@ export default function ServiceConfiguration() {
        fetchConfigurations();
    }, [reset, userConfig]);

+    // One-time check: if the saved LLM model is not among the schema's suggested
+    // models, start the model field in manual-entry mode.
+    useEffect(() => {
+        if (hasCheckedManualMode) {
+            return;
+        }
+
+        const llmProviderSchema = schemas?.llm?.[serviceProviders.llm];
+        if (!llmProviderSchema) {
+            return;
+        }
+
+        // Follow a `$ref` into `$defs` when the model property is declared indirectly.
+        const declaredModel = llmProviderSchema.properties.model;
+        const resolvedModel = declaredModel?.$ref && llmProviderSchema.$defs
+            ? llmProviderSchema.$defs[declaredModel.$ref.split('/').pop() || '']
+            : declaredModel;
+
+        const suggestedModels = resolvedModel?.examples;
+        const savedModel = userConfig?.llm?.model;
+        if (!suggestedModels || !savedModel) {
+            // Nothing to compare yet; leave the "checked" flag unset so a later run can decide.
+            return;
+        }
+
+        if (!suggestedModels.includes(savedModel as string)) {
+            setIsManualModelInput(true);
+        }
+        setHasCheckedManualMode(true);
+    }, [schemas, serviceProviders.llm, userConfig?.llm?.model, hasCheckedManualMode]);
+
    const handleProviderChange = (service: ServiceSegment, providerName: string) => {
        if (!providerName) {
            return;
@ -147,6 +234,11 @@ export default function ServiceConfiguration() {
        preservedValues[`${service}_provider`] = providerName;
        reset(preservedValues);
        setServiceProviders(prev => ({ ...prev, [service]: providerName }));
+
+        // Reset manual model input when LLM provider changes
+        if (service === "llm") {
+            setIsManualModelInput(false);
+        }
    }


@ -266,7 +358,7 @@ export default function ServiceConfiguration() {
                    <div className="space-y-2">
                        <Label>API Key</Label>
                        <Input
-                            type="password"
+                            type="text"
                            placeholder="Enter API key"
                            {...register(`${service}_api_key`, {
                                required: providerSchema.required?.includes("api_key"),
@ -291,7 +383,113 @@ export default function ServiceConfiguration() {
            ? providerSchema.$defs[schema.$ref.split('/').pop() || '']
            : schema;

+        // Use VoiceSelector for voice field in TTS service (except Sarvam which uses enum)
+        if (service === "tts" && field === "voice") {
+            const currentProvider = serviceProviders.tts;
+            // Sarvam uses enum-based voice selection, not VoiceSelector
+            if (currentProvider !== "sarvam" && !actualSchema?.enum) {
+                return (
+                    <VoiceSelector
+                        provider={currentProvider}
+                        value={watch(`${service}_${field}`) as string || ""}
+                        onChange={(voiceId) => {
+                            setValue(`${service}_${field}`, voiceId, { shouldDirty: true });
+                        }}
+                    />
+                );
+            }
+        }
+
+        // Handle LLM model field with manual input toggle (uses examples from schema)
+        if (service === "llm" && field === "model" && actualSchema?.examples) {
+            const currentValue = watch(`${service}_${field}`) as string || "";
+            const modelOptions = actualSchema.examples;
+
+            if (isManualModelInput) {
+                return (
+                    <div className="space-y-2">
+                        <Input
+                            type="text"
+                            placeholder="Enter model name"
+                            value={currentValue}
+                            onChange={(e) => {
+                                setValue(`${service}_${field}`, e.target.value, { shouldDirty: true });
+                            }}
+                        />
+                        <div className="flex items-center space-x-2">
+                            <Checkbox
+                                id="manual-model-input"
+                                checked={isManualModelInput}
+                                onCheckedChange={(checked) => {
+                                    setIsManualModelInput(checked as boolean);
+                                    if (!checked && modelOptions.length > 0) {
+                                        // Reset to first option when switching back
+                                        setValue(`${service}_${field}`, modelOptions[0], { shouldDirty: true });
+                                    }
+                                }}
+                            />
+                            <Label
+                                htmlFor="manual-model-input"
+                                className="text-sm font-normal cursor-pointer"
+                            >
+                                Add Model Manually
+                            </Label>
+                        </div>
+                    </div>
+                );
+            }
+
+            return (
+                <div className="space-y-2">
+                    <Select
+                        value={currentValue}
+                        onValueChange={(value) => {
+                            if (!value) return;
+                            setValue(`${service}_${field}`, value, { shouldDirty: true });
+                        }}
+                    >
+                        <SelectTrigger className="w-full">
+                            <SelectValue placeholder="Select model" />
+                        </SelectTrigger>
+                        <SelectContent>
+                            {modelOptions.map((value: string) => (
+                                <SelectItem key={value} value={value}>
+                                    {value}
+                                </SelectItem>
+                            ))}
+                        </SelectContent>
+                    </Select>
+                    <div className="flex items-center space-x-2">
+                        <Checkbox
+                            id="manual-model-input-dropdown"
+                            checked={isManualModelInput}
+                            onCheckedChange={(checked) => {
+                                setIsManualModelInput(checked as boolean);
+                            }}
+                        />
+                        <Label
+                            htmlFor="manual-model-input-dropdown"
+                            className="text-sm font-normal cursor-pointer"
+                        >
+                            Add Model Manually
+                        </Label>
+                    </div>
+                </div>
+            );
+        }
+
        if (actualSchema?.enum) {
+            // Use friendly display names for language and voice fields
+            const getDisplayName = (value: string) => {
+                if (field === "language") {
+                    return LANGUAGE_DISPLAY_NAMES[value] || value;
+                }
+                if (field === "voice") {
+                    return VOICE_DISPLAY_NAMES[value] || value;
+                }
+                return value;
+            };
+
            return (
                <Select
                    value={watch(`${service}_${field}`) as string || ""}
@ -308,7 +506,7 @@ export default function ServiceConfiguration() {
                    <SelectContent>
                        {actualSchema.enum.map((value: string) => (
                            <SelectItem key={value} value={value}>
-                                {value}
+                                {getDisplayName(value)}
                            </SelectItem>
                        ))}
                    </SelectContent>
--- a/ui/src/components/VoiceSelector.tsx
+++ b/ui/src/components/VoiceSelector.tsx
@ -0,0 +1,384 @@
+"use client";
+
+import { ChevronDown, Loader2, Search, Volume2 } from "lucide-react";
+import { useCallback, useEffect, useState } from "react";
+
+import { getVoicesApiV1UserConfigurationsVoicesProviderGet } from "@/client/sdk.gen";
+import { VoiceInfo } from "@/client/types.gen";
+import { Button } from "@/components/ui/button";
+import { Checkbox } from "@/components/ui/checkbox";
+import { Input } from "@/components/ui/input";
+import { Label } from "@/components/ui/label";
+import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
+import { useUserConfig } from "@/context/UserConfigContext";
+import { cn } from "@/lib/utils";
+
+// Providers that have MPS voice endpoints
+type TTSProviderWithVoices = "elevenlabs" | "deepgram" | "sarvam" | "cartesia" | "dograh";
+const MPS_VOICE_PROVIDERS: TTSProviderWithVoices[] = ["elevenlabs", "deepgram", "sarvam", "cartesia", "dograh"];
+
+interface VoiceSelectorProps {
+    provider: string;
+    value: string;
+    onChange: (voiceId: string) => void;
+    className?: string;
+}
+
+export const VoiceSelector: React.FC<VoiceSelectorProps> = ({
+    provider,
+    value,
+    onChange,
+    className,
+}) => {
+    const { accessToken } = useUserConfig();
+    const [isOpen, setIsOpen] = useState(false);
+    const [searchTerm, setSearchTerm] = useState("");
+    const [isManualInput, setIsManualInput] = useState(false);
+    const [manualVoiceId, setManualVoiceId] = useState(value || "");
+    const [voices, setVoices] = useState<VoiceInfo[]>([]);
+    const [isLoading, setIsLoading] = useState(false);
+    const [error, setError] = useState<string | null>(null);
+    const [playingPreview, setPlayingPreview] = useState<string | null>(null);
+    const [currentAudio, setCurrentAudio] = useState<HTMLAudioElement | null>(null);
+
+    // Check if provider has MPS voice endpoint
+    const hasMPSVoiceEndpoint = useCallback((providerName: string): boolean => {
+        return MPS_VOICE_PROVIDERS.includes(providerName.toLowerCase() as TTSProviderWithVoices);
+    }, []);
+
+    // Map a provider name (any casing) to its API-compatible provider key.
+    // The key is resolved against MPS_VOICE_PROVIDERS so there is a single source of
+    // truth for supported providers; an array search also avoids matching inherited
+    // object members (e.g. "constructor") the way a plain-object lookup would.
+    // Returns null when the provider has no voice-listing endpoint.
+    const getProviderKey = useCallback((providerName: string): TTSProviderWithVoices | null => {
+        const normalized = providerName.toLowerCase();
+        return MPS_VOICE_PROVIDERS.find((known) => known === normalized) ?? null;
+    }, []);
+
+    // Load the voice list for the current provider into component state.
+    // Without a supported provider or an access token the list is simply emptied.
+    // Any failure (thrown error or a response without a voice payload) clears the
+    // list and sets a user-visible error message.
+    const fetchVoices = useCallback(async () => {
+        const providerKey = getProviderKey(provider);
+        if (!providerKey || !accessToken) {
+            setVoices([]);
+            return;
+        }
+
+        setIsLoading(true);
+        setError(null);
+
+        try {
+            const response = await getVoicesApiV1UserConfigurationsVoicesProviderGet({
+                path: { provider: providerKey },
+                headers: {
+                    Authorization: `Bearer ${accessToken}`,
+                },
+            });
+
+            const fetchedVoices = response.data?.voices;
+            if (fetchedVoices) {
+                setVoices(fetchedVoices);
+            } else {
+                // The call resolved without a voice payload (e.g. an HTTP error that the
+                // client reports instead of throwing). Do not keep showing the voices
+                // that were loaded for a previously selected provider.
+                setError("Failed to load voices");
+                setVoices([]);
+            }
+        } catch (err) {
+            console.error("Failed to fetch voices:", err);
+            setError("Failed to load voices");
+            setVoices([]);
+        } finally {
+            setIsLoading(false);
+        }
+    }, [provider, getProviderKey, accessToken]);
+
+    useEffect(() => {
+        if (provider) {
+            fetchVoices();
+        }
+    }, [provider, fetchVoices]);
+
+    // Check if the current value exists in the voices list
+    useEffect(() => {
+        if (value && voices.length > 0) {
+            const voiceExists = voices.some((v) => v.voice_id === value);
+            if (!voiceExists) {
+                // If the value doesn't exist in the list, switch to manual input mode
+                setIsManualInput(true);
+                setManualVoiceId(value);
+            }
+        }
+    }, [value, voices]);
+
+    // Whenever the popover is closed while a preview audio element is held,
+    // pause it, rewind it to the start, and clear the preview state.
+    useEffect(() => {
+        if (!isOpen && currentAudio) {
+            currentAudio.pause();
+            currentAudio.currentTime = 0;
+            setCurrentAudio(null);
+            setPlayingPreview(null);
+        }
+    }, [isOpen, currentAudio]);
+
+    // Cleanup on unmount
+    useEffect(() => {
+        return () => {
+            if (currentAudio) {
+                currentAudio.pause();
+            }
+        };
+    }, [currentAudio]);
+
+    // Case-insensitive substring search over every textual field of a voice.
+    // An empty search term matches all voices.
+    const loweredSearchTerm = searchTerm.toLowerCase();
+    const filteredVoices = voices.filter(({ name, voice_id, description, accent, gender, language }) =>
+        [name, voice_id, description, accent, gender, language].some(
+            (fieldText) => (fieldText?.toLowerCase() || "").includes(loweredSearchTerm)
+        )
+    );
+
+    const handleSelectVoice = (voiceId: string) => {
+        onChange(voiceId);
+        setIsOpen(false);
+        setSearchTerm("");
+    };
+
+    // Switch between the voice dropdown and free-text voice ID entry.
+    const handleManualInputToggle = (checked: boolean) => {
+        setIsManualInput(checked);
+
+        if (checked) {
+            // Seed the text box with whatever is currently selected.
+            setManualVoiceId(value || "");
+            return;
+        }
+
+        // Back to the dropdown: a value that is not in the loaded list cannot be
+        // shown there, so fall back to the first available voice.
+        const valueIsListed = voices.some((v) => v.voice_id === value);
+        if (!valueIsListed && voices.length > 0) {
+            onChange(voices[0].voice_id);
+        }
+    };
+
+    const handleManualVoiceIdChange = (newValue: string) => {
+        setManualVoiceId(newValue);
+        onChange(newValue);
+    };
+
+    const getSelectedVoiceName = () => {
+        if (isManualInput && value) {
+            return value;
+        }
+        const voice = voices.find((v) => v.voice_id === value);
+        return voice?.name || value || "Select a voice";
+    };
+
+    // Toggle the audio preview for a voice.
+    // - Any preview that is currently playing is stopped and rewound first.
+    // - Clicking the voice that is already playing only stops it.
+    // - Otherwise a new audio element is created for `previewUrl` and played.
+    const playPreview = (previewUrl: string, voiceId: string) => {
+        // Stop current audio if playing
+        if (currentAudio) {
+            currentAudio.pause();
+            currentAudio.currentTime = 0;
+            setCurrentAudio(null);
+        }
+
+        // If clicking the same voice that's playing, just stop it
+        if (playingPreview === voiceId) {
+            setPlayingPreview(null);
+            return;
+        }
+
+        const audio = new Audio(previewUrl);
+
+        // Clear the preview state only if it still belongs to this audio element.
+        // Pausing a preview whose play() is still pending rejects that promise, so
+        // the callbacks of a superseded element can fire after a newer preview has
+        // started; they must not wipe the newer preview's state.
+        const clearIfStillCurrent = () => {
+            setCurrentAudio((active) => (active === audio ? null : active));
+            setPlayingPreview((activeId) => (activeId === voiceId ? null : activeId));
+        };
+
+        audio.onended = clearIfStillCurrent;
+        audio.onerror = clearIfStillCurrent;
+
+        setPlayingPreview(voiceId);
+        setCurrentAudio(audio);
+        audio.play().catch(clearIfStillCurrent);
+    };
+
+    // For providers without MPS voice endpoint, show simple input
+    if (!hasMPSVoiceEndpoint(provider)) {
+        return (
+            <div className={cn("space-y-2", className)}>
+                <Input
+                    type="text"
+                    placeholder="Enter voice ID"
+                    value={value || ""}
+                    onChange={(e) => onChange(e.target.value)}
+                />
+            </div>
+        );
+    }
+
+    if (isManualInput) {
+        return (
+            <div className={cn("space-y-2", className)}>
+                <Input
+                    type="text"
+                    placeholder="Enter voice ID"
+                    value={manualVoiceId}
+                    onChange={(e) => handleManualVoiceIdChange(e.target.value)}
+                />
+                <div className="flex items-center space-x-2">
+                    <Checkbox
+                        id="manual-voice-input"
+                        checked={isManualInput}
+                        onCheckedChange={(checked) => handleManualInputToggle(checked as boolean)}
+                    />
+                    <Label
+                        htmlFor="manual-voice-input"
+                        className="text-sm font-normal cursor-pointer"
+                    >
+                        Add Voice ID Manually
+                    </Label>
+                </div>
+            </div>
+        );
+    }
+
+    return (
+        <div className={cn("space-y-2", className)}>
+            <Popover open={isOpen} onOpenChange={setIsOpen}>
+                <PopoverTrigger asChild>
+                    <Button
+                        variant="outline"
+                        role="combobox"
+                        aria-expanded={isOpen}
+                        className={cn(
+                            "w-full justify-between",
+                            !value && "text-muted-foreground"
+                        )}
+                        disabled={isLoading}
+                    >
+                        <span className="truncate">
+                            {isLoading ? "Loading voices..." : getSelectedVoiceName()}
+                        </span>
+                        {isLoading ? (
+                            <Loader2 className="ml-2 h-4 w-4 shrink-0 animate-spin" />
+                        ) : (
+                            <ChevronDown className="ml-2 h-4 w-4 shrink-0 opacity-50" />
+                        )}
+                    </Button>
+                </PopoverTrigger>
+                <PopoverContent className="w-[400px] p-0" align="start">
+                    <div className="p-2 space-y-2">
+                        <div className="relative">
+                            <Search className="absolute left-2 top-2.5 h-4 w-4 text-muted-foreground" />
+                            <Input
+                                placeholder="Search voices..."
+                                value={searchTerm}
+                                onChange={(e) => setSearchTerm(e.target.value)}
+                                className="pl-8"
+                            />
+                        </div>
+
+                        <div className="max-h-[300px] overflow-auto space-y-1">
+                            {error ? (
+                                <p className="text-sm text-red-500 text-center py-4">
+                                    {error}
+                                </p>
+                            ) : isLoading ? (
+                                <div className="flex items-center justify-center py-4">
+                                    <Loader2 className="h-6 w-6 animate-spin text-muted-foreground" />
+                                </div>
+                            ) : filteredVoices.length === 0 ? (
+                                <p className="text-sm text-muted-foreground text-center py-4">
+                                    No voices found
+                                </p>
+                            ) : (
+                                filteredVoices.map((voice) => (
+                                    <div
+                                        key={voice.voice_id}
+                                        className={cn(
+                                            "flex items-start space-x-3 p-2 hover:bg-accent rounded-sm cursor-pointer",
+                                            value === voice.voice_id && "bg-accent"
+                                        )}
+                                        onClick={() => handleSelectVoice(voice.voice_id)}
+                                    >
+                                        <div className="flex-1 min-w-0">
+                                            <div className="flex items-center gap-2">
+                                                <p className="text-sm font-medium truncate">
+                                                    {voice.name}
+                                                </p>
+                                                {voice.gender && (
+                                                    <span className="text-xs text-muted-foreground capitalize">
+                                                        {voice.gender}
+                                                    </span>
+                                                )}
+                                            </div>
+                                            {voice.description && (
+                                                <p className="text-xs text-muted-foreground line-clamp-2">
+                                                    {voice.description}
+                                                </p>
+                                            )}
+                                            <div className="flex items-center gap-2 mt-1">
+                                                {voice.accent && (
+                                                    <span className="text-xs bg-secondary px-1.5 py-0.5 rounded capitalize">
+                                                        {voice.accent}
+                                                    </span>
+                                                )}
+                                                {voice.language && (
+                                                    <span className="text-xs bg-secondary px-1.5 py-0.5 rounded uppercase">
+                                                        {voice.language}
+                                                    </span>
+                                                )}
+                                            </div>
+                                        </div>
+                                        {voice.preview_url && (
+                                            <Button
+                                                variant="ghost"
+                                                size="sm"
+                                                className="h-8 w-8 p-0 shrink-0"
+                                                onClick={(e) => {
+                                                    e.stopPropagation();
+                                                    playPreview(voice.preview_url!, voice.voice_id);
+                                                }}
+                                            >
+                                                <Volume2
+                                                    className={cn(
+                                                        "h-4 w-4",
+                                                        playingPreview === voice.voice_id &&
+                                                            "text-primary animate-pulse"
+                                                    )}
+                                                />
+                                            </Button>
+                                        )}
+                                    </div>
+                                ))
+                            )}
+                        </div>
+
+                        <div className="pt-2 border-t flex items-center justify-between">
+                            <div className="flex items-center space-x-2">
+                                <Checkbox
+                                    id="manual-voice-input-popup"
+                                    checked={isManualInput}
+                                    onCheckedChange={(checked) => {
+                                        handleManualInputToggle(checked as boolean);
+                                        if (checked) {
+                                            setIsOpen(false);
+                                        }
+                                    }}
+                                />
+                                <Label
+                                    htmlFor="manual-voice-input-popup"
+                                    className="text-sm font-normal cursor-pointer"
+                                >
+                                    Add Voice ID Manually
+                                </Label>
+                            </div>
+                            <p className="text-xs text-muted-foreground">
+                                {voices.length} voices available
+                            </p>
+                        </div>
+                    </div>
+                </PopoverContent>
+            </Popover>
+        </div>
+    );
+};