feat: add voice selectors in elevenlabs (#88)

This commit is contained in:
Abhishek 2025-12-25 15:05:53 +05:30 committed by GitHub
parent 480e8a5f60
commit 45c5b7c304
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 978 additions and 166 deletions

View file

@ -20,7 +20,7 @@ RUN pip install --user --no-cache-dir -r requirements.txt && \
# Copy and install pipecat from local submodule
COPY pipecat /tmp/pipecat
RUN pip install --user --no-cache-dir '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,soundfile,silero,webrtc,local-smart-turn-v3]' && \
RUN pip install --user --no-cache-dir '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,local-smart-turn-v3]' && \
# Clean up pip cache and temporary pipecat directory
rm -rf /root/.cache/pip /tmp/pipecat

View file

@ -111,9 +111,7 @@ def apply_workflow_run_filters(
# (subscript [] only works in PostgreSQL 14+)
filter_conditions.append(
cast(WorkflowRunModel.gathered_context, JSONB)
.op("->>")(
"mapped_call_disposition"
)
.op("->>")("mapped_call_disposition")
.in_(codes)
)
@ -147,9 +145,7 @@ def apply_workflow_run_filters(
# Use ->> operator for compatibility with all PostgreSQL versions
filter_conditions.append(
cast(WorkflowRunModel.initial_context, JSONB)
.op("->>")(
"phone"
)
.op("->>")("phone")
.contains(phone)
)
@ -178,13 +174,9 @@ def apply_workflow_run_filters(
"total_cost_usd"
)
if min_val is not None:
filter_conditions.append(
cast(cost_text, Integer) >= min_val
)
filter_conditions.append(cast(cost_text, Integer) >= min_val)
if max_val is not None:
filter_conditions.append(
cast(cost_text, Integer) <= max_val
)
filter_conditions.append(cast(cost_text, Integer) <= max_val)
if filter_conditions:
base_query = base_query.where(and_(*filter_conditions))

View file

@ -1,7 +1,8 @@
from datetime import datetime, timedelta
from typing import List, Optional, TypedDict, Union
from typing import List, Literal, Optional, TypedDict, Union
from fastapi import APIRouter, Depends, HTTPException, Query
from loguru import logger
from pydantic import BaseModel
from api.db import db_client
@ -17,6 +18,7 @@ from api.services.configuration.defaults import DEFAULT_SERVICE_PROVIDERS
from api.services.configuration.masking import mask_user_config
from api.services.configuration.merge import merge_user_configurations
from api.services.configuration.registry import REGISTRY, ServiceType
from api.services.mps_service_key_client import mps_service_key_client
router = APIRouter(prefix="/user")
@ -274,3 +276,46 @@ async def reactivate_api_key(
raise HTTPException(status_code=500, detail="Failed to reactivate API key")
return {"success": True, "message": "API key reactivated successfully"}
# Voice Configuration Endpoints
TTSProvider = Literal["elevenlabs", "deepgram", "sarvam", "cartesia", "dograh"]
class VoiceInfo(BaseModel):
"""A single voice entry returned by the voices endpoint."""
# Only ``voice_id`` and ``name`` are required; every other field is
# optional and defaults to None when it is absent from the payload.
voice_id: str
name: str
description: Optional[str] = None
accent: Optional[str] = None
gender: Optional[str] = None
language: Optional[str] = None
preview_url: Optional[str] = None
class VoicesResponse(BaseModel):
"""Response body of the voices endpoint: a provider name plus its voices."""
provider: str
voices: List[VoiceInfo]
@router.get("/configurations/voices/{provider}")
async def get_voices(
    provider: TTSProvider,
    user: UserModel = Depends(get_user),
) -> VoicesResponse:
    """Get available voices for a TTS provider.

    The voice list is fetched through ``mps_service_key_client`` and each
    entry is converted into a ``VoiceInfo``.

    Args:
        provider: TTS provider whose voices should be listed.
        user: Authenticated user; the selected organization ID and the user's
            provider ID are forwarded with the lookup.

    Returns:
        The provider name and the list of voices it offers.

    Raises:
        HTTPException: With status 500 if the lookup or the conversion of
            the returned payload fails.
    """
    try:
        result = await mps_service_key_client.get_voices(
            provider=provider,
            organization_id=user.selected_organization_id,
            created_by=user.provider_id,
        )
        # Treat a missing or null "voices" value as "no voices" rather than
        # failing while iterating over None.
        raw_voices = result.get("voices") or []
        return VoicesResponse(
            provider=result.get("provider", provider),
            voices=[VoiceInfo(**voice) for voice in raw_voices],
        )
    except Exception as e:
        logger.error(f"Failed to fetch voices for {provider}: {e}")
        # Chain the original exception so its traceback is not lost.
        raise HTTPException(
            status_code=500,
            detail=f"Failed to fetch voices for {provider}",
        ) from e

View file

@ -11,10 +11,8 @@ from api.db.models import UserModel
from api.schemas.user_configuration import UserConfiguration
from api.services.auth.stack_auth import stackauth
from api.services.configuration.registry import (
DograhLLMModel,
DograhSTTModel,
DograhTTSModel,
DograhVoice,
ServiceProviders,
)
@ -244,13 +242,13 @@ async def create_user_configuration_with_mps_key(
"llm": {
"provider": ServiceProviders.DOGRAH.value,
"api_key": service_key,
"model": DograhLLMModel.DEFAULT.value, # Default model
"model": "default", # Default model
},
"tts": {
"provider": ServiceProviders.DOGRAH.value,
"api_key": service_key,
"model": DograhTTSModel.DEFAULT.value, # Default model
"voice": DograhVoice.DEFAULT.value, # Default voice
"voice": "default", # Default voice
},
"stt": {
"provider": ServiceProviders.DOGRAH.value,

View file

@ -38,6 +38,7 @@ class UserConfigurationValidator:
ServiceProviders.AZURE.value: self._check_azure_api_key,
ServiceProviders.CARTESIA.value: self._check_cartesia_api_key,
ServiceProviders.DOGRAH.value: self._check_dograh_api_key,
ServiceProviders.SARVAM.value: self._check_sarvam_api_key,
}
async def validate(self, configuration: UserConfiguration) -> APIKeyStatusResponse:
@ -134,20 +135,5 @@ class UserConfigurationValidator:
def _check_dograh_api_key(self, model: str, api_key: str) -> bool:
return True
# def _check_neuphonic_api_key(self, model: str, api_key: str) -> bool:
# if not Neuphonic:
# self._provider_api_key_validity_status[model] = False
# return self._provider_api_key_validity_status[model]
# if model in self._provider_api_key_validity_status:
# return self._provider_api_key_validity_status[model]
# client = Neuphonic(api_key=api_key)
# try:
# response = client.voices.list() # get's all available voices
# voices = response.data["voices"]
# self._provider_api_key_validity_status[model] = True
# except Exception:
# self._provider_api_key_validity_status[model] = False
# return self._provider_api_key_validity_status[model]
def _check_sarvam_api_key(self, model: str, api_key: str) -> bool:
return True

View file

@ -20,6 +20,7 @@ class ServiceProviders(str, Enum):
GOOGLE = "google"
AZURE = "azure"
DOGRAH = "dograh"
SARVAM = "sarvam"
class BaseServiceConfiguration(BaseModel):
@ -31,6 +32,7 @@ class BaseServiceConfiguration(BaseModel):
ServiceProviders.GOOGLE,
ServiceProviders.AZURE,
ServiceProviders.DOGRAH,
# ServiceProviders.SARVAM,
]
api_key: str
@ -92,82 +94,56 @@ def register_stt(cls: Type[BaseSTTConfiguration]):
###################################################### LLM ########################################################################
class OpenAIModel(str, Enum):
GPT3_5_TURBO = "gpt-3.5-turbo"
GPT4_1 = "gpt-4.1"
GPT4_1_MINI = "gpt-4.1-mini"
GPT4_1_NANO = "gpt-4.1-nano"
GPT5 = "gpt-5"
GPT5_MINI = "gpt-5-mini"
GPT5_NANO = "gpt-5-nano"
# Suggested models for each provider (used for UI dropdown)
OPENAI_MODELS = ["gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "gpt-5", "gpt-5-mini", "gpt-5-nano", "gpt-3.5-turbo"]
GOOGLE_MODELS = ["gemini-2.0-flash", "gemini-2.0-flash-lite", "gemini-2.5-flash", "gemini-2.5-flash-lite"]
GROQ_MODELS = [
"llama-3.3-70b-versatile",
"deepseek-r1-distill-llama-70b",
"qwen-qwq-32b",
"meta-llama/llama-4-scout-17b-16e-instruct",
"meta-llama/llama-4-maverick-17b-128e-instruct",
"gemma2-9b-it",
"llama-3.1-8b-instant",
"openai/gpt-oss-120b",
]
AZURE_MODELS = ["gpt-4.1-mini"]
DOGRAH_LLM_MODELS = ["default", "accurate", "fast", "lite", "zen", "zen_lite"]
@register_llm
class OpenAILLMService(BaseLLMConfiguration):
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
model: OpenAIModel = OpenAIModel.GPT4_1
model: str = Field(default="gpt-4.1", json_schema_extra={"examples": OPENAI_MODELS})
api_key: str
class GoogleModel(str, Enum):
GEMINI_2_0_FLASH = "gemini-2.0-flash"
GEMINI_2_0_FLASH_LITE = "gemini-2.0-flash-lite"
GEMINI_2_5_FLASH = "gemini-2.5-flash"
GEMINI_2_5_FLASH_LITE = "gemini-2.5-flash-lite"
@register_llm
class GoogleLLMService(BaseLLMConfiguration):
provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE
model: GoogleModel = GoogleModel.GEMINI_2_0_FLASH
model: str = Field(default="gemini-2.0-flash", json_schema_extra={"examples": GOOGLE_MODELS})
api_key: str
class GroqModel(str, Enum):
LLAMA_3_3_70B = "llama-3.3-70b-versatile"
DEEPSEEK_R1_DISTILL_LLAMA_70B = "deepseek-r1-distill-llama-70b"
QUEN_QWQ_32B = "qwen-qwq-32b"
LLAMA_4_SCOUT_17B_16E_INSTRUCT = "meta-llama/llama-4-scout-17b-16e-instruct"
LLAMA_4_MAVERICK_17B_128E_INSTRUCT = "meta-llama/llama-4-maverick-17b-128e-instruct"
GEMMA2_9B_IT = "gemma2-9b-it"
LLAMA_3_1_8B_INSTANT = "llama-3.1-8b-instant"
OPENAI_GPT_OSS_120B = "openai/gpt-oss-120b"
@register_llm
class GroqLLMService(BaseLLMConfiguration):
provider: Literal[ServiceProviders.GROQ] = ServiceProviders.GROQ
model: GroqModel = GroqModel.LLAMA_3_3_70B
model: str = Field(default="llama-3.3-70b-versatile", json_schema_extra={"examples": GROQ_MODELS})
api_key: str
class AzureModel(str, Enum):
GPT4_1_MINI = "gpt-4.1-mini"
@register_llm
class AzureLLMService(BaseLLMConfiguration):
provider: Literal[ServiceProviders.AZURE] = ServiceProviders.AZURE
model: AzureModel = AzureModel.GPT4_1_MINI
model: str = Field(default="gpt-4.1-mini", json_schema_extra={"examples": AZURE_MODELS})
api_key: str
endpoint: str
# Dograh LLM Service
class DograhLLMModel(str, Enum):
DEFAULT = "default"
ACCURATE = "accurate"
FAST = "fast"
LITE = "lite"
ZEN = "zen"
ZEN_LITE = "zen_lite"
@register_llm
class DograhLLMService(BaseLLMConfiguration):
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
model: DograhLLMModel = DograhLLMModel.DEFAULT
model: str = Field(default="default", json_schema_extra={"examples": DOGRAH_LLM_MODELS})
api_key: str
@ -185,15 +161,10 @@ LLMConfig = Annotated[
###################################################### TTS ########################################################################
class DeepgramVoice(str, Enum):
HELENA = "aura-2-helena-en"
THALIA = "aura-2-thalia-en"
@register_tts
class DeepgramTTSConfiguration(BaseServiceConfiguration):
provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
voice: DeepgramVoice = DeepgramVoice.HELENA
voice: str = "aura-2-helena-en"
api_key: str
@computed_field
@ -210,42 +181,6 @@ class DeepgramTTSConfiguration(BaseServiceConfiguration):
return "aura-2"
class ElevenlabsVoice(str, Enum):
ALEXANDRA = "Alexandra - 3dzJXoCYueSQiptQ6euE"
AMY = "Amy - oGn4Ha2pe2vSJkmIJgLQ"
ANGELA = "Angela - FUfBrNit0NNZAwb58KWH"
ARIA = "Aria - 9BWtsMINqrJLrRacOk9x"
CHELSEA = "Chelsea - NHRgOEwqx5WZNClv5sat"
CHRISTINA = "Christina - X03mvPuTfprif8QBAVeJ"
CLARA = "Clara - ZIlrSGI4jZqobxRKprJz"
CLYDE = "Clyde - 2EiwWnXFnvU5JabPnv8n"
DAVE = "Dave - CYw3kZ02Hs0563khs1Fj"
DOMI = "Domi - AZnzlk1XvdvUeBnXmlld"
DREW = "Drew - 29vD33N1CtxCmqQRPOHJ"
ELENA = "Elena_German - iFJwt4O7E3aafIpJFfcu"
EVE = "Eve - BZgkqPqms7Kj9ulSkVzn"
FIN = "Fin - D38z5RcWu1voky8WS1ja"
HOPE_BESTIE = "Hope_Bestie - uYXf8XasLslADfZ2MB4u"
HOPE_NATURAL = "Hope_Natural - OYTbf65OHHFELVut7v2H"
JARNATHAN = "Jarnathan - c6SfcYrb2t09NHXiT80T"
JENNA = "Jenna - C2BkQxlGNzBn7WD2bqfR"
JESSICA = "Jessica - cgSgspJ2msm6clMCkdW9"
JOHANNA = "Johanna_German - YYDsZT3K2y6tv7X1aj6N"
JUNIPER = "Juniper - aMSt68OGf4xUZAnLpTU8"
LAUREN = "Lauren - 3liN8q8YoeB9Hk6AboKe"
LINA = "Lina - oWjuL7HSoaEJRMDMP3HD"
MONIKA = "Monika_Hindi_8 - 2bNrEsM0omyhLiEyOwqY"
NEHA = "Neha_Hindi - QTKSa2Iyv0yoxvXY2V8a"
OLIVIA = "Olivia - 1rviaVF7GGGkTU36HNpz"
PAUL = "Paul - 5Q0t7uMcjvnagumLfvZi"
RACHEL = "Rachel - 21m00Tcm4TlvDq8ikWAM"
ROGER = "Roger - CwhRBWXzGAHq8TQ4Fs17"
SAMI_REAL = "Sami_Real - O4cGUVdAocn0z4EpQ9yF"
SARAH = "Sarah - EXAVITQu4vr4xnSDxMaL"
SIA = "Sia_Hindi_10 - ryIIztHPLYSJ74ueXxnO"
ZARA = "Zara - MmQVkVZnQ0dUbfWzcW6f"
class ElevenlabsModel(str, Enum):
FLASH_2 = "eleven_flash_v2_5"
@ -253,16 +188,12 @@ class ElevenlabsModel(str, Enum):
@register_tts
class ElevenlabsTTSConfiguration(BaseServiceConfiguration):
provider: Literal[ServiceProviders.ELEVENLABS] = ServiceProviders.ELEVENLABS
voice: ElevenlabsVoice = ElevenlabsVoice.RACHEL
voice: str = "21m00Tcm4TlvDq8ikWAM" # Rachel voice ID
speed: float = Field(default=1.0, ge=0.1, le=2.0, description="Speed of the voice")
model: ElevenlabsModel = ElevenlabsModel.FLASH_2
api_key: str
class OpenAIVoice(str, Enum):
ALLY = "alloy"
class OpenAITTSModel(str, Enum):
GPT_4o_MINI = "gpt-4o-mini-tts"
@ -271,29 +202,10 @@ class OpenAITTSModel(str, Enum):
class OpenAITTSService(BaseTTSConfiguration):
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
model: OpenAITTSModel = OpenAITTSModel.GPT_4o_MINI
voice: OpenAIVoice = OpenAIVoice.ALLY
voice: str = "alloy"
api_key: str
# class NeuphonicVoice(str, Enum):
# EMILY = "Emily - fc854436-2dac-4d21-aa69-ae17b54e98eb"
# @register_tts
# class NeuphonicTTSService(BaseTTSConfiguration):
# provider: Literal[ServiceProviders.NEUPHONIC] = ServiceProviders.NEUPHONIC
# voice: NeuphonicVoice = NeuphonicVoice.EMILY
# model: str = "NA"
# api_key: str
# Dograh TTS Service
class DograhVoice(str, Enum):
DEFAULT = "default"
JOEY = "joey"
RACHEL = "rachel"
class DograhTTSModel(str, Enum):
DEFAULT = "default"
@ -302,16 +214,58 @@ class DograhTTSModel(str, Enum):
class DograhTTSService(BaseTTSConfiguration):
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
model: DograhTTSModel = DograhTTSModel.DEFAULT
voice: DograhVoice = DograhVoice.DEFAULT
voice: str = "default"
api_key: str
class SarvamTTSModel(str, Enum):
"""Model identifiers for Sarvam text-to-speech."""
BULBUL_V2 = "bulbul:v2"
BULBUL_V3 = "bulbul:v3"
class SarvamVoice(str, Enum):
"""Voice identifiers for Sarvam text-to-speech, grouped by gender."""
# Female voices
ANUSHKA = "anushka"
MANISHA = "manisha"
VIDYA = "vidya"
ARYA = "arya"
# Male voices
ABHILASH = "abhilash"
KARUN = "karun"
HITESH = "hitesh"
class SarvamLanguage(str, Enum):
"""Indian-locale language codes (all suffixed ``-IN``) for Sarvam services."""
BENGALI = "bn-IN"
ENGLISH_INDIA = "en-IN"
GUJARATI = "gu-IN"
HINDI = "hi-IN"
KANNADA = "kn-IN"
MALAYALAM = "ml-IN"
MARATHI = "mr-IN"
ODIA = "od-IN"
PUNJABI = "pa-IN"
TAMIL = "ta-IN"
TELUGU = "te-IN"
ASSAMESE = "as-IN"
# @register_tts
# class SarvamTTSConfiguration(BaseTTSConfiguration):
# provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
# model: SarvamTTSModel = SarvamTTSModel.BULBUL_V2
# voice: SarvamVoice = SarvamVoice.ANUSHKA
# language: SarvamLanguage = SarvamLanguage.HINDI
# api_key: str
TTSConfig = Annotated[
Union[
DeepgramTTSConfiguration,
OpenAITTSService,
ElevenlabsTTSConfiguration,
DograhTTSService,
# SarvamTTSConfiguration,
],
Field(discriminator="provider"),
]
@ -323,10 +277,45 @@ class DeepgramSTTModel(str, Enum):
NOVA_3_GENERAL = "nova-3-general"
class DeepgramLanguage(str, Enum):
"""Language codes selectable for the Deepgram transcriber.

``MULTI`` is the default used by ``DeepgramSTTConfiguration``.
"""
MULTI = "multi"
ENGLISH = "en"
ENGLISH_US = "en-US"
ENGLISH_GB = "en-GB"
ENGLISH_AU = "en-AU"
ENGLISH_IN = "en-IN"
SPANISH = "es"
SPANISH_LATAM = "es-419"
FRENCH = "fr"
FRENCH_CA = "fr-CA"
GERMAN = "de"
ITALIAN = "it"
PORTUGUESE = "pt"
PORTUGUESE_BR = "pt-BR"
DUTCH = "nl"
HINDI = "hi"
JAPANESE = "ja"
KOREAN = "ko"
CHINESE_SIMPLIFIED = "zh-CN"
CHINESE_TRADITIONAL = "zh-TW"
RUSSIAN = "ru"
POLISH = "pl"
TURKISH = "tr"
UKRAINIAN = "uk"
VIETNAMESE = "vi"
SWEDISH = "sv"
DANISH = "da"
NORWEGIAN = "no"
FINNISH = "fi"
INDONESIAN = "id"
THAI = "th"
@register_stt
class DeepgramSTTConfiguration(BaseSTTConfiguration):
provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
model: DeepgramSTTModel = DeepgramSTTModel.NOVA_3_GENERAL
language: DeepgramLanguage = DeepgramLanguage.MULTI
api_key: str
@ -359,8 +348,27 @@ class DograhSTTService(BaseSTTConfiguration):
api_key: str
# Sarvam STT Service
class SarvamSTTModel(str, Enum):
"""Model identifiers for Sarvam speech-to-text."""
SAARIKA_V2_5 = "saarika:v2.5"
SAARAS_V2 = "saaras:v2" # STT-Translate model (auto-detects language)
# @register_stt
# class SarvamSTTConfiguration(BaseSTTConfiguration):
# provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
# model: SarvamSTTModel = SarvamSTTModel.SAARIKA_V2_5
# language: SarvamLanguage = SarvamLanguage.HINDI
# api_key: str
STTConfig = Annotated[
Union[DeepgramSTTConfiguration, OpenAISTTConfiguration, DograhSTTService],
Union[
DeepgramSTTConfiguration,
OpenAISTTConfiguration,
DograhSTTService,
# SarvamSTTConfiguration,
],
Field(discriminator="provider"),
]

View file

@ -285,6 +285,44 @@ class MPSServiceKeyClient:
response=response,
)
async def get_voices(
    self,
    provider: str,
    organization_id: Optional[int] = None,
    created_by: Optional[str] = None,
) -> dict:
    """
    Get available voices for a TTS provider from MPS.

    Args:
        provider: TTS provider name (e.g. elevenlabs, deepgram, sarvam,
            cartesia, dograh)
        organization_id: Organization ID (for authenticated mode)
        created_by: User provider ID (for OSS mode)

    Returns:
        Dictionary containing provider name and list of voices

    Raises:
        httpx.HTTPStatusError: If MPS responds with a non-200 status code
    """
    async with httpx.AsyncClient(timeout=self.timeout) as client:
        response = await client.get(
            f"{self.base_url}/api/v1/voice-proxy/{provider}/voices",
            headers=self._get_headers(organization_id, created_by),
        )

        # Guard clause: anything other than 200 is logged and surfaced to
        # the caller as an HTTPStatusError carrying the upstream response.
        if response.status_code != 200:
            logger.error(
                f"Failed to get voices for {provider}: {response.status_code} - {response.text}"
            )
            raise httpx.HTTPStatusError(
                f"Failed to get voices: {response.text}",
                request=response.request,
                response=response,
            )

        return response.json()
async def call_workflow_api(
self,
call_type: str,

View file

@ -17,6 +17,9 @@ from pipecat.services.groq.llm import GroqLLMService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.services.openai.stt import OpenAISTTService
from pipecat.services.openai.tts import OpenAITTSService
from pipecat.services.sarvam.stt import SarvamSTTService
from pipecat.services.sarvam.tts import SarvamTTSService
from pipecat.transcriptions.language import Language
from pipecat.utils.text.xml_function_tag_filter import XMLFunctionTagFilter
if TYPE_CHECKING:
@ -26,8 +29,13 @@ if TYPE_CHECKING:
def create_stt_service(user_config):
"""Create and return appropriate STT service based on user configuration"""
if user_config.stt.provider == ServiceProviders.DEEPGRAM.value:
# Use language from user config, defaulting to "multi" for multilingual support
language = getattr(user_config.stt, "language", None)
language_value = (
language.value if hasattr(language, "value") else (language or "multi")
)
live_options = LiveOptions(
language="multi", profanity_filter=False, endpointing=100
language=language_value, profanity_filter=False, endpointing=100
)
return DeepgramSTTService(
live_options=live_options,
@ -53,6 +61,32 @@ def create_stt_service(user_config):
model=user_config.stt.model.value,
audio_passthrough=False, # Disable passthrough since audio is buffered separately
)
elif user_config.stt.provider == ServiceProviders.SARVAM.value:
# Map Sarvam language code to pipecat Language enum
language_mapping = {
"bn-IN": Language.BN_IN,
"gu-IN": Language.GU_IN,
"hi-IN": Language.HI_IN,
"kn-IN": Language.KN_IN,
"ml-IN": Language.ML_IN,
"mr-IN": Language.MR_IN,
"ta-IN": Language.TA_IN,
"te-IN": Language.TE_IN,
"pa-IN": Language.PA_IN,
"od-IN": Language.OR_IN,
"en-IN": Language.EN_IN,
"as-IN": Language.AS_IN,
}
language = getattr(user_config.stt, "language", None)
language_value = language.value if hasattr(language, "value") else language
pipecat_language = language_mapping.get(language_value, Language.HI_IN)
return SarvamSTTService(
api_key=user_config.stt.api_key,
model=user_config.stt.model.value,
params=SarvamSTTService.InputParams(language=pipecat_language),
audio_passthrough=False,
)
else:
raise HTTPException(
status_code=400, detail=f"Invalid STT provider {user_config.stt.provider}"
@ -81,7 +115,12 @@ def create_tts_service(user_config, audio_config: "AudioConfig"):
text_filters=[xml_function_tag_filter],
)
elif user_config.tts.provider == ServiceProviders.ELEVENLABS.value:
voice_id = user_config.tts.voice.split(" - ")[1]
# Backward compatible with older configuration "Name - voice_id"
try:
voice_id = user_config.tts.voice.split(" - ")[1]
except IndexError:
voice_id = user_config.tts.voice
return ElevenLabsTTSService(
reconnect_on_error=False,
api_key=user_config.tts.api_key,
@ -103,6 +142,35 @@ def create_tts_service(user_config, audio_config: "AudioConfig"):
voice=user_config.tts.voice.value,
text_filters=[xml_function_tag_filter],
)
elif user_config.tts.provider == ServiceProviders.SARVAM.value:
# Map Sarvam language code to pipecat Language enum for TTS
language_mapping = {
"bn-IN": Language.BN,
"en-IN": Language.EN,
"gu-IN": Language.GU,
"hi-IN": Language.HI,
"kn-IN": Language.KN,
"ml-IN": Language.ML,
"mr-IN": Language.MR,
"od-IN": Language.OR,
"pa-IN": Language.PA,
"ta-IN": Language.TA,
"te-IN": Language.TE,
}
language = getattr(user_config.tts, "language", None)
language_value = language.value if hasattr(language, "value") else language
pipecat_language = language_mapping.get(language_value, Language.HI)
voice = getattr(user_config.tts, "voice", None)
voice_value = voice.value if hasattr(voice, "value") else (voice or "anushka")
return SarvamTTSService(
api_key=user_config.tts.api_key,
model=user_config.tts.model.value,
voice_id=voice_value,
params=SarvamTTSService.InputParams(language=pipecat_language),
text_filters=[xml_function_tag_filter],
)
else:
raise HTTPException(
status_code=400, detail=f"Invalid TTS provider {user_config.tts.provider}"

View file

@ -5,7 +5,6 @@ from api.schemas.user_configuration import UserConfiguration
from api.services.configuration.masking import is_mask_of, mask_key, mask_user_config
from api.services.configuration.merge import merge_user_configurations
from api.services.configuration.registry import (
GroqModel,
OpenAILLMService,
)
@ -70,7 +69,7 @@ def test_merge_drops_old_key_when_provider_changes():
incoming_partial = {
"llm": {
"provider": "groq",
"model": GroqModel.LLAMA_3_3_70B,
"model": "llama-3.3-70b-versatile",
# api_key intentionally absent should NOT inherit old key
}
}

View file

@ -1,18 +1,17 @@
---
title: "Inference Provider"
description: "Dograh ships with its own inferencing engine, which is hosted at https://services.dograh.com. The inference service provides LLM, TTS and STT services. In this document you can see how you can configure the inferencing engine to your favourite provider, like OpenAI, Gemini etc."
title: "Model Configurations"
description: "Voice Agents need AI Models to work, like LLM (Large Language Model), TTS (Voice) and STT (Transcriber). You can use any of your favourite providers with the Dograh Platform to run your Voice Agent."
---
## Configure Inference Provider
## Configure Models
Dograh Platform ships with its own models by default. When you sign up on https://app.dograh.com or set up the platform on your self-hosted infrastructure, you get some Dograh model credits by default.
You can go to `https://app.dograh.com/model-configurations` if you are on hosted version of Dograh or go to `http://localhost:3010/model-configurations` if you are running Dograh locally.
If you wish to change the models to a provider of your own choice, you can go to `https://app.dograh.com/model-configurations` if you are on the hosted version of Dograh, or to `http://localhost:3010/model-configurations` if you are running Dograh locally.
You can see the configuration for the inference provider in the following screenshot.
![Model Configuration](../images/service-configuration.png)
You can select the provider from the dropdown and configure the API key, model, etc. You can see [API Keys](api-keys) documentation for instructions on how to create Service Keys to be used in Model Configuration.
You can select the provider from the dropdown and configure the API key, model, etc. For Dograh, you can see [Service Keys](api-keys) documentation for instructions on how to create Service Keys to be used in Model Configuration.
## Next Steps
You can see how to configure the telephony provider in [Telephony Integrations](/telephony/twilio).
Next, there is some in-depth documentation on the various AI Models that you can configure.

View file

@ -0,0 +1,12 @@
---
title: "LLM"
description: "Voice Agents use LLM (Large Language Models), which are trained to understand the conversational context, and respond to users."
---
You can currently use OpenAI, Google, Groq, Azure and Dograh LLMs in LLM configuration. There are some models provided by default for you to choose from the drop down.
![Select Models from DropDown](../images/models_dropdown.png)
If you don't find a model in the drop down, you can always add a model manually.
![Add Model Manually](../images/add_model_manually.png)

View file

@ -0,0 +1,8 @@
---
title: "Transcriber"
description: "Voice Agents use STT (Speech to Text), to transcribe what the user speaks. This transcribed speech as text goes into an LLM to generate the response that gets played out to the user."
---
The Dograh platform ships with Deepgram, Cartesia, OpenAI and Dograh transcribers by default. Refer to each provider's documentation to decide which language to select for your language requirements.
Example: Deepgram has their language support documentation at https://developers.deepgram.com/docs/models-languages-overview#nova-3

View file

@ -0,0 +1,10 @@
---
title: "Voice"
description: "Voice Agents use TTS (Text to Speech) to turn the text that LLMs generate during the course of a conversation into audio. This is the audio that the end user having the conversation listens to."
---
The Dograh platform ships with Elevenlabs, Deepgram, OpenAI and Dograh TTS engines by default. There are some voices from the providers that we ship by default. You can refer to the provider's API documentation to select a voice ID that's most relevant for your language requirement.
If you don't find your favourite voice, you can always add the voice ID manually.
![Add Voice Manually](../images/add_tts_manually.png)

View file

@ -30,6 +30,9 @@
"group": "Configurations",
"pages": [
"configurations/inference-providers",
"configurations/llm",
"configurations/voice",
"configurations/transcriber",
"configurations/api-keys"
]
},

Binary file not shown.

After

Width:  |  Height:  |  Size: 105 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 210 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 110 KiB

View file

@ -460,7 +460,7 @@ export default function APIKeysPage() {
{showServiceArchived ? 'Hide' : 'Show'} Archived
</Button>
)}
{canCreateServiceKey && (
{canCreateServiceKey ? (
<Button
onClick={() => setIsCreateServiceDialogOpen(true)}
size="sm"
@ -468,6 +468,10 @@ export default function APIKeysPage() {
<Plus className="w-4 h-4 mr-2" />
Create Service Key
</Button>
) : (
<span className="text">
To generate additional service keys, <a href="https://app.dograh.com" target="_blank" rel="noopener noreferrer" className="text-primary hover:underline">Sign up on app.dograh.com</a>
</span>
)}
</div>
</div>

File diff suppressed because one or more lines are too long

View file

@ -650,6 +650,21 @@ export type VobizConfigurationResponse = {
from_numbers: Array<string>;
};
export type VoiceInfo = {
voice_id: string;
name: string;
description?: string | null;
accent?: string | null;
gender?: string | null;
language?: string | null;
preview_url?: string | null;
};
export type VoicesResponse = {
provider: string;
voices: Array<VoiceInfo>;
};
/**
* Request schema for Vonage configuration.
*/
@ -1828,6 +1843,40 @@ export type ReactivateApiKeyApiV1UserApiKeysApiKeyIdReactivatePutResponses = {
export type ReactivateApiKeyApiV1UserApiKeysApiKeyIdReactivatePutResponse = ReactivateApiKeyApiV1UserApiKeysApiKeyIdReactivatePutResponses[keyof ReactivateApiKeyApiV1UserApiKeysApiKeyIdReactivatePutResponses];
export type GetVoicesApiV1UserConfigurationsVoicesProviderGetData = {
body?: never;
headers?: {
authorization?: string | null;
};
path: {
provider: 'elevenlabs' | 'deepgram' | 'sarvam' | 'cartesia' | 'dograh';
};
query?: never;
url: '/api/v1/user/configurations/voices/{provider}';
};
export type GetVoicesApiV1UserConfigurationsVoicesProviderGetErrors = {
/**
* Not found
*/
404: unknown;
/**
* Validation Error
*/
422: HttpValidationError;
};
export type GetVoicesApiV1UserConfigurationsVoicesProviderGetError = GetVoicesApiV1UserConfigurationsVoicesProviderGetErrors[keyof GetVoicesApiV1UserConfigurationsVoicesProviderGetErrors];
export type GetVoicesApiV1UserConfigurationsVoicesProviderGetResponses = {
/**
* Successful Response
*/
200: VoicesResponse;
};
export type GetVoicesApiV1UserConfigurationsVoicesProviderGetResponse = GetVoicesApiV1UserConfigurationsVoicesProviderGetResponses[keyof GetVoicesApiV1UserConfigurationsVoicesProviderGetResponses];
export type CreateCampaignApiV1CampaignCreatePostData = {
body: CreateCampaignRequest;
headers?: {

View file

@ -6,10 +6,12 @@ import { useForm } from "react-hook-form";
import { getDefaultConfigurationsApiV1UserConfigurationsDefaultsGet } from '@/client/sdk.gen';
import { Button } from "@/components/ui/button";
import { Card, CardContent } from "@/components/ui/card";
import { Checkbox } from "@/components/ui/checkbox";
import { Input } from "@/components/ui/input";
import { Label } from "@/components/ui/label";
import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select";
import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
import { VoiceSelector } from "@/components/VoiceSelector";
import { useUserConfig } from "@/context/UserConfigContext";
type ServiceSegment = "llm" | "tts" | "stt";
@ -18,6 +20,7 @@ interface SchemaProperty {
type?: string;
default?: string | number | boolean;
enum?: string[];
examples?: string[];
$ref?: string;
description?: string;
format?: string;
@ -40,6 +43,65 @@ const TAB_CONFIG: { key: ServiceSegment; label: string }[] = [
{ key: "stt", label: "Transcriber" },
];
// Display names for language codes (Deepgram + Sarvam)
const LANGUAGE_DISPLAY_NAMES: Record<string, string> = {
// Deepgram languages
"multi": "Multilingual (Auto-detect)",
"en": "English",
"en-US": "English (US)",
"en-GB": "English (UK)",
"en-AU": "English (Australia)",
"en-IN": "English (India)",
"es": "Spanish",
"es-419": "Spanish (Latin America)",
"fr": "French",
"fr-CA": "French (Canada)",
"de": "German",
"it": "Italian",
"pt": "Portuguese",
"pt-BR": "Portuguese (Brazil)",
"nl": "Dutch",
"hi": "Hindi",
"ja": "Japanese",
"ko": "Korean",
"zh-CN": "Chinese (Simplified)",
"zh-TW": "Chinese (Traditional)",
"ru": "Russian",
"pl": "Polish",
"tr": "Turkish",
"uk": "Ukrainian",
"vi": "Vietnamese",
"sv": "Swedish",
"da": "Danish",
"no": "Norwegian",
"fi": "Finnish",
"id": "Indonesian",
"th": "Thai",
// Sarvam Indian languages
"bn-IN": "Bengali",
"gu-IN": "Gujarati",
"hi-IN": "Hindi",
"kn-IN": "Kannada",
"ml-IN": "Malayalam",
"mr-IN": "Marathi",
"od-IN": "Odia",
"pa-IN": "Punjabi",
"ta-IN": "Tamil",
"te-IN": "Telugu",
"as-IN": "Assamese",
};
// Display names for Sarvam voices
const VOICE_DISPLAY_NAMES: Record<string, string> = {
"anushka": "Anushka (Female)",
"manisha": "Manisha (Female)",
"vidya": "Vidya (Female)",
"arya": "Arya (Female)",
"abhilash": "Abhilash (Male)",
"karun": "Karun (Male)",
"hitesh": "Hitesh (Male)",
};
export default function ServiceConfiguration() {
const [apiError, setApiError] = useState<string | null>(null);
const [isSaving, setIsSaving] = useState(false);
@ -54,6 +116,8 @@ export default function ServiceConfiguration() {
tts: "",
stt: ""
});
const [isManualModelInput, setIsManualModelInput] = useState(false);
const [hasCheckedManualMode, setHasCheckedManualMode] = useState(false);
const {
register,
@ -119,6 +183,29 @@ export default function ServiceConfiguration() {
fetchConfigurations();
}, [reset, userConfig]);
// Check if the saved LLM model is not in the suggested options (custom model)
useEffect(() => {
if (hasCheckedManualMode) return;
const currentProvider = serviceProviders.llm;
const providerSchema = schemas?.llm?.[currentProvider];
if (!providerSchema) return;
const modelSchema = providerSchema.properties.model;
const actualModelSchema = modelSchema?.$ref && providerSchema.$defs
? providerSchema.$defs[modelSchema.$ref.split('/').pop() || '']
: modelSchema;
if (actualModelSchema?.examples && userConfig?.llm?.model) {
const savedModel = userConfig.llm.model as string;
const isInOptions = actualModelSchema.examples.includes(savedModel);
if (!isInOptions) {
setIsManualModelInput(true);
}
setHasCheckedManualMode(true);
}
}, [schemas, serviceProviders.llm, userConfig?.llm?.model, hasCheckedManualMode]);
const handleProviderChange = (service: ServiceSegment, providerName: string) => {
if (!providerName) {
return;
@ -147,6 +234,11 @@ export default function ServiceConfiguration() {
preservedValues[`${service}_provider`] = providerName;
reset(preservedValues);
setServiceProviders(prev => ({ ...prev, [service]: providerName }));
// Reset manual model input when LLM provider changes
if (service === "llm") {
setIsManualModelInput(false);
}
}
@ -266,7 +358,7 @@ export default function ServiceConfiguration() {
<div className="space-y-2">
<Label>API Key</Label>
<Input
type="password"
type="text"
placeholder="Enter API key"
{...register(`${service}_api_key`, {
required: providerSchema.required?.includes("api_key"),
@ -291,7 +383,113 @@ export default function ServiceConfiguration() {
? providerSchema.$defs[schema.$ref.split('/').pop() || '']
: schema;
// Use VoiceSelector for voice field in TTS service (except Sarvam which uses enum)
if (service === "tts" && field === "voice") {
const currentProvider = serviceProviders.tts;
// Sarvam uses enum-based voice selection, not VoiceSelector
if (currentProvider !== "sarvam" && !actualSchema?.enum) {
return (
<VoiceSelector
provider={currentProvider}
value={watch(`${service}_${field}`) as string || ""}
onChange={(voiceId) => {
setValue(`${service}_${field}`, voiceId, { shouldDirty: true });
}}
/>
);
}
}
// Handle LLM model field with manual input toggle (uses examples from schema)
if (service === "llm" && field === "model" && actualSchema?.examples) {
const currentValue = watch(`${service}_${field}`) as string || "";
const modelOptions = actualSchema.examples;
if (isManualModelInput) {
return (
<div className="space-y-2">
<Input
type="text"
placeholder="Enter model name"
value={currentValue}
onChange={(e) => {
setValue(`${service}_${field}`, e.target.value, { shouldDirty: true });
}}
/>
<div className="flex items-center space-x-2">
<Checkbox
id="manual-model-input"
checked={isManualModelInput}
onCheckedChange={(checked) => {
setIsManualModelInput(checked as boolean);
if (!checked && modelOptions.length > 0) {
// Reset to first option when switching back
setValue(`${service}_${field}`, modelOptions[0], { shouldDirty: true });
}
}}
/>
<Label
htmlFor="manual-model-input"
className="text-sm font-normal cursor-pointer"
>
Add Model Manually
</Label>
</div>
</div>
);
}
return (
<div className="space-y-2">
<Select
value={currentValue}
onValueChange={(value) => {
if (!value) return;
setValue(`${service}_${field}`, value, { shouldDirty: true });
}}
>
<SelectTrigger className="w-full">
<SelectValue placeholder="Select model" />
</SelectTrigger>
<SelectContent>
{modelOptions.map((value: string) => (
<SelectItem key={value} value={value}>
{value}
</SelectItem>
))}
</SelectContent>
</Select>
<div className="flex items-center space-x-2">
<Checkbox
id="manual-model-input-dropdown"
checked={isManualModelInput}
onCheckedChange={(checked) => {
setIsManualModelInput(checked as boolean);
}}
/>
<Label
htmlFor="manual-model-input-dropdown"
className="text-sm font-normal cursor-pointer"
>
Add Model Manually
</Label>
</div>
</div>
);
}
if (actualSchema?.enum) {
// Use friendly display names for language and voice fields
const getDisplayName = (value: string) => {
if (field === "language") {
return LANGUAGE_DISPLAY_NAMES[value] || value;
}
if (field === "voice") {
return VOICE_DISPLAY_NAMES[value] || value;
}
return value;
};
return (
<Select
value={watch(`${service}_${field}`) as string || ""}
@ -308,7 +506,7 @@ export default function ServiceConfiguration() {
<SelectContent>
{actualSchema.enum.map((value: string) => (
<SelectItem key={value} value={value}>
{value}
{getDisplayName(value)}
</SelectItem>
))}
</SelectContent>

View file

@ -0,0 +1,384 @@
"use client";
import { ChevronDown, Loader2, Search, Volume2 } from "lucide-react";
import { useCallback, useEffect, useState } from "react";
import { getVoicesApiV1UserConfigurationsVoicesProviderGet } from "@/client/sdk.gen";
import { VoiceInfo } from "@/client/types.gen";
import { Button } from "@/components/ui/button";
import { Checkbox } from "@/components/ui/checkbox";
import { Input } from "@/components/ui/input";
import { Label } from "@/components/ui/label";
import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
import { useUserConfig } from "@/context/UserConfigContext";
import { cn } from "@/lib/utils";
/**
 * TTS providers that have MPS voice endpoints.
 *
 * The list is the single source of truth; the union type below is derived
 * from it so the two cannot drift apart.
 */
const MPS_VOICE_PROVIDERS = [
    "elevenlabs",
    "deepgram",
    "sarvam",
    "cartesia",
    "dograh",
] as const;

/** Union of the provider names listed in `MPS_VOICE_PROVIDERS`. */
type TTSProviderWithVoices = (typeof MPS_VOICE_PROVIDERS)[number];
/** Props accepted by the `VoiceSelector` component. */
interface VoiceSelectorProps {
    /** Name of the currently selected TTS provider (matched case-insensitively). */
    provider: string;
    /** Currently selected voice id; an empty string means nothing is selected. */
    value: string;
    /** Called with the new voice id whenever the user picks or types one. */
    onChange: (voiceId: string) => void;
    /** Optional extra class names merged onto the component's root element. */
    className?: string;
}
/**
 * Voice picker for a TTS provider.
 *
 * Rendering modes:
 *  - Provider not in `MPS_VOICE_PROVIDERS`: a plain text input for the voice id.
 *  - Manual mode (toggled by the user, or entered automatically when the
 *    current `value` is not in the fetched list): a text input plus a checkbox
 *    to switch back to the list.
 *  - Otherwise: a searchable popover listing the fetched voices, with an
 *    optional audio preview per voice.
 *
 * @param provider  Name of the selected TTS provider (case-insensitive).
 * @param value     Currently selected voice id.
 * @param onChange  Invoked with the new voice id on selection or manual edit.
 * @param className Optional class names merged onto the root element.
 */
export const VoiceSelector: React.FC<VoiceSelectorProps> = ({
    provider,
    value,
    onChange,
    className,
}) => {
    const { accessToken } = useUserConfig();
    const [isOpen, setIsOpen] = useState(false);
    const [searchTerm, setSearchTerm] = useState("");
    const [isManualInput, setIsManualInput] = useState(false);
    const [manualVoiceId, setManualVoiceId] = useState(value || "");
    const [voices, setVoices] = useState<VoiceInfo[]>([]);
    // Provider that `voices` was fetched for. Lets us avoid comparing the
    // current value against a list that belongs to a previously selected provider.
    const [voicesProvider, setVoicesProvider] = useState<string | null>(null);
    const [isLoading, setIsLoading] = useState(false);
    const [error, setError] = useState<string | null>(null);
    const [playingPreview, setPlayingPreview] = useState<string | null>(null);
    const [currentAudio, setCurrentAudio] = useState<HTMLAudioElement | null>(null);

    // Check if provider has MPS voice endpoint
    const hasMPSVoiceEndpoint = useCallback((providerName: string): boolean => {
        return MPS_VOICE_PROVIDERS.includes(providerName.toLowerCase() as TTSProviderWithVoices);
    }, []);

    // Map provider names to API-compatible provider names
    const getProviderKey = useCallback((providerName: string): TTSProviderWithVoices | null => {
        const providerMap: Record<string, TTSProviderWithVoices> = {
            elevenlabs: "elevenlabs",
            deepgram: "deepgram",
            sarvam: "sarvam",
            cartesia: "cartesia",
            dograh: "dograh",
        };
        return providerMap[providerName.toLowerCase()] || null;
    }, []);

    /**
     * Load the voice list for the current provider.
     *
     * @param isStale Returns true once this request has been superseded (the
     *                provider or token changed, or the component unmounted).
     *                Results of a stale request are discarded so they cannot
     *                overwrite the state of a newer one.
     */
    const fetchVoices = useCallback(async (isStale: () => boolean = () => false) => {
        const providerKey = getProviderKey(provider);
        if (!providerKey || !accessToken) {
            // Nothing to fetch: also clear any loading/error state left behind
            // by a request that was superseded before it finished.
            setVoices([]);
            setVoicesProvider(null);
            setError(null);
            setIsLoading(false);
            return;
        }

        setIsLoading(true);
        setError(null);
        // Drop the previous list so voices of another provider are never shown
        // (or matched against) while the new list is loading.
        setVoices([]);
        setVoicesProvider(null);

        try {
            const response = await getVoicesApiV1UserConfigurationsVoicesProviderGet({
                path: { provider: providerKey },
                headers: {
                    Authorization: `Bearer ${accessToken}`,
                },
            });

            if (isStale()) {
                return;
            }

            const fetchedVoices = response.data?.voices;
            if (fetchedVoices) {
                setVoices(fetchedVoices);
                setVoicesProvider(provider);
            } else {
                // The call resolved without a voice list (for example an error
                // response that the client did not throw for): surface it
                // instead of silently showing an empty or outdated list.
                setError("Failed to load voices");
            }
        } catch (err) {
            if (isStale()) {
                return;
            }
            console.error("Failed to fetch voices:", err);
            setError("Failed to load voices");
            setVoices([]);
        } finally {
            // A superseded request must not clear the spinner of its successor.
            if (!isStale()) {
                setIsLoading(false);
            }
        }
    }, [provider, getProviderKey, accessToken]);

    useEffect(() => {
        let stale = false;
        if (provider) {
            fetchVoices(() => stale);
        }
        // Mark the in-flight request as stale when inputs change or on unmount.
        return () => {
            stale = true;
        };
    }, [provider, fetchVoices]);

    // Check if the current value exists in the voices list
    useEffect(() => {
        // Only judge the value against a list fetched for the current provider.
        if (voicesProvider !== provider) {
            return;
        }
        if (value && voices.length > 0) {
            const voiceExists = voices.some((v) => v.voice_id === value);
            if (!voiceExists) {
                // If the value doesn't exist in the list, switch to manual input mode
                setIsManualInput(true);
                setManualVoiceId(value);
            }
        }
    }, [value, voices, voicesProvider, provider]);

    // Stop any preview audio when the popover closes
    useEffect(() => {
        if (!isOpen && currentAudio) {
            currentAudio.pause();
            currentAudio.currentTime = 0;
            setCurrentAudio(null);
            setPlayingPreview(null);
        }
    }, [isOpen, currentAudio]);

    // Pause the audio element when it is replaced or the component unmounts
    useEffect(() => {
        return () => {
            if (currentAudio) {
                currentAudio.pause();
            }
        };
    }, [currentAudio]);

    // Case-insensitive match of the search term against every descriptive field.
    const filteredVoices = voices.filter((voice) => {
        const searchLower = searchTerm.toLowerCase();
        return (
            voice.name.toLowerCase().includes(searchLower) ||
            voice.voice_id.toLowerCase().includes(searchLower) ||
            (voice.description?.toLowerCase() || "").includes(searchLower) ||
            (voice.accent?.toLowerCase() || "").includes(searchLower) ||
            (voice.gender?.toLowerCase() || "").includes(searchLower) ||
            (voice.language?.toLowerCase() || "").includes(searchLower)
        );
    });

    const handleSelectVoice = (voiceId: string) => {
        onChange(voiceId);
        setIsOpen(false);
        setSearchTerm("");
    };

    const handleManualInputToggle = (checked: boolean) => {
        setIsManualInput(checked);
        if (checked) {
            setManualVoiceId(value || "");
        } else {
            // When switching back to dropdown, try to find the current value in voices
            const existingVoice = voices.find((v) => v.voice_id === value);
            if (!existingVoice && voices.length > 0) {
                // If current value not in list, select the first voice
                onChange(voices[0].voice_id);
            }
        }
    };

    const handleManualVoiceIdChange = (newValue: string) => {
        setManualVoiceId(newValue);
        onChange(newValue);
    };

    const getSelectedVoiceName = () => {
        if (isManualInput && value) {
            return value;
        }
        const voice = voices.find((v) => v.voice_id === value);
        return voice?.name || value || "Select a voice";
    };

    const playPreview = (previewUrl: string, voiceId: string) => {
        // Stop current audio if playing
        if (currentAudio) {
            currentAudio.pause();
            currentAudio.currentTime = 0;
            setCurrentAudio(null);
        }

        // If clicking the same voice that's playing, just stop it
        if (playingPreview === voiceId) {
            setPlayingPreview(null);
            return;
        }

        setPlayingPreview(voiceId);
        const audio = new Audio(previewUrl);
        setCurrentAudio(audio);

        // Playback finishing, failing to load, or being blocked all reset the UI.
        audio.onended = () => {
            setPlayingPreview(null);
            setCurrentAudio(null);
        };
        audio.onerror = () => {
            setPlayingPreview(null);
            setCurrentAudio(null);
        };
        audio.play().catch(() => {
            setPlayingPreview(null);
            setCurrentAudio(null);
        });
    };

    // For providers without MPS voice endpoint, show simple input
    if (!hasMPSVoiceEndpoint(provider)) {
        return (
            <div className={cn("space-y-2", className)}>
                <Input
                    type="text"
                    placeholder="Enter voice ID"
                    value={value || ""}
                    onChange={(e) => onChange(e.target.value)}
                />
            </div>
        );
    }

    if (isManualInput) {
        return (
            <div className={cn("space-y-2", className)}>
                <Input
                    type="text"
                    placeholder="Enter voice ID"
                    value={manualVoiceId}
                    onChange={(e) => handleManualVoiceIdChange(e.target.value)}
                />
                <div className="flex items-center space-x-2">
                    <Checkbox
                        id="manual-voice-input"
                        checked={isManualInput}
                        onCheckedChange={(checked) => handleManualInputToggle(checked as boolean)}
                    />
                    <Label
                        htmlFor="manual-voice-input"
                        className="text-sm font-normal cursor-pointer"
                    >
                        Add Voice ID Manually
                    </Label>
                </div>
            </div>
        );
    }

    return (
        <div className={cn("space-y-2", className)}>
            <Popover open={isOpen} onOpenChange={setIsOpen}>
                <PopoverTrigger asChild>
                    <Button
                        variant="outline"
                        role="combobox"
                        aria-expanded={isOpen}
                        className={cn(
                            "w-full justify-between",
                            !value && "text-muted-foreground"
                        )}
                        disabled={isLoading}
                    >
                        <span className="truncate">
                            {isLoading ? "Loading voices..." : getSelectedVoiceName()}
                        </span>
                        {isLoading ? (
                            <Loader2 className="ml-2 h-4 w-4 shrink-0 animate-spin" />
                        ) : (
                            <ChevronDown className="ml-2 h-4 w-4 shrink-0 opacity-50" />
                        )}
                    </Button>
                </PopoverTrigger>
                <PopoverContent className="w-[400px] p-0" align="start">
                    <div className="p-2 space-y-2">
                        <div className="relative">
                            <Search className="absolute left-2 top-2.5 h-4 w-4 text-muted-foreground" />
                            <Input
                                placeholder="Search voices..."
                                value={searchTerm}
                                onChange={(e) => setSearchTerm(e.target.value)}
                                className="pl-8"
                            />
                        </div>
                        <div className="max-h-[300px] overflow-auto space-y-1">
                            {error ? (
                                <p className="text-sm text-red-500 text-center py-4">
                                    {error}
                                </p>
                            ) : isLoading ? (
                                <div className="flex items-center justify-center py-4">
                                    <Loader2 className="h-6 w-6 animate-spin text-muted-foreground" />
                                </div>
                            ) : filteredVoices.length === 0 ? (
                                <p className="text-sm text-muted-foreground text-center py-4">
                                    No voices found
                                </p>
                            ) : (
                                filteredVoices.map((voice) => (
                                    <div
                                        key={voice.voice_id}
                                        className={cn(
                                            "flex items-start space-x-3 p-2 hover:bg-accent rounded-sm cursor-pointer",
                                            value === voice.voice_id && "bg-accent"
                                        )}
                                        onClick={() => handleSelectVoice(voice.voice_id)}
                                    >
                                        <div className="flex-1 min-w-0">
                                            <div className="flex items-center gap-2">
                                                <p className="text-sm font-medium truncate">
                                                    {voice.name}
                                                </p>
                                                {voice.gender && (
                                                    <span className="text-xs text-muted-foreground capitalize">
                                                        {voice.gender}
                                                    </span>
                                                )}
                                            </div>
                                            {voice.description && (
                                                <p className="text-xs text-muted-foreground line-clamp-2">
                                                    {voice.description}
                                                </p>
                                            )}
                                            <div className="flex items-center gap-2 mt-1">
                                                {voice.accent && (
                                                    <span className="text-xs bg-secondary px-1.5 py-0.5 rounded capitalize">
                                                        {voice.accent}
                                                    </span>
                                                )}
                                                {voice.language && (
                                                    <span className="text-xs bg-secondary px-1.5 py-0.5 rounded uppercase">
                                                        {voice.language}
                                                    </span>
                                                )}
                                            </div>
                                        </div>
                                        {voice.preview_url && (
                                            <Button
                                                type="button"
                                                variant="ghost"
                                                size="sm"
                                                className="h-8 w-8 p-0 shrink-0"
                                                aria-label={`Preview ${voice.name}`}
                                                onClick={(e) => {
                                                    // Keep the click from also selecting the row.
                                                    e.stopPropagation();
                                                    playPreview(voice.preview_url!, voice.voice_id);
                                                }}
                                            >
                                                <Volume2
                                                    className={cn(
                                                        "h-4 w-4",
                                                        playingPreview === voice.voice_id &&
                                                            "text-primary animate-pulse"
                                                    )}
                                                />
                                            </Button>
                                        )}
                                    </div>
                                ))
                            )}
                        </div>
                        <div className="pt-2 border-t flex items-center justify-between">
                            <div className="flex items-center space-x-2">
                                <Checkbox
                                    id="manual-voice-input-popup"
                                    checked={isManualInput}
                                    onCheckedChange={(checked) => {
                                        handleManualInputToggle(checked as boolean);
                                        if (checked) {
                                            setIsOpen(false);
                                        }
                                    }}
                                />
                                <Label
                                    htmlFor="manual-voice-input-popup"
                                    className="text-sm font-normal cursor-pointer"
                                >
                                    Add Voice ID Manually
                                </Label>
                            </div>
                            <p className="text-xs text-muted-foreground">
                                {voices.length} voices available
                            </p>
                        </div>
                    </div>
                </PopoverContent>
            </Popover>
        </div>
    );
};