feat: add voice selectors in elevenlabs (#88)

This commit is contained in:
Abhishek 2025-12-25 15:05:53 +05:30 committed by GitHub
parent 480e8a5f60
commit 45c5b7c304
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 978 additions and 166 deletions

View file

@ -11,10 +11,8 @@ from api.db.models import UserModel
from api.schemas.user_configuration import UserConfiguration
from api.services.auth.stack_auth import stackauth
from api.services.configuration.registry import (
DograhLLMModel,
DograhSTTModel,
DograhTTSModel,
DograhVoice,
ServiceProviders,
)
@ -244,13 +242,13 @@ async def create_user_configuration_with_mps_key(
"llm": {
"provider": ServiceProviders.DOGRAH.value,
"api_key": service_key,
"model": DograhLLMModel.DEFAULT.value, # Default model
"model": "default", # Default model
},
"tts": {
"provider": ServiceProviders.DOGRAH.value,
"api_key": service_key,
"model": DograhTTSModel.DEFAULT.value, # Default model
"voice": DograhVoice.DEFAULT.value, # Default voice
"voice": "default", # Default voice
},
"stt": {
"provider": ServiceProviders.DOGRAH.value,

View file

@ -38,6 +38,7 @@ class UserConfigurationValidator:
ServiceProviders.AZURE.value: self._check_azure_api_key,
ServiceProviders.CARTESIA.value: self._check_cartesia_api_key,
ServiceProviders.DOGRAH.value: self._check_dograh_api_key,
ServiceProviders.SARVAM.value: self._check_sarvam_api_key,
}
async def validate(self, configuration: UserConfiguration) -> APIKeyStatusResponse:
@ -134,20 +135,5 @@ class UserConfigurationValidator:
def _check_dograh_api_key(self, model: str, api_key: str) -> bool:
return True
# def _check_neuphonic_api_key(self, model: str, api_key: str) -> bool:
# if not Neuphonic:
# self._provider_api_key_validity_status[model] = False
# return self._provider_api_key_validity_status[model]
# if model in self._provider_api_key_validity_status:
# return self._provider_api_key_validity_status[model]
# client = Neuphonic(api_key=api_key)
# try:
# response = client.voices.list() # get's all available voices
# voices = response.data["voices"]
# self._provider_api_key_validity_status[model] = True
# except Exception:
# self._provider_api_key_validity_status[model] = False
# return self._provider_api_key_validity_status[model]
def _check_sarvam_api_key(self, model: str, api_key: str) -> bool:
return True

View file

@ -20,6 +20,7 @@ class ServiceProviders(str, Enum):
GOOGLE = "google"
AZURE = "azure"
DOGRAH = "dograh"
SARVAM = "sarvam"
class BaseServiceConfiguration(BaseModel):
@ -31,6 +32,7 @@ class BaseServiceConfiguration(BaseModel):
ServiceProviders.GOOGLE,
ServiceProviders.AZURE,
ServiceProviders.DOGRAH,
# ServiceProviders.SARVAM,
]
api_key: str
@ -92,82 +94,56 @@ def register_stt(cls: Type[BaseSTTConfiguration]):
###################################################### LLM ########################################################################
class OpenAIModel(str, Enum):
GPT3_5_TURBO = "gpt-3.5-turbo"
GPT4_1 = "gpt-4.1"
GPT4_1_MINI = "gpt-4.1-mini"
GPT4_1_NANO = "gpt-4.1-nano"
GPT5 = "gpt-5"
GPT5_MINI = "gpt-5-mini"
GPT5_NANO = "gpt-5-nano"
# Per-provider model suggestions (used for the UI dropdown). Each list is
# attached to the matching service's ``model`` field as JSON-schema examples.
OPENAI_MODELS = [
    "gpt-4.1",
    "gpt-4.1-mini",
    "gpt-4.1-nano",
    "gpt-5",
    "gpt-5-mini",
    "gpt-5-nano",
    "gpt-3.5-turbo",
]
GOOGLE_MODELS = [
    "gemini-2.0-flash",
    "gemini-2.0-flash-lite",
    "gemini-2.5-flash",
    "gemini-2.5-flash-lite",
]
GROQ_MODELS = [
    "llama-3.3-70b-versatile",
    "deepseek-r1-distill-llama-70b",
    "qwen-qwq-32b",
    "meta-llama/llama-4-scout-17b-16e-instruct",
    "meta-llama/llama-4-maverick-17b-128e-instruct",
    "gemma2-9b-it",
    "llama-3.1-8b-instant",
    "openai/gpt-oss-120b",
]
AZURE_MODELS = [
    "gpt-4.1-mini",
]
DOGRAH_LLM_MODELS = [
    "default",
    "accurate",
    "fast",
    "lite",
    "zen",
    "zen_lite",
]
@register_llm
class OpenAILLMService(BaseLLMConfiguration):
    """LLM configuration for the OpenAI provider.

    ``model`` is annotated as a plain string; ``OPENAI_MODELS`` is attached
    only as JSON-schema examples (suggestions), not as an enumeration.
    """

    provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
    # Single declaration: a second, enum-typed ``model`` annotation would be
    # silently overridden by this one, so only the effective field is kept.
    model: str = Field(default="gpt-4.1", json_schema_extra={"examples": OPENAI_MODELS})
    api_key: str
class GoogleModel(str, Enum):
GEMINI_2_0_FLASH = "gemini-2.0-flash"
GEMINI_2_0_FLASH_LITE = "gemini-2.0-flash-lite"
GEMINI_2_5_FLASH = "gemini-2.5-flash"
GEMINI_2_5_FLASH_LITE = "gemini-2.5-flash-lite"
@register_llm
class GoogleLLMService(BaseLLMConfiguration):
    """LLM configuration for the Google provider.

    ``model`` is annotated as a plain string; ``GOOGLE_MODELS`` is attached
    only as JSON-schema examples (suggestions), not as an enumeration.
    """

    provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE
    # Single declaration: a second, enum-typed ``model`` annotation would be
    # silently overridden by this one, so only the effective field is kept.
    model: str = Field(default="gemini-2.0-flash", json_schema_extra={"examples": GOOGLE_MODELS})
    api_key: str
class GroqModel(str, Enum):
LLAMA_3_3_70B = "llama-3.3-70b-versatile"
DEEPSEEK_R1_DISTILL_LLAMA_70B = "deepseek-r1-distill-llama-70b"
QUEN_QWQ_32B = "qwen-qwq-32b"
LLAMA_4_SCOUT_17B_16E_INSTRUCT = "meta-llama/llama-4-scout-17b-16e-instruct"
LLAMA_4_MAVERICK_17B_128E_INSTRUCT = "meta-llama/llama-4-maverick-17b-128e-instruct"
GEMMA2_9B_IT = "gemma2-9b-it"
LLAMA_3_1_8B_INSTANT = "llama-3.1-8b-instant"
OPENAI_GPT_OSS_120B = "openai/gpt-oss-120b"
@register_llm
class GroqLLMService(BaseLLMConfiguration):
    """LLM configuration for the Groq provider.

    ``model`` is annotated as a plain string; ``GROQ_MODELS`` is attached
    only as JSON-schema examples (suggestions), not as an enumeration.
    """

    provider: Literal[ServiceProviders.GROQ] = ServiceProviders.GROQ
    # Single declaration: a second, enum-typed ``model`` annotation would be
    # silently overridden by this one, so only the effective field is kept.
    model: str = Field(default="llama-3.3-70b-versatile", json_schema_extra={"examples": GROQ_MODELS})
    api_key: str
class AzureModel(str, Enum):
GPT4_1_MINI = "gpt-4.1-mini"
@register_llm
class AzureLLMService(BaseLLMConfiguration):
    """LLM configuration for the Azure provider.

    In addition to the key, an ``endpoint`` string is required. ``model`` is
    annotated as a plain string; ``AZURE_MODELS`` is attached only as
    JSON-schema examples (suggestions), not as an enumeration.
    """

    provider: Literal[ServiceProviders.AZURE] = ServiceProviders.AZURE
    # Single declaration: a second, enum-typed ``model`` annotation would be
    # silently overridden by this one, so only the effective field is kept.
    model: str = Field(default="gpt-4.1-mini", json_schema_extra={"examples": AZURE_MODELS})
    api_key: str
    endpoint: str
# Dograh LLM Service
class DograhLLMModel(str, Enum):
DEFAULT = "default"
ACCURATE = "accurate"
FAST = "fast"
LITE = "lite"
ZEN = "zen"
ZEN_LITE = "zen_lite"
@register_llm
class DograhLLMService(BaseLLMConfiguration):
    """LLM configuration for the Dograh provider.

    ``model`` is annotated as a plain string; ``DOGRAH_LLM_MODELS`` is
    attached only as JSON-schema examples (suggestions), not as an
    enumeration.
    """

    provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
    # Single declaration: a second, enum-typed ``model`` annotation would be
    # silently overridden by this one, so only the effective field is kept.
    model: str = Field(default="default", json_schema_extra={"examples": DOGRAH_LLM_MODELS})
    api_key: str
@ -185,15 +161,10 @@ LLMConfig = Annotated[
###################################################### TTS ########################################################################
class DeepgramVoice(str, Enum):
HELENA = "aura-2-helena-en"
THALIA = "aura-2-thalia-en"
@register_tts
class DeepgramTTSConfiguration(BaseServiceConfiguration):
provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
voice: DeepgramVoice = DeepgramVoice.HELENA
voice: str = "aura-2-helena-en"
api_key: str
@computed_field
@ -210,42 +181,6 @@ class DeepgramTTSConfiguration(BaseServiceConfiguration):
return "aura-2"
class ElevenlabsVoice(str, Enum):
ALEXANDRA = "Alexandra - 3dzJXoCYueSQiptQ6euE"
AMY = "Amy - oGn4Ha2pe2vSJkmIJgLQ"
ANGELA = "Angela - FUfBrNit0NNZAwb58KWH"
ARIA = "Aria - 9BWtsMINqrJLrRacOk9x"
CHELSEA = "Chelsea - NHRgOEwqx5WZNClv5sat"
CHRISTINA = "Christina - X03mvPuTfprif8QBAVeJ"
CLARA = "Clara - ZIlrSGI4jZqobxRKprJz"
CLYDE = "Clyde - 2EiwWnXFnvU5JabPnv8n"
DAVE = "Dave - CYw3kZ02Hs0563khs1Fj"
DOMI = "Domi - AZnzlk1XvdvUeBnXmlld"
DREW = "Drew - 29vD33N1CtxCmqQRPOHJ"
ELENA = "Elena_German - iFJwt4O7E3aafIpJFfcu"
EVE = "Eve - BZgkqPqms7Kj9ulSkVzn"
FIN = "Fin - D38z5RcWu1voky8WS1ja"
HOPE_BESTIE = "Hope_Bestie - uYXf8XasLslADfZ2MB4u"
HOPE_NATURAL = "Hope_Natural - OYTbf65OHHFELVut7v2H"
JARNATHAN = "Jarnathan - c6SfcYrb2t09NHXiT80T"
JENNA = "Jenna - C2BkQxlGNzBn7WD2bqfR"
JESSICA = "Jessica - cgSgspJ2msm6clMCkdW9"
JOHANNA = "Johanna_German - YYDsZT3K2y6tv7X1aj6N"
JUNIPER = "Juniper - aMSt68OGf4xUZAnLpTU8"
LAUREN = "Lauren - 3liN8q8YoeB9Hk6AboKe"
LINA = "Lina - oWjuL7HSoaEJRMDMP3HD"
MONIKA = "Monika_Hindi_8 - 2bNrEsM0omyhLiEyOwqY"
NEHA = "Neha_Hindi - QTKSa2Iyv0yoxvXY2V8a"
OLIVIA = "Olivia - 1rviaVF7GGGkTU36HNpz"
PAUL = "Paul - 5Q0t7uMcjvnagumLfvZi"
RACHEL = "Rachel - 21m00Tcm4TlvDq8ikWAM"
ROGER = "Roger - CwhRBWXzGAHq8TQ4Fs17"
SAMI_REAL = "Sami_Real - O4cGUVdAocn0z4EpQ9yF"
SARAH = "Sarah - EXAVITQu4vr4xnSDxMaL"
SIA = "Sia_Hindi_10 - ryIIztHPLYSJ74ueXxnO"
ZARA = "Zara - MmQVkVZnQ0dUbfWzcW6f"
class ElevenlabsModel(str, Enum):
FLASH_2 = "eleven_flash_v2_5"
@ -253,16 +188,12 @@ class ElevenlabsModel(str, Enum):
@register_tts
class ElevenlabsTTSConfiguration(BaseServiceConfiguration):
    """TTS configuration for the ElevenLabs provider.

    ``voice`` is a plain string whose default is a bare voice ID; ``speed``
    is constrained to the range 0.1 to 2.0 inclusive.
    """

    provider: Literal[ServiceProviders.ELEVENLABS] = ServiceProviders.ELEVENLABS
    # Single declaration: a second, enum-typed ``voice`` annotation would be
    # silently overridden by this one, so only the effective field is kept.
    voice: str = "21m00Tcm4TlvDq8ikWAM"  # Rachel voice ID
    speed: float = Field(default=1.0, ge=0.1, le=2.0, description="Speed of the voice")
    model: ElevenlabsModel = ElevenlabsModel.FLASH_2
    api_key: str
class OpenAIVoice(str, Enum):
ALLY = "alloy"
class OpenAITTSModel(str, Enum):
GPT_4o_MINI = "gpt-4o-mini-tts"
@ -271,29 +202,10 @@ class OpenAITTSModel(str, Enum):
class OpenAITTSService(BaseTTSConfiguration):
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
model: OpenAITTSModel = OpenAITTSModel.GPT_4o_MINI
voice: OpenAIVoice = OpenAIVoice.ALLY
voice: str = "alloy"
api_key: str
# class NeuphonicVoice(str, Enum):
# EMILY = "Emily - fc854436-2dac-4d21-aa69-ae17b54e98eb"
# @register_tts
# class NeuphonicTTSService(BaseTTSConfiguration):
# provider: Literal[ServiceProviders.NEUPHONIC] = ServiceProviders.NEUPHONIC
# voice: NeuphonicVoice = NeuphonicVoice.EMILY
# model: str = "NA"
# api_key: str
# Dograh TTS Service
class DograhVoice(str, Enum):
DEFAULT = "default"
JOEY = "joey"
RACHEL = "rachel"
class DograhTTSModel(str, Enum):
DEFAULT = "default"
@ -302,16 +214,58 @@ class DograhTTSModel(str, Enum):
class DograhTTSService(BaseTTSConfiguration):
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
model: DograhTTSModel = DograhTTSModel.DEFAULT
voice: DograhVoice = DograhVoice.DEFAULT
voice: str = "default"
api_key: str
class SarvamTTSModel(str, Enum):
    """Model identifiers selectable for Sarvam text-to-speech."""

    BULBUL_V2 = "bulbul:v2"
    BULBUL_V3 = "bulbul:v3"
class SarvamVoice(str, Enum):
    """Voice identifiers selectable for Sarvam text-to-speech."""

    # Female voices
    ANUSHKA = "anushka"
    MANISHA = "manisha"
    VIDYA = "vidya"
    ARYA = "arya"

    # Male voices
    ABHILASH = "abhilash"
    KARUN = "karun"
    HITESH = "hitesh"
class SarvamLanguage(str, Enum):
    """Language codes selectable in Sarvam service configuration."""

    BENGALI = "bn-IN"
    ENGLISH_INDIA = "en-IN"
    GUJARATI = "gu-IN"
    HINDI = "hi-IN"
    KANNADA = "kn-IN"
    MALAYALAM = "ml-IN"
    MARATHI = "mr-IN"
    ODIA = "od-IN"
    PUNJABI = "pa-IN"
    TAMIL = "ta-IN"
    TELUGU = "te-IN"
    ASSAMESE = "as-IN"
# @register_tts
# class SarvamTTSConfiguration(BaseTTSConfiguration):
# provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
# model: SarvamTTSModel = SarvamTTSModel.BULBUL_V2
# voice: SarvamVoice = SarvamVoice.ANUSHKA
# language: SarvamLanguage = SarvamLanguage.HINDI
# api_key: str
# Union of the TTS configuration models; the ``provider`` field is the
# discriminator that selects which member applies.
TTSConfig = Annotated[
    Union[
        DeepgramTTSConfiguration,
        OpenAITTSService,
        ElevenlabsTTSConfiguration,
        DograhTTSService,
        # SarvamTTSConfiguration,
    ],
    Field(discriminator="provider"),
]
@ -323,10 +277,45 @@ class DeepgramSTTModel(str, Enum):
NOVA_3_GENERAL = "nova-3-general"
class DeepgramLanguage(str, Enum):
    """Language codes selectable for Deepgram speech-to-text.

    ``MULTI`` ("multi") is the multilingual option.
    """

    MULTI = "multi"

    # English variants
    ENGLISH = "en"
    ENGLISH_US = "en-US"
    ENGLISH_GB = "en-GB"
    ENGLISH_AU = "en-AU"
    ENGLISH_IN = "en-IN"

    # Other languages and their regional variants
    SPANISH = "es"
    SPANISH_LATAM = "es-419"
    FRENCH = "fr"
    FRENCH_CA = "fr-CA"
    GERMAN = "de"
    ITALIAN = "it"
    PORTUGUESE = "pt"
    PORTUGUESE_BR = "pt-BR"
    DUTCH = "nl"
    HINDI = "hi"
    JAPANESE = "ja"
    KOREAN = "ko"
    CHINESE_SIMPLIFIED = "zh-CN"
    CHINESE_TRADITIONAL = "zh-TW"
    RUSSIAN = "ru"
    POLISH = "pl"
    TURKISH = "tr"
    UKRAINIAN = "uk"
    VIETNAMESE = "vi"
    SWEDISH = "sv"
    DANISH = "da"
    NORWEGIAN = "no"
    FINNISH = "fi"
    INDONESIAN = "id"
    THAI = "th"
@register_stt
class DeepgramSTTConfiguration(BaseSTTConfiguration):
    """STT configuration for the Deepgram provider.

    ``language`` defaults to ``DeepgramLanguage.MULTI``.
    """

    provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
    model: DeepgramSTTModel = DeepgramSTTModel.NOVA_3_GENERAL
    language: DeepgramLanguage = DeepgramLanguage.MULTI
    api_key: str
@ -359,8 +348,27 @@ class DograhSTTService(BaseSTTConfiguration):
api_key: str
# Sarvam STT Service
class SarvamSTTModel(str, Enum):
    """Model identifiers selectable for Sarvam speech-to-text."""

    SAARIKA_V2_5 = "saarika:v2.5"
    # STT-Translate model (auto-detects language)
    SAARAS_V2 = "saaras:v2"
# @register_stt
# class SarvamSTTConfiguration(BaseSTTConfiguration):
# provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
# model: SarvamSTTModel = SarvamSTTModel.SAARIKA_V2_5
# language: SarvamLanguage = SarvamLanguage.HINDI
# api_key: str
# Union of the STT configuration models; the ``provider`` field is the
# discriminator that selects which member applies. Exactly one ``Union`` is
# passed: any further positional argument to ``Annotated`` before ``Field``
# would be treated as metadata rather than as part of the type.
STTConfig = Annotated[
    Union[
        DeepgramSTTConfiguration,
        OpenAISTTConfiguration,
        DograhSTTService,
        # SarvamSTTConfiguration,
    ],
    Field(discriminator="provider"),
]

View file

@ -285,6 +285,44 @@ class MPSServiceKeyClient:
response=response,
)
async def get_voices(
    self,
    provider: str,
    organization_id: Optional[int] = None,
    created_by: Optional[str] = None,
) -> dict:
    """
    Get available voices for a TTS provider from MPS.

    Args:
        provider: TTS provider name (elevenlabs, deepgram, sarvam, cartesia)
        organization_id: Organization ID (for authenticated mode)
        created_by: User provider ID (for OSS mode)

    Returns:
        Dictionary containing provider name and list of voices
        (the parsed JSON body of the MPS response)

    Raises:
        httpx.HTTPStatusError: If MPS answers with any status other than 200
    """
    # Function-scope import so this method does not depend on a file-level
    # import being added elsewhere.
    from urllib.parse import quote

    # Percent-encode the provider so reserved characters ("/", "?", "#", ...)
    # cannot change the path or query of the request being sent.
    provider_segment = quote(provider, safe="")

    async with httpx.AsyncClient(timeout=self.timeout) as client:
        response = await client.get(
            f"{self.base_url}/api/v1/voice-proxy/{provider_segment}/voices",
            headers=self._get_headers(organization_id, created_by),
        )

        if response.status_code == 200:
            return response.json()

        # Anything other than 200 is logged with the response body and
        # surfaced to the caller as an HTTPStatusError.
        logger.error(
            f"Failed to get voices for {provider}: {response.status_code} - {response.text}"
        )
        raise httpx.HTTPStatusError(
            f"Failed to get voices: {response.text}",
            request=response.request,
            response=response,
        )
async def call_workflow_api(
self,
call_type: str,

View file

@ -17,6 +17,9 @@ from pipecat.services.groq.llm import GroqLLMService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.services.openai.stt import OpenAISTTService
from pipecat.services.openai.tts import OpenAITTSService
from pipecat.services.sarvam.stt import SarvamSTTService
from pipecat.services.sarvam.tts import SarvamTTSService
from pipecat.transcriptions.language import Language
from pipecat.utils.text.xml_function_tag_filter import XMLFunctionTagFilter
if TYPE_CHECKING:
@ -26,8 +29,13 @@ if TYPE_CHECKING:
def create_stt_service(user_config):
"""Create and return appropriate STT service based on user configuration"""
if user_config.stt.provider == ServiceProviders.DEEPGRAM.value:
# Use language from user config, defaulting to "multi" for multilingual support
language = getattr(user_config.stt, "language", None)
language_value = (
language.value if hasattr(language, "value") else (language or "multi")
)
live_options = LiveOptions(
language="multi", profanity_filter=False, endpointing=100
language=language_value, profanity_filter=False, endpointing=100
)
return DeepgramSTTService(
live_options=live_options,
@ -53,6 +61,32 @@ def create_stt_service(user_config):
model=user_config.stt.model.value,
audio_passthrough=False, # Disable passthrough since audio is buffered separately
)
elif user_config.stt.provider == ServiceProviders.SARVAM.value:
# Map Sarvam language code to pipecat Language enum
language_mapping = {
"bn-IN": Language.BN_IN,
"gu-IN": Language.GU_IN,
"hi-IN": Language.HI_IN,
"kn-IN": Language.KN_IN,
"ml-IN": Language.ML_IN,
"mr-IN": Language.MR_IN,
"ta-IN": Language.TA_IN,
"te-IN": Language.TE_IN,
"pa-IN": Language.PA_IN,
"od-IN": Language.OR_IN,
"en-IN": Language.EN_IN,
"as-IN": Language.AS_IN,
}
language = getattr(user_config.stt, "language", None)
language_value = language.value if hasattr(language, "value") else language
pipecat_language = language_mapping.get(language_value, Language.HI_IN)
return SarvamSTTService(
api_key=user_config.stt.api_key,
model=user_config.stt.model.value,
params=SarvamSTTService.InputParams(language=pipecat_language),
audio_passthrough=False,
)
else:
raise HTTPException(
status_code=400, detail=f"Invalid STT provider {user_config.stt.provider}"
@ -81,7 +115,12 @@ def create_tts_service(user_config, audio_config: "AudioConfig"):
text_filters=[xml_function_tag_filter],
)
elif user_config.tts.provider == ServiceProviders.ELEVENLABS.value:
voice_id = user_config.tts.voice.split(" - ")[1]
# Backward compatible with older configuration "Name - voice_id"
try:
voice_id = user_config.tts.voice.split(" - ")[1]
except IndexError:
voice_id = user_config.tts.voice
return ElevenLabsTTSService(
reconnect_on_error=False,
api_key=user_config.tts.api_key,
@ -103,6 +142,35 @@ def create_tts_service(user_config, audio_config: "AudioConfig"):
voice=user_config.tts.voice.value,
text_filters=[xml_function_tag_filter],
)
elif user_config.tts.provider == ServiceProviders.SARVAM.value:
# Map Sarvam language code to pipecat Language enum for TTS
language_mapping = {
"bn-IN": Language.BN,
"en-IN": Language.EN,
"gu-IN": Language.GU,
"hi-IN": Language.HI,
"kn-IN": Language.KN,
"ml-IN": Language.ML,
"mr-IN": Language.MR,
"od-IN": Language.OR,
"pa-IN": Language.PA,
"ta-IN": Language.TA,
"te-IN": Language.TE,
}
language = getattr(user_config.tts, "language", None)
language_value = language.value if hasattr(language, "value") else language
pipecat_language = language_mapping.get(language_value, Language.HI)
voice = getattr(user_config.tts, "voice", None)
voice_value = voice.value if hasattr(voice, "value") else (voice or "anushka")
return SarvamTTSService(
api_key=user_config.tts.api_key,
model=user_config.tts.model.value,
voice_id=voice_value,
params=SarvamTTSService.InputParams(language=pipecat_language),
text_filters=[xml_function_tag_filter],
)
else:
raise HTTPException(
status_code=400, detail=f"Invalid TTS provider {user_config.tts.provider}"