mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-25 08:48:13 +02:00
feat: add voice selectors in elevenlabs (#88)
This commit is contained in:
parent
480e8a5f60
commit
45c5b7c304
22 changed files with 978 additions and 166 deletions
|
|
@ -20,7 +20,7 @@ RUN pip install --user --no-cache-dir -r requirements.txt && \
|
|||
|
||||
# Copy and install pipecat from local submodule
|
||||
COPY pipecat /tmp/pipecat
|
||||
RUN pip install --user --no-cache-dir '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,soundfile,silero,webrtc,local-smart-turn-v3]' && \
|
||||
RUN pip install --user --no-cache-dir '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,local-smart-turn-v3]' && \
|
||||
# Clean up pip cache and temporary pipecat directory
|
||||
rm -rf /root/.cache/pip /tmp/pipecat
|
||||
|
||||
|
|
|
|||
|
|
@ -111,9 +111,7 @@ def apply_workflow_run_filters(
|
|||
# (subscript [] only works in PostgreSQL 14+)
|
||||
filter_conditions.append(
|
||||
cast(WorkflowRunModel.gathered_context, JSONB)
|
||||
.op("->>")(
|
||||
"mapped_call_disposition"
|
||||
)
|
||||
.op("->>")("mapped_call_disposition")
|
||||
.in_(codes)
|
||||
)
|
||||
|
||||
|
|
@ -147,9 +145,7 @@ def apply_workflow_run_filters(
|
|||
# Use ->> operator for compatibility with all PostgreSQL versions
|
||||
filter_conditions.append(
|
||||
cast(WorkflowRunModel.initial_context, JSONB)
|
||||
.op("->>")(
|
||||
"phone"
|
||||
)
|
||||
.op("->>")("phone")
|
||||
.contains(phone)
|
||||
)
|
||||
|
||||
|
|
@ -178,13 +174,9 @@ def apply_workflow_run_filters(
|
|||
"total_cost_usd"
|
||||
)
|
||||
if min_val is not None:
|
||||
filter_conditions.append(
|
||||
cast(cost_text, Integer) >= min_val
|
||||
)
|
||||
filter_conditions.append(cast(cost_text, Integer) >= min_val)
|
||||
if max_val is not None:
|
||||
filter_conditions.append(
|
||||
cast(cost_text, Integer) <= max_val
|
||||
)
|
||||
filter_conditions.append(cast(cost_text, Integer) <= max_val)
|
||||
|
||||
if filter_conditions:
|
||||
base_query = base_query.where(and_(*filter_conditions))
|
||||
|
|
|
|||
|
|
@ -1,7 +1,8 @@
|
|||
from datetime import datetime, timedelta
|
||||
from typing import List, Optional, TypedDict, Union
|
||||
from typing import List, Literal, Optional, TypedDict, Union
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from api.db import db_client
|
||||
|
|
@ -17,6 +18,7 @@ from api.services.configuration.defaults import DEFAULT_SERVICE_PROVIDERS
|
|||
from api.services.configuration.masking import mask_user_config
|
||||
from api.services.configuration.merge import merge_user_configurations
|
||||
from api.services.configuration.registry import REGISTRY, ServiceType
|
||||
from api.services.mps_service_key_client import mps_service_key_client
|
||||
|
||||
router = APIRouter(prefix="/user")
|
||||
|
||||
|
|
@ -274,3 +276,46 @@ async def reactivate_api_key(
|
|||
raise HTTPException(status_code=500, detail="Failed to reactivate API key")
|
||||
|
||||
return {"success": True, "message": "API key reactivated successfully"}
|
||||
|
||||
|
||||
# Voice Configuration Endpoints
|
||||
TTSProvider = Literal["elevenlabs", "deepgram", "sarvam", "cartesia", "dograh"]
|
||||
|
||||
|
||||
class VoiceInfo(BaseModel):
    """Description of a single TTS voice.

    Only ``voice_id`` and ``name`` are required; every other field is
    optional and defaults to ``None`` when it is not supplied.
    """

    # Required identification fields.
    voice_id: str
    name: str

    # Optional descriptive metadata.
    description: Optional[str] = None
    accent: Optional[str] = None
    gender: Optional[str] = None
    language: Optional[str] = None
    preview_url: Optional[str] = None
|
||||
|
||||
|
||||
class VoicesResponse(BaseModel):
    """Response body pairing a provider name with its list of voices."""

    provider: str
    voices: List[VoiceInfo]
|
||||
|
||||
|
||||
@router.get("/configurations/voices/{provider}")
async def get_voices(
    provider: TTSProvider,
    user: UserModel = Depends(get_user),
) -> VoicesResponse:
    """Get available voices for a TTS provider.

    Args:
        provider: TTS provider name taken from the URL path.
        user: The authenticated user; their selected organization id and
            provider id are forwarded to the MPS client.

    Returns:
        The provider name and the voices reported for it. The provider
        falls back to the requested one, and the voice list to empty,
        when the upstream payload omits those keys.

    Raises:
        HTTPException: 500 if the lookup or the payload conversion fails.
    """
    try:
        result = await mps_service_key_client.get_voices(
            provider=provider,
            organization_id=user.selected_organization_id,
            created_by=user.provider_id,
        )
        return VoicesResponse(
            provider=result.get("provider", provider),
            voices=[VoiceInfo(**voice) for voice in result.get("voices", [])],
        )
    except Exception as e:
        # Log with the traceback so the root cause is not lost, and chain the
        # original exception onto the generic 500 returned to the client.
        logger.exception(f"Failed to fetch voices for {provider}: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to fetch voices for {provider}",
        ) from e
|
||||
|
|
|
|||
|
|
@ -11,10 +11,8 @@ from api.db.models import UserModel
|
|||
from api.schemas.user_configuration import UserConfiguration
|
||||
from api.services.auth.stack_auth import stackauth
|
||||
from api.services.configuration.registry import (
|
||||
DograhLLMModel,
|
||||
DograhSTTModel,
|
||||
DograhTTSModel,
|
||||
DograhVoice,
|
||||
ServiceProviders,
|
||||
)
|
||||
|
||||
|
|
@ -244,13 +242,13 @@ async def create_user_configuration_with_mps_key(
|
|||
"llm": {
|
||||
"provider": ServiceProviders.DOGRAH.value,
|
||||
"api_key": service_key,
|
||||
"model": DograhLLMModel.DEFAULT.value, # Default model
|
||||
"model": "default", # Default model
|
||||
},
|
||||
"tts": {
|
||||
"provider": ServiceProviders.DOGRAH.value,
|
||||
"api_key": service_key,
|
||||
"model": DograhTTSModel.DEFAULT.value, # Default model
|
||||
"voice": DograhVoice.DEFAULT.value, # Default voice
|
||||
"voice": "default", # Default voice
|
||||
},
|
||||
"stt": {
|
||||
"provider": ServiceProviders.DOGRAH.value,
|
||||
|
|
|
|||
|
|
@ -38,6 +38,7 @@ class UserConfigurationValidator:
|
|||
ServiceProviders.AZURE.value: self._check_azure_api_key,
|
||||
ServiceProviders.CARTESIA.value: self._check_cartesia_api_key,
|
||||
ServiceProviders.DOGRAH.value: self._check_dograh_api_key,
|
||||
ServiceProviders.SARVAM.value: self._check_sarvam_api_key,
|
||||
}
|
||||
|
||||
async def validate(self, configuration: UserConfiguration) -> APIKeyStatusResponse:
|
||||
|
|
@ -134,20 +135,5 @@ class UserConfigurationValidator:
|
|||
def _check_dograh_api_key(self, model: str, api_key: str) -> bool:
|
||||
return True
|
||||
|
||||
# def _check_neuphonic_api_key(self, model: str, api_key: str) -> bool:
|
||||
# if not Neuphonic:
|
||||
# self._provider_api_key_validity_status[model] = False
|
||||
# return self._provider_api_key_validity_status[model]
|
||||
|
||||
# if model in self._provider_api_key_validity_status:
|
||||
# return self._provider_api_key_validity_status[model]
|
||||
|
||||
# client = Neuphonic(api_key=api_key)
|
||||
# try:
|
||||
# response = client.voices.list() # get's all available voices
|
||||
# voices = response.data["voices"]
|
||||
# self._provider_api_key_validity_status[model] = True
|
||||
# except Exception:
|
||||
# self._provider_api_key_validity_status[model] = False
|
||||
|
||||
# return self._provider_api_key_validity_status[model]
|
||||
def _check_sarvam_api_key(self, model: str, api_key: str) -> bool:
|
||||
return True
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ class ServiceProviders(str, Enum):
|
|||
GOOGLE = "google"
|
||||
AZURE = "azure"
|
||||
DOGRAH = "dograh"
|
||||
SARVAM = "sarvam"
|
||||
|
||||
|
||||
class BaseServiceConfiguration(BaseModel):
|
||||
|
|
@ -31,6 +32,7 @@ class BaseServiceConfiguration(BaseModel):
|
|||
ServiceProviders.GOOGLE,
|
||||
ServiceProviders.AZURE,
|
||||
ServiceProviders.DOGRAH,
|
||||
# ServiceProviders.SARVAM,
|
||||
]
|
||||
api_key: str
|
||||
|
||||
|
|
@ -92,82 +94,56 @@ def register_stt(cls: Type[BaseSTTConfiguration]):
|
|||
|
||||
###################################################### LLM ########################################################################
|
||||
|
||||
|
||||
class OpenAIModel(str, Enum):
|
||||
GPT3_5_TURBO = "gpt-3.5-turbo"
|
||||
GPT4_1 = "gpt-4.1"
|
||||
GPT4_1_MINI = "gpt-4.1-mini"
|
||||
GPT4_1_NANO = "gpt-4.1-nano"
|
||||
GPT5 = "gpt-5"
|
||||
GPT5_MINI = "gpt-5-mini"
|
||||
GPT5_NANO = "gpt-5-nano"
|
||||
# Suggested models for each provider (used for UI dropdown)
|
||||
OPENAI_MODELS = ["gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "gpt-5", "gpt-5-mini", "gpt-5-nano", "gpt-3.5-turbo"]
|
||||
GOOGLE_MODELS = ["gemini-2.0-flash", "gemini-2.0-flash-lite", "gemini-2.5-flash", "gemini-2.5-flash-lite"]
|
||||
GROQ_MODELS = [
|
||||
"llama-3.3-70b-versatile",
|
||||
"deepseek-r1-distill-llama-70b",
|
||||
"qwen-qwq-32b",
|
||||
"meta-llama/llama-4-scout-17b-16e-instruct",
|
||||
"meta-llama/llama-4-maverick-17b-128e-instruct",
|
||||
"gemma2-9b-it",
|
||||
"llama-3.1-8b-instant",
|
||||
"openai/gpt-oss-120b",
|
||||
]
|
||||
AZURE_MODELS = ["gpt-4.1-mini"]
|
||||
DOGRAH_LLM_MODELS = ["default", "accurate", "fast", "lite", "zen", "zen_lite"]
|
||||
|
||||
|
||||
@register_llm
|
||||
class OpenAILLMService(BaseLLMConfiguration):
|
||||
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
|
||||
model: OpenAIModel = OpenAIModel.GPT4_1
|
||||
model: str = Field(default="gpt-4.1", json_schema_extra={"examples": OPENAI_MODELS})
|
||||
api_key: str
|
||||
|
||||
|
||||
class GoogleModel(str, Enum):
|
||||
GEMINI_2_0_FLASH = "gemini-2.0-flash"
|
||||
GEMINI_2_0_FLASH_LITE = "gemini-2.0-flash-lite"
|
||||
GEMINI_2_5_FLASH = "gemini-2.5-flash"
|
||||
GEMINI_2_5_FLASH_LITE = "gemini-2.5-flash-lite"
|
||||
|
||||
|
||||
@register_llm
|
||||
class GoogleLLMService(BaseLLMConfiguration):
|
||||
provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE
|
||||
model: GoogleModel = GoogleModel.GEMINI_2_0_FLASH
|
||||
model: str = Field(default="gemini-2.0-flash", json_schema_extra={"examples": GOOGLE_MODELS})
|
||||
api_key: str
|
||||
|
||||
|
||||
class GroqModel(str, Enum):
|
||||
LLAMA_3_3_70B = "llama-3.3-70b-versatile"
|
||||
DEEPSEEK_R1_DISTILL_LLAMA_70B = "deepseek-r1-distill-llama-70b"
|
||||
QUEN_QWQ_32B = "qwen-qwq-32b"
|
||||
LLAMA_4_SCOUT_17B_16E_INSTRUCT = "meta-llama/llama-4-scout-17b-16e-instruct"
|
||||
LLAMA_4_MAVERICK_17B_128E_INSTRUCT = "meta-llama/llama-4-maverick-17b-128e-instruct"
|
||||
GEMMA2_9B_IT = "gemma2-9b-it"
|
||||
LLAMA_3_1_8B_INSTANT = "llama-3.1-8b-instant"
|
||||
OPENAI_GPT_OSS_120B = "openai/gpt-oss-120b"
|
||||
|
||||
|
||||
@register_llm
|
||||
class GroqLLMService(BaseLLMConfiguration):
|
||||
provider: Literal[ServiceProviders.GROQ] = ServiceProviders.GROQ
|
||||
model: GroqModel = GroqModel.LLAMA_3_3_70B
|
||||
model: str = Field(default="llama-3.3-70b-versatile", json_schema_extra={"examples": GROQ_MODELS})
|
||||
api_key: str
|
||||
|
||||
|
||||
class AzureModel(str, Enum):
|
||||
GPT4_1_MINI = "gpt-4.1-mini"
|
||||
|
||||
|
||||
@register_llm
|
||||
class AzureLLMService(BaseLLMConfiguration):
|
||||
provider: Literal[ServiceProviders.AZURE] = ServiceProviders.AZURE
|
||||
model: AzureModel = AzureModel.GPT4_1_MINI
|
||||
model: str = Field(default="gpt-4.1-mini", json_schema_extra={"examples": AZURE_MODELS})
|
||||
api_key: str
|
||||
endpoint: str
|
||||
|
||||
|
||||
# Dograh LLM Service
|
||||
class DograhLLMModel(str, Enum):
|
||||
DEFAULT = "default"
|
||||
ACCURATE = "accurate"
|
||||
FAST = "fast"
|
||||
LITE = "lite"
|
||||
ZEN = "zen"
|
||||
ZEN_LITE = "zen_lite"
|
||||
|
||||
|
||||
@register_llm
|
||||
class DograhLLMService(BaseLLMConfiguration):
|
||||
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
|
||||
model: DograhLLMModel = DograhLLMModel.DEFAULT
|
||||
model: str = Field(default="default", json_schema_extra={"examples": DOGRAH_LLM_MODELS})
|
||||
api_key: str
|
||||
|
||||
|
||||
|
|
@ -185,15 +161,10 @@ LLMConfig = Annotated[
|
|||
###################################################### TTS ########################################################################
|
||||
|
||||
|
||||
class DeepgramVoice(str, Enum):
|
||||
HELENA = "aura-2-helena-en"
|
||||
THALIA = "aura-2-thalia-en"
|
||||
|
||||
|
||||
@register_tts
|
||||
class DeepgramTTSConfiguration(BaseServiceConfiguration):
|
||||
provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
|
||||
voice: DeepgramVoice = DeepgramVoice.HELENA
|
||||
voice: str = "aura-2-helena-en"
|
||||
api_key: str
|
||||
|
||||
@computed_field
|
||||
|
|
@ -210,42 +181,6 @@ class DeepgramTTSConfiguration(BaseServiceConfiguration):
|
|||
return "aura-2"
|
||||
|
||||
|
||||
class ElevenlabsVoice(str, Enum):
|
||||
ALEXANDRA = "Alexandra - 3dzJXoCYueSQiptQ6euE"
|
||||
AMY = "Amy - oGn4Ha2pe2vSJkmIJgLQ"
|
||||
ANGELA = "Angela - FUfBrNit0NNZAwb58KWH"
|
||||
ARIA = "Aria - 9BWtsMINqrJLrRacOk9x"
|
||||
CHELSEA = "Chelsea - NHRgOEwqx5WZNClv5sat"
|
||||
CHRISTINA = "Christina - X03mvPuTfprif8QBAVeJ"
|
||||
CLARA = "Clara - ZIlrSGI4jZqobxRKprJz"
|
||||
CLYDE = "Clyde - 2EiwWnXFnvU5JabPnv8n"
|
||||
DAVE = "Dave - CYw3kZ02Hs0563khs1Fj"
|
||||
DOMI = "Domi - AZnzlk1XvdvUeBnXmlld"
|
||||
DREW = "Drew - 29vD33N1CtxCmqQRPOHJ"
|
||||
ELENA = "Elena_German - iFJwt4O7E3aafIpJFfcu"
|
||||
EVE = "Eve - BZgkqPqms7Kj9ulSkVzn"
|
||||
FIN = "Fin - D38z5RcWu1voky8WS1ja"
|
||||
HOPE_BESTIE = "Hope_Bestie - uYXf8XasLslADfZ2MB4u"
|
||||
HOPE_NATURAL = "Hope_Natural - OYTbf65OHHFELVut7v2H"
|
||||
JARNATHAN = "Jarnathan - c6SfcYrb2t09NHXiT80T"
|
||||
JENNA = "Jenna - C2BkQxlGNzBn7WD2bqfR"
|
||||
JESSICA = "Jessica - cgSgspJ2msm6clMCkdW9"
|
||||
JOHANNA = "Johanna_German - YYDsZT3K2y6tv7X1aj6N"
|
||||
JUNIPER = "Juniper - aMSt68OGf4xUZAnLpTU8"
|
||||
LAUREN = "Lauren - 3liN8q8YoeB9Hk6AboKe"
|
||||
LINA = "Lina - oWjuL7HSoaEJRMDMP3HD"
|
||||
MONIKA = "Monika_Hindi_8 - 2bNrEsM0omyhLiEyOwqY"
|
||||
NEHA = "Neha_Hindi - QTKSa2Iyv0yoxvXY2V8a"
|
||||
OLIVIA = "Olivia - 1rviaVF7GGGkTU36HNpz"
|
||||
PAUL = "Paul - 5Q0t7uMcjvnagumLfvZi"
|
||||
RACHEL = "Rachel - 21m00Tcm4TlvDq8ikWAM"
|
||||
ROGER = "Roger - CwhRBWXzGAHq8TQ4Fs17"
|
||||
SAMI_REAL = "Sami_Real - O4cGUVdAocn0z4EpQ9yF"
|
||||
SARAH = "Sarah - EXAVITQu4vr4xnSDxMaL"
|
||||
SIA = "Sia_Hindi_10 - ryIIztHPLYSJ74ueXxnO"
|
||||
ZARA = "Zara - MmQVkVZnQ0dUbfWzcW6f"
|
||||
|
||||
|
||||
class ElevenlabsModel(str, Enum):
|
||||
FLASH_2 = "eleven_flash_v2_5"
|
||||
|
||||
|
|
@ -253,16 +188,12 @@ class ElevenlabsModel(str, Enum):
|
|||
@register_tts
|
||||
class ElevenlabsTTSConfiguration(BaseServiceConfiguration):
|
||||
provider: Literal[ServiceProviders.ELEVENLABS] = ServiceProviders.ELEVENLABS
|
||||
voice: ElevenlabsVoice = ElevenlabsVoice.RACHEL
|
||||
voice: str = "21m00Tcm4TlvDq8ikWAM" # Rachel voice ID
|
||||
speed: float = Field(default=1.0, ge=0.1, le=2.0, description="Speed of the voice")
|
||||
model: ElevenlabsModel = ElevenlabsModel.FLASH_2
|
||||
api_key: str
|
||||
|
||||
|
||||
class OpenAIVoice(str, Enum):
|
||||
ALLY = "alloy"
|
||||
|
||||
|
||||
class OpenAITTSModel(str, Enum):
|
||||
GPT_4o_MINI = "gpt-4o-mini-tts"
|
||||
|
||||
|
|
@ -271,29 +202,10 @@ class OpenAITTSModel(str, Enum):
|
|||
class OpenAITTSService(BaseTTSConfiguration):
|
||||
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
|
||||
model: OpenAITTSModel = OpenAITTSModel.GPT_4o_MINI
|
||||
voice: OpenAIVoice = OpenAIVoice.ALLY
|
||||
voice: str = "alloy"
|
||||
api_key: str
|
||||
|
||||
|
||||
# class NeuphonicVoice(str, Enum):
|
||||
# EMILY = "Emily - fc854436-2dac-4d21-aa69-ae17b54e98eb"
|
||||
|
||||
|
||||
# @register_tts
|
||||
# class NeuphonicTTSService(BaseTTSConfiguration):
|
||||
# provider: Literal[ServiceProviders.NEUPHONIC] = ServiceProviders.NEUPHONIC
|
||||
# voice: NeuphonicVoice = NeuphonicVoice.EMILY
|
||||
# model: str = "NA"
|
||||
# api_key: str
|
||||
|
||||
|
||||
# Dograh TTS Service
|
||||
class DograhVoice(str, Enum):
|
||||
DEFAULT = "default"
|
||||
JOEY = "joey"
|
||||
RACHEL = "rachel"
|
||||
|
||||
|
||||
class DograhTTSModel(str, Enum):
|
||||
DEFAULT = "default"
|
||||
|
||||
|
|
@ -302,16 +214,58 @@ class DograhTTSModel(str, Enum):
|
|||
class DograhTTSService(BaseTTSConfiguration):
|
||||
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
|
||||
model: DograhTTSModel = DograhTTSModel.DEFAULT
|
||||
voice: DograhVoice = DograhVoice.DEFAULT
|
||||
voice: str = "default"
|
||||
api_key: str
|
||||
|
||||
|
||||
class SarvamTTSModel(str, Enum):
    """Model identifiers for Sarvam text-to-speech."""

    BULBUL_V2 = "bulbul:v2"
    BULBUL_V3 = "bulbul:v3"
|
||||
|
||||
|
||||
class SarvamVoice(str, Enum):
    """Voice identifiers for Sarvam text-to-speech, grouped by gender."""

    # Female voices
    ANUSHKA = "anushka"
    MANISHA = "manisha"
    VIDYA = "vidya"
    ARYA = "arya"

    # Male voices
    ABHILASH = "abhilash"
    KARUN = "karun"
    HITESH = "hitesh"
|
||||
|
||||
|
||||
class SarvamLanguage(str, Enum):
    """Language codes for Sarvam services; every code carries the ``-IN`` region."""

    BENGALI = "bn-IN"
    ENGLISH_INDIA = "en-IN"
    GUJARATI = "gu-IN"
    HINDI = "hi-IN"
    KANNADA = "kn-IN"
    MALAYALAM = "ml-IN"
    MARATHI = "mr-IN"
    ODIA = "od-IN"
    PUNJABI = "pa-IN"
    TAMIL = "ta-IN"
    TELUGU = "te-IN"
    ASSAMESE = "as-IN"
|
||||
|
||||
|
||||
# @register_tts
|
||||
# class SarvamTTSConfiguration(BaseTTSConfiguration):
|
||||
# provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
|
||||
# model: SarvamTTSModel = SarvamTTSModel.BULBUL_V2
|
||||
# voice: SarvamVoice = SarvamVoice.ANUSHKA
|
||||
# language: SarvamLanguage = SarvamLanguage.HINDI
|
||||
# api_key: str
|
||||
|
||||
|
||||
TTSConfig = Annotated[
|
||||
Union[
|
||||
DeepgramTTSConfiguration,
|
||||
OpenAITTSService,
|
||||
ElevenlabsTTSConfiguration,
|
||||
DograhTTSService,
|
||||
# SarvamTTSConfiguration,
|
||||
],
|
||||
Field(discriminator="provider"),
|
||||
]
|
||||
|
|
@ -323,10 +277,45 @@ class DeepgramSTTModel(str, Enum):
|
|||
NOVA_3_GENERAL = "nova-3-general"
|
||||
|
||||
|
||||
class DeepgramLanguage(str, Enum):
    """Language codes selectable for the Deepgram STT ``language`` setting."""

    MULTI = "multi"

    # English variants
    ENGLISH = "en"
    ENGLISH_US = "en-US"
    ENGLISH_GB = "en-GB"
    ENGLISH_AU = "en-AU"
    ENGLISH_IN = "en-IN"

    # Other languages and their regional variants
    SPANISH = "es"
    SPANISH_LATAM = "es-419"
    FRENCH = "fr"
    FRENCH_CA = "fr-CA"
    GERMAN = "de"
    ITALIAN = "it"
    PORTUGUESE = "pt"
    PORTUGUESE_BR = "pt-BR"
    DUTCH = "nl"
    HINDI = "hi"
    JAPANESE = "ja"
    KOREAN = "ko"
    CHINESE_SIMPLIFIED = "zh-CN"
    CHINESE_TRADITIONAL = "zh-TW"
    RUSSIAN = "ru"
    POLISH = "pl"
    TURKISH = "tr"
    UKRAINIAN = "uk"
    VIETNAMESE = "vi"
    SWEDISH = "sv"
    DANISH = "da"
    NORWEGIAN = "no"
    FINNISH = "fi"
    INDONESIAN = "id"
    THAI = "th"
|
||||
|
||||
|
||||
@register_stt
|
||||
class DeepgramSTTConfiguration(BaseSTTConfiguration):
|
||||
provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
|
||||
model: DeepgramSTTModel = DeepgramSTTModel.NOVA_3_GENERAL
|
||||
language: DeepgramLanguage = DeepgramLanguage.MULTI
|
||||
api_key: str
|
||||
|
||||
|
||||
|
|
@ -359,8 +348,27 @@ class DograhSTTService(BaseSTTConfiguration):
|
|||
api_key: str
|
||||
|
||||
|
||||
# Sarvam STT Service
|
||||
class SarvamSTTModel(str, Enum):
    """Model identifiers for Sarvam speech-to-text."""

    SAARIKA_V2_5 = "saarika:v2.5"
    # STT-Translate model (auto-detects language)
    SAARAS_V2 = "saaras:v2"
|
||||
|
||||
|
||||
# @register_stt
|
||||
# class SarvamSTTConfiguration(BaseSTTConfiguration):
|
||||
# provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
|
||||
# model: SarvamSTTModel = SarvamSTTModel.SAARIKA_V2_5
|
||||
# language: SarvamLanguage = SarvamLanguage.HINDI
|
||||
# api_key: str
|
||||
|
||||
|
||||
STTConfig = Annotated[
|
||||
Union[DeepgramSTTConfiguration, OpenAISTTConfiguration, DograhSTTService],
|
||||
Union[
|
||||
DeepgramSTTConfiguration,
|
||||
OpenAISTTConfiguration,
|
||||
DograhSTTService,
|
||||
# SarvamSTTConfiguration,
|
||||
],
|
||||
Field(discriminator="provider"),
|
||||
]
|
||||
|
||||
|
|
|
|||
|
|
@ -285,6 +285,44 @@ class MPSServiceKeyClient:
|
|||
response=response,
|
||||
)
|
||||
|
||||
async def get_voices(
    self,
    provider: str,
    organization_id: Optional[int] = None,
    created_by: Optional[str] = None,
) -> dict:
    """
    Get available voices for a TTS provider from MPS.

    Args:
        provider: TTS provider name (elevenlabs, deepgram, sarvam, cartesia)
        organization_id: Organization ID (for authenticated mode)
        created_by: User provider ID (for OSS mode)

    Returns:
        The decoded JSON body of the MPS response (expected to contain the
        provider name and a list of voices)

    Raises:
        httpx.HTTPStatusError: If MPS answers with any status other than 200
    """
    url = f"{self.base_url}/api/v1/voice-proxy/{provider}/voices"

    async with httpx.AsyncClient(timeout=self.timeout) as client:
        response = await client.get(
            url,
            headers=self._get_headers(organization_id, created_by),
        )

        # Anything other than 200 is an error: log the upstream status and
        # body, then raise with the request/response attached for callers.
        if response.status_code != 200:
            logger.error(
                f"Failed to get voices for {provider}: {response.status_code} - {response.text}"
            )
            raise httpx.HTTPStatusError(
                f"Failed to get voices: {response.text}",
                request=response.request,
                response=response,
            )

        return response.json()
|
||||
|
||||
async def call_workflow_api(
|
||||
self,
|
||||
call_type: str,
|
||||
|
|
|
|||
|
|
@ -17,6 +17,9 @@ from pipecat.services.groq.llm import GroqLLMService
|
|||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.openai.stt import OpenAISTTService
|
||||
from pipecat.services.openai.tts import OpenAITTSService
|
||||
from pipecat.services.sarvam.stt import SarvamSTTService
|
||||
from pipecat.services.sarvam.tts import SarvamTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.text.xml_function_tag_filter import XMLFunctionTagFilter
|
||||
|
||||
if TYPE_CHECKING:
|
||||
|
|
@ -26,8 +29,13 @@ if TYPE_CHECKING:
|
|||
def create_stt_service(user_config):
|
||||
"""Create and return appropriate STT service based on user configuration"""
|
||||
if user_config.stt.provider == ServiceProviders.DEEPGRAM.value:
|
||||
# Use language from user config, defaulting to "multi" for multilingual support
|
||||
language = getattr(user_config.stt, "language", None)
|
||||
language_value = (
|
||||
language.value if hasattr(language, "value") else (language or "multi")
|
||||
)
|
||||
live_options = LiveOptions(
|
||||
language="multi", profanity_filter=False, endpointing=100
|
||||
language=language_value, profanity_filter=False, endpointing=100
|
||||
)
|
||||
return DeepgramSTTService(
|
||||
live_options=live_options,
|
||||
|
|
@ -53,6 +61,32 @@ def create_stt_service(user_config):
|
|||
model=user_config.stt.model.value,
|
||||
audio_passthrough=False, # Disable passthrough since audio is buffered separately
|
||||
)
|
||||
elif user_config.stt.provider == ServiceProviders.SARVAM.value:
|
||||
# Map Sarvam language code to pipecat Language enum
|
||||
language_mapping = {
|
||||
"bn-IN": Language.BN_IN,
|
||||
"gu-IN": Language.GU_IN,
|
||||
"hi-IN": Language.HI_IN,
|
||||
"kn-IN": Language.KN_IN,
|
||||
"ml-IN": Language.ML_IN,
|
||||
"mr-IN": Language.MR_IN,
|
||||
"ta-IN": Language.TA_IN,
|
||||
"te-IN": Language.TE_IN,
|
||||
"pa-IN": Language.PA_IN,
|
||||
"od-IN": Language.OR_IN,
|
||||
"en-IN": Language.EN_IN,
|
||||
"as-IN": Language.AS_IN,
|
||||
}
|
||||
language = getattr(user_config.stt, "language", None)
|
||||
language_value = language.value if hasattr(language, "value") else language
|
||||
pipecat_language = language_mapping.get(language_value, Language.HI_IN)
|
||||
|
||||
return SarvamSTTService(
|
||||
api_key=user_config.stt.api_key,
|
||||
model=user_config.stt.model.value,
|
||||
params=SarvamSTTService.InputParams(language=pipecat_language),
|
||||
audio_passthrough=False,
|
||||
)
|
||||
else:
|
||||
raise HTTPException(
|
||||
status_code=400, detail=f"Invalid STT provider {user_config.stt.provider}"
|
||||
|
|
@ -81,7 +115,12 @@ def create_tts_service(user_config, audio_config: "AudioConfig"):
|
|||
text_filters=[xml_function_tag_filter],
|
||||
)
|
||||
elif user_config.tts.provider == ServiceProviders.ELEVENLABS.value:
|
||||
voice_id = user_config.tts.voice.split(" - ")[1]
|
||||
# Backward compatible with older configuration "Name - voice_id"
|
||||
try:
|
||||
voice_id = user_config.tts.voice.split(" - ")[1]
|
||||
except IndexError:
|
||||
voice_id = user_config.tts.voice
|
||||
|
||||
return ElevenLabsTTSService(
|
||||
reconnect_on_error=False,
|
||||
api_key=user_config.tts.api_key,
|
||||
|
|
@ -103,6 +142,35 @@ def create_tts_service(user_config, audio_config: "AudioConfig"):
|
|||
voice=user_config.tts.voice.value,
|
||||
text_filters=[xml_function_tag_filter],
|
||||
)
|
||||
elif user_config.tts.provider == ServiceProviders.SARVAM.value:
|
||||
# Map Sarvam language code to pipecat Language enum for TTS
|
||||
language_mapping = {
|
||||
"bn-IN": Language.BN,
|
||||
"en-IN": Language.EN,
|
||||
"gu-IN": Language.GU,
|
||||
"hi-IN": Language.HI,
|
||||
"kn-IN": Language.KN,
|
||||
"ml-IN": Language.ML,
|
||||
"mr-IN": Language.MR,
|
||||
"od-IN": Language.OR,
|
||||
"pa-IN": Language.PA,
|
||||
"ta-IN": Language.TA,
|
||||
"te-IN": Language.TE,
|
||||
}
|
||||
language = getattr(user_config.tts, "language", None)
|
||||
language_value = language.value if hasattr(language, "value") else language
|
||||
pipecat_language = language_mapping.get(language_value, Language.HI)
|
||||
|
||||
voice = getattr(user_config.tts, "voice", None)
|
||||
voice_value = voice.value if hasattr(voice, "value") else (voice or "anushka")
|
||||
|
||||
return SarvamTTSService(
|
||||
api_key=user_config.tts.api_key,
|
||||
model=user_config.tts.model.value,
|
||||
voice_id=voice_value,
|
||||
params=SarvamTTSService.InputParams(language=pipecat_language),
|
||||
text_filters=[xml_function_tag_filter],
|
||||
)
|
||||
else:
|
||||
raise HTTPException(
|
||||
status_code=400, detail=f"Invalid TTS provider {user_config.tts.provider}"
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@ from api.schemas.user_configuration import UserConfiguration
|
|||
from api.services.configuration.masking import is_mask_of, mask_key, mask_user_config
|
||||
from api.services.configuration.merge import merge_user_configurations
|
||||
from api.services.configuration.registry import (
|
||||
GroqModel,
|
||||
OpenAILLMService,
|
||||
)
|
||||
|
||||
|
|
@ -70,7 +69,7 @@ def test_merge_drops_old_key_when_provider_changes():
|
|||
incoming_partial = {
|
||||
"llm": {
|
||||
"provider": "groq",
|
||||
"model": GroqModel.LLAMA_3_3_70B,
|
||||
"model": "llama-3.3-70b-versatile",
|
||||
# api_key intentionally absent – should NOT inherit old key
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue