mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-07 07:55:16 +02:00
feat: add voice selectors in elevenlabs (#88)
This commit is contained in:
parent
480e8a5f60
commit
45c5b7c304
22 changed files with 978 additions and 166 deletions
|
|
@ -20,7 +20,7 @@ RUN pip install --user --no-cache-dir -r requirements.txt && \
|
|||
|
||||
# Copy and install pipecat from local submodule
|
||||
COPY pipecat /tmp/pipecat
|
||||
RUN pip install --user --no-cache-dir '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,soundfile,silero,webrtc,local-smart-turn-v3]' && \
|
||||
RUN pip install --user --no-cache-dir '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,local-smart-turn-v3]' && \
|
||||
# Clean up pip cache and temporary pipecat directory
|
||||
rm -rf /root/.cache/pip /tmp/pipecat
|
||||
|
||||
|
|
|
|||
|
|
@ -111,9 +111,7 @@ def apply_workflow_run_filters(
|
|||
# (subscript [] only works in PostgreSQL 14+)
|
||||
filter_conditions.append(
|
||||
cast(WorkflowRunModel.gathered_context, JSONB)
|
||||
.op("->>")(
|
||||
"mapped_call_disposition"
|
||||
)
|
||||
.op("->>")("mapped_call_disposition")
|
||||
.in_(codes)
|
||||
)
|
||||
|
||||
|
|
@ -147,9 +145,7 @@ def apply_workflow_run_filters(
|
|||
# Use ->> operator for compatibility with all PostgreSQL versions
|
||||
filter_conditions.append(
|
||||
cast(WorkflowRunModel.initial_context, JSONB)
|
||||
.op("->>")(
|
||||
"phone"
|
||||
)
|
||||
.op("->>")("phone")
|
||||
.contains(phone)
|
||||
)
|
||||
|
||||
|
|
@ -178,13 +174,9 @@ def apply_workflow_run_filters(
|
|||
"total_cost_usd"
|
||||
)
|
||||
if min_val is not None:
|
||||
filter_conditions.append(
|
||||
cast(cost_text, Integer) >= min_val
|
||||
)
|
||||
filter_conditions.append(cast(cost_text, Integer) >= min_val)
|
||||
if max_val is not None:
|
||||
filter_conditions.append(
|
||||
cast(cost_text, Integer) <= max_val
|
||||
)
|
||||
filter_conditions.append(cast(cost_text, Integer) <= max_val)
|
||||
|
||||
if filter_conditions:
|
||||
base_query = base_query.where(and_(*filter_conditions))
|
||||
|
|
|
|||
|
|
@ -1,7 +1,8 @@
|
|||
from datetime import datetime, timedelta
|
||||
from typing import List, Optional, TypedDict, Union
|
||||
from typing import List, Literal, Optional, TypedDict, Union
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from api.db import db_client
|
||||
|
|
@ -17,6 +18,7 @@ from api.services.configuration.defaults import DEFAULT_SERVICE_PROVIDERS
|
|||
from api.services.configuration.masking import mask_user_config
|
||||
from api.services.configuration.merge import merge_user_configurations
|
||||
from api.services.configuration.registry import REGISTRY, ServiceType
|
||||
from api.services.mps_service_key_client import mps_service_key_client
|
||||
|
||||
router = APIRouter(prefix="/user")
|
||||
|
||||
|
|
@ -274,3 +276,46 @@ async def reactivate_api_key(
|
|||
raise HTTPException(status_code=500, detail="Failed to reactivate API key")
|
||||
|
||||
return {"success": True, "message": "API key reactivated successfully"}
|
||||
|
||||
|
||||
# Voice Configuration Endpoints
|
||||
TTSProvider = Literal["elevenlabs", "deepgram", "sarvam", "cartesia", "dograh"]
|
||||
|
||||
|
||||
class VoiceInfo(BaseModel):
|
||||
voice_id: str
|
||||
name: str
|
||||
description: Optional[str] = None
|
||||
accent: Optional[str] = None
|
||||
gender: Optional[str] = None
|
||||
language: Optional[str] = None
|
||||
preview_url: Optional[str] = None
|
||||
|
||||
|
||||
class VoicesResponse(BaseModel):
|
||||
provider: str
|
||||
voices: List[VoiceInfo]
|
||||
|
||||
|
||||
@router.get("/configurations/voices/{provider}")
|
||||
async def get_voices(
|
||||
provider: TTSProvider,
|
||||
user: UserModel = Depends(get_user),
|
||||
) -> VoicesResponse:
|
||||
"""Get available voices for a TTS provider."""
|
||||
try:
|
||||
result = await mps_service_key_client.get_voices(
|
||||
provider=provider,
|
||||
organization_id=user.selected_organization_id,
|
||||
created_by=user.provider_id,
|
||||
)
|
||||
return VoicesResponse(
|
||||
provider=result.get("provider", provider),
|
||||
voices=[VoiceInfo(**voice) for voice in result.get("voices", [])],
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to fetch voices for {provider}: {e}")
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"Failed to fetch voices for {provider}",
|
||||
)
|
||||
|
|
|
|||
|
|
@ -11,10 +11,8 @@ from api.db.models import UserModel
|
|||
from api.schemas.user_configuration import UserConfiguration
|
||||
from api.services.auth.stack_auth import stackauth
|
||||
from api.services.configuration.registry import (
|
||||
DograhLLMModel,
|
||||
DograhSTTModel,
|
||||
DograhTTSModel,
|
||||
DograhVoice,
|
||||
ServiceProviders,
|
||||
)
|
||||
|
||||
|
|
@ -244,13 +242,13 @@ async def create_user_configuration_with_mps_key(
|
|||
"llm": {
|
||||
"provider": ServiceProviders.DOGRAH.value,
|
||||
"api_key": service_key,
|
||||
"model": DograhLLMModel.DEFAULT.value, # Default model
|
||||
"model": "default", # Default model
|
||||
},
|
||||
"tts": {
|
||||
"provider": ServiceProviders.DOGRAH.value,
|
||||
"api_key": service_key,
|
||||
"model": DograhTTSModel.DEFAULT.value, # Default model
|
||||
"voice": DograhVoice.DEFAULT.value, # Default voice
|
||||
"voice": "default", # Default voice
|
||||
},
|
||||
"stt": {
|
||||
"provider": ServiceProviders.DOGRAH.value,
|
||||
|
|
|
|||
|
|
@ -38,6 +38,7 @@ class UserConfigurationValidator:
|
|||
ServiceProviders.AZURE.value: self._check_azure_api_key,
|
||||
ServiceProviders.CARTESIA.value: self._check_cartesia_api_key,
|
||||
ServiceProviders.DOGRAH.value: self._check_dograh_api_key,
|
||||
ServiceProviders.SARVAM.value: self._check_sarvam_api_key,
|
||||
}
|
||||
|
||||
async def validate(self, configuration: UserConfiguration) -> APIKeyStatusResponse:
|
||||
|
|
@ -134,20 +135,5 @@ class UserConfigurationValidator:
|
|||
def _check_dograh_api_key(self, model: str, api_key: str) -> bool:
|
||||
return True
|
||||
|
||||
# def _check_neuphonic_api_key(self, model: str, api_key: str) -> bool:
|
||||
# if not Neuphonic:
|
||||
# self._provider_api_key_validity_status[model] = False
|
||||
# return self._provider_api_key_validity_status[model]
|
||||
|
||||
# if model in self._provider_api_key_validity_status:
|
||||
# return self._provider_api_key_validity_status[model]
|
||||
|
||||
# client = Neuphonic(api_key=api_key)
|
||||
# try:
|
||||
# response = client.voices.list() # get's all available voices
|
||||
# voices = response.data["voices"]
|
||||
# self._provider_api_key_validity_status[model] = True
|
||||
# except Exception:
|
||||
# self._provider_api_key_validity_status[model] = False
|
||||
|
||||
# return self._provider_api_key_validity_status[model]
|
||||
def _check_sarvam_api_key(self, model: str, api_key: str) -> bool:
|
||||
return True
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ class ServiceProviders(str, Enum):
|
|||
GOOGLE = "google"
|
||||
AZURE = "azure"
|
||||
DOGRAH = "dograh"
|
||||
SARVAM = "sarvam"
|
||||
|
||||
|
||||
class BaseServiceConfiguration(BaseModel):
|
||||
|
|
@ -31,6 +32,7 @@ class BaseServiceConfiguration(BaseModel):
|
|||
ServiceProviders.GOOGLE,
|
||||
ServiceProviders.AZURE,
|
||||
ServiceProviders.DOGRAH,
|
||||
# ServiceProviders.SARVAM,
|
||||
]
|
||||
api_key: str
|
||||
|
||||
|
|
@ -92,82 +94,56 @@ def register_stt(cls: Type[BaseSTTConfiguration]):
|
|||
|
||||
###################################################### LLM ########################################################################
|
||||
|
||||
|
||||
class OpenAIModel(str, Enum):
|
||||
GPT3_5_TURBO = "gpt-3.5-turbo"
|
||||
GPT4_1 = "gpt-4.1"
|
||||
GPT4_1_MINI = "gpt-4.1-mini"
|
||||
GPT4_1_NANO = "gpt-4.1-nano"
|
||||
GPT5 = "gpt-5"
|
||||
GPT5_MINI = "gpt-5-mini"
|
||||
GPT5_NANO = "gpt-5-nano"
|
||||
# Suggested models for each provider (used for UI dropdown)
|
||||
OPENAI_MODELS = ["gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "gpt-5", "gpt-5-mini", "gpt-5-nano", "gpt-3.5-turbo"]
|
||||
GOOGLE_MODELS = ["gemini-2.0-flash", "gemini-2.0-flash-lite", "gemini-2.5-flash", "gemini-2.5-flash-lite"]
|
||||
GROQ_MODELS = [
|
||||
"llama-3.3-70b-versatile",
|
||||
"deepseek-r1-distill-llama-70b",
|
||||
"qwen-qwq-32b",
|
||||
"meta-llama/llama-4-scout-17b-16e-instruct",
|
||||
"meta-llama/llama-4-maverick-17b-128e-instruct",
|
||||
"gemma2-9b-it",
|
||||
"llama-3.1-8b-instant",
|
||||
"openai/gpt-oss-120b",
|
||||
]
|
||||
AZURE_MODELS = ["gpt-4.1-mini"]
|
||||
DOGRAH_LLM_MODELS = ["default", "accurate", "fast", "lite", "zen", "zen_lite"]
|
||||
|
||||
|
||||
@register_llm
|
||||
class OpenAILLMService(BaseLLMConfiguration):
|
||||
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
|
||||
model: OpenAIModel = OpenAIModel.GPT4_1
|
||||
model: str = Field(default="gpt-4.1", json_schema_extra={"examples": OPENAI_MODELS})
|
||||
api_key: str
|
||||
|
||||
|
||||
class GoogleModel(str, Enum):
|
||||
GEMINI_2_0_FLASH = "gemini-2.0-flash"
|
||||
GEMINI_2_0_FLASH_LITE = "gemini-2.0-flash-lite"
|
||||
GEMINI_2_5_FLASH = "gemini-2.5-flash"
|
||||
GEMINI_2_5_FLASH_LITE = "gemini-2.5-flash-lite"
|
||||
|
||||
|
||||
@register_llm
|
||||
class GoogleLLMService(BaseLLMConfiguration):
|
||||
provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE
|
||||
model: GoogleModel = GoogleModel.GEMINI_2_0_FLASH
|
||||
model: str = Field(default="gemini-2.0-flash", json_schema_extra={"examples": GOOGLE_MODELS})
|
||||
api_key: str
|
||||
|
||||
|
||||
class GroqModel(str, Enum):
|
||||
LLAMA_3_3_70B = "llama-3.3-70b-versatile"
|
||||
DEEPSEEK_R1_DISTILL_LLAMA_70B = "deepseek-r1-distill-llama-70b"
|
||||
QUEN_QWQ_32B = "qwen-qwq-32b"
|
||||
LLAMA_4_SCOUT_17B_16E_INSTRUCT = "meta-llama/llama-4-scout-17b-16e-instruct"
|
||||
LLAMA_4_MAVERICK_17B_128E_INSTRUCT = "meta-llama/llama-4-maverick-17b-128e-instruct"
|
||||
GEMMA2_9B_IT = "gemma2-9b-it"
|
||||
LLAMA_3_1_8B_INSTANT = "llama-3.1-8b-instant"
|
||||
OPENAI_GPT_OSS_120B = "openai/gpt-oss-120b"
|
||||
|
||||
|
||||
@register_llm
|
||||
class GroqLLMService(BaseLLMConfiguration):
|
||||
provider: Literal[ServiceProviders.GROQ] = ServiceProviders.GROQ
|
||||
model: GroqModel = GroqModel.LLAMA_3_3_70B
|
||||
model: str = Field(default="llama-3.3-70b-versatile", json_schema_extra={"examples": GROQ_MODELS})
|
||||
api_key: str
|
||||
|
||||
|
||||
class AzureModel(str, Enum):
|
||||
GPT4_1_MINI = "gpt-4.1-mini"
|
||||
|
||||
|
||||
@register_llm
|
||||
class AzureLLMService(BaseLLMConfiguration):
|
||||
provider: Literal[ServiceProviders.AZURE] = ServiceProviders.AZURE
|
||||
model: AzureModel = AzureModel.GPT4_1_MINI
|
||||
model: str = Field(default="gpt-4.1-mini", json_schema_extra={"examples": AZURE_MODELS})
|
||||
api_key: str
|
||||
endpoint: str
|
||||
|
||||
|
||||
# Dograh LLM Service
|
||||
class DograhLLMModel(str, Enum):
|
||||
DEFAULT = "default"
|
||||
ACCURATE = "accurate"
|
||||
FAST = "fast"
|
||||
LITE = "lite"
|
||||
ZEN = "zen"
|
||||
ZEN_LITE = "zen_lite"
|
||||
|
||||
|
||||
@register_llm
|
||||
class DograhLLMService(BaseLLMConfiguration):
|
||||
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
|
||||
model: DograhLLMModel = DograhLLMModel.DEFAULT
|
||||
model: str = Field(default="default", json_schema_extra={"examples": DOGRAH_LLM_MODELS})
|
||||
api_key: str
|
||||
|
||||
|
||||
|
|
@ -185,15 +161,10 @@ LLMConfig = Annotated[
|
|||
###################################################### TTS ########################################################################
|
||||
|
||||
|
||||
class DeepgramVoice(str, Enum):
|
||||
HELENA = "aura-2-helena-en"
|
||||
THALIA = "aura-2-thalia-en"
|
||||
|
||||
|
||||
@register_tts
|
||||
class DeepgramTTSConfiguration(BaseServiceConfiguration):
|
||||
provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
|
||||
voice: DeepgramVoice = DeepgramVoice.HELENA
|
||||
voice: str = "aura-2-helena-en"
|
||||
api_key: str
|
||||
|
||||
@computed_field
|
||||
|
|
@ -210,42 +181,6 @@ class DeepgramTTSConfiguration(BaseServiceConfiguration):
|
|||
return "aura-2"
|
||||
|
||||
|
||||
class ElevenlabsVoice(str, Enum):
|
||||
ALEXANDRA = "Alexandra - 3dzJXoCYueSQiptQ6euE"
|
||||
AMY = "Amy - oGn4Ha2pe2vSJkmIJgLQ"
|
||||
ANGELA = "Angela - FUfBrNit0NNZAwb58KWH"
|
||||
ARIA = "Aria - 9BWtsMINqrJLrRacOk9x"
|
||||
CHELSEA = "Chelsea - NHRgOEwqx5WZNClv5sat"
|
||||
CHRISTINA = "Christina - X03mvPuTfprif8QBAVeJ"
|
||||
CLARA = "Clara - ZIlrSGI4jZqobxRKprJz"
|
||||
CLYDE = "Clyde - 2EiwWnXFnvU5JabPnv8n"
|
||||
DAVE = "Dave - CYw3kZ02Hs0563khs1Fj"
|
||||
DOMI = "Domi - AZnzlk1XvdvUeBnXmlld"
|
||||
DREW = "Drew - 29vD33N1CtxCmqQRPOHJ"
|
||||
ELENA = "Elena_German - iFJwt4O7E3aafIpJFfcu"
|
||||
EVE = "Eve - BZgkqPqms7Kj9ulSkVzn"
|
||||
FIN = "Fin - D38z5RcWu1voky8WS1ja"
|
||||
HOPE_BESTIE = "Hope_Bestie - uYXf8XasLslADfZ2MB4u"
|
||||
HOPE_NATURAL = "Hope_Natural - OYTbf65OHHFELVut7v2H"
|
||||
JARNATHAN = "Jarnathan - c6SfcYrb2t09NHXiT80T"
|
||||
JENNA = "Jenna - C2BkQxlGNzBn7WD2bqfR"
|
||||
JESSICA = "Jessica - cgSgspJ2msm6clMCkdW9"
|
||||
JOHANNA = "Johanna_German - YYDsZT3K2y6tv7X1aj6N"
|
||||
JUNIPER = "Juniper - aMSt68OGf4xUZAnLpTU8"
|
||||
LAUREN = "Lauren - 3liN8q8YoeB9Hk6AboKe"
|
||||
LINA = "Lina - oWjuL7HSoaEJRMDMP3HD"
|
||||
MONIKA = "Monika_Hindi_8 - 2bNrEsM0omyhLiEyOwqY"
|
||||
NEHA = "Neha_Hindi - QTKSa2Iyv0yoxvXY2V8a"
|
||||
OLIVIA = "Olivia - 1rviaVF7GGGkTU36HNpz"
|
||||
PAUL = "Paul - 5Q0t7uMcjvnagumLfvZi"
|
||||
RACHEL = "Rachel - 21m00Tcm4TlvDq8ikWAM"
|
||||
ROGER = "Roger - CwhRBWXzGAHq8TQ4Fs17"
|
||||
SAMI_REAL = "Sami_Real - O4cGUVdAocn0z4EpQ9yF"
|
||||
SARAH = "Sarah - EXAVITQu4vr4xnSDxMaL"
|
||||
SIA = "Sia_Hindi_10 - ryIIztHPLYSJ74ueXxnO"
|
||||
ZARA = "Zara - MmQVkVZnQ0dUbfWzcW6f"
|
||||
|
||||
|
||||
class ElevenlabsModel(str, Enum):
|
||||
FLASH_2 = "eleven_flash_v2_5"
|
||||
|
||||
|
|
@ -253,16 +188,12 @@ class ElevenlabsModel(str, Enum):
|
|||
@register_tts
|
||||
class ElevenlabsTTSConfiguration(BaseServiceConfiguration):
|
||||
provider: Literal[ServiceProviders.ELEVENLABS] = ServiceProviders.ELEVENLABS
|
||||
voice: ElevenlabsVoice = ElevenlabsVoice.RACHEL
|
||||
voice: str = "21m00Tcm4TlvDq8ikWAM" # Rachel voice ID
|
||||
speed: float = Field(default=1.0, ge=0.1, le=2.0, description="Speed of the voice")
|
||||
model: ElevenlabsModel = ElevenlabsModel.FLASH_2
|
||||
api_key: str
|
||||
|
||||
|
||||
class OpenAIVoice(str, Enum):
|
||||
ALLY = "alloy"
|
||||
|
||||
|
||||
class OpenAITTSModel(str, Enum):
|
||||
GPT_4o_MINI = "gpt-4o-mini-tts"
|
||||
|
||||
|
|
@ -271,29 +202,10 @@ class OpenAITTSModel(str, Enum):
|
|||
class OpenAITTSService(BaseTTSConfiguration):
|
||||
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
|
||||
model: OpenAITTSModel = OpenAITTSModel.GPT_4o_MINI
|
||||
voice: OpenAIVoice = OpenAIVoice.ALLY
|
||||
voice: str = "alloy"
|
||||
api_key: str
|
||||
|
||||
|
||||
# class NeuphonicVoice(str, Enum):
|
||||
# EMILY = "Emily - fc854436-2dac-4d21-aa69-ae17b54e98eb"
|
||||
|
||||
|
||||
# @register_tts
|
||||
# class NeuphonicTTSService(BaseTTSConfiguration):
|
||||
# provider: Literal[ServiceProviders.NEUPHONIC] = ServiceProviders.NEUPHONIC
|
||||
# voice: NeuphonicVoice = NeuphonicVoice.EMILY
|
||||
# model: str = "NA"
|
||||
# api_key: str
|
||||
|
||||
|
||||
# Dograh TTS Service
|
||||
class DograhVoice(str, Enum):
|
||||
DEFAULT = "default"
|
||||
JOEY = "joey"
|
||||
RACHEL = "rachel"
|
||||
|
||||
|
||||
class DograhTTSModel(str, Enum):
|
||||
DEFAULT = "default"
|
||||
|
||||
|
|
@ -302,16 +214,58 @@ class DograhTTSModel(str, Enum):
|
|||
class DograhTTSService(BaseTTSConfiguration):
|
||||
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
|
||||
model: DograhTTSModel = DograhTTSModel.DEFAULT
|
||||
voice: DograhVoice = DograhVoice.DEFAULT
|
||||
voice: str = "default"
|
||||
api_key: str
|
||||
|
||||
|
||||
class SarvamTTSModel(str, Enum):
|
||||
BULBUL_V2 = "bulbul:v2"
|
||||
BULBUL_V3 = "bulbul:v3"
|
||||
|
||||
|
||||
class SarvamVoice(str, Enum):
|
||||
# Female voices
|
||||
ANUSHKA = "anushka"
|
||||
MANISHA = "manisha"
|
||||
VIDYA = "vidya"
|
||||
ARYA = "arya"
|
||||
# Male voices
|
||||
ABHILASH = "abhilash"
|
||||
KARUN = "karun"
|
||||
HITESH = "hitesh"
|
||||
|
||||
|
||||
class SarvamLanguage(str, Enum):
|
||||
BENGALI = "bn-IN"
|
||||
ENGLISH_INDIA = "en-IN"
|
||||
GUJARATI = "gu-IN"
|
||||
HINDI = "hi-IN"
|
||||
KANNADA = "kn-IN"
|
||||
MALAYALAM = "ml-IN"
|
||||
MARATHI = "mr-IN"
|
||||
ODIA = "od-IN"
|
||||
PUNJABI = "pa-IN"
|
||||
TAMIL = "ta-IN"
|
||||
TELUGU = "te-IN"
|
||||
ASSAMESE = "as-IN"
|
||||
|
||||
|
||||
# @register_tts
|
||||
# class SarvamTTSConfiguration(BaseTTSConfiguration):
|
||||
# provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
|
||||
# model: SarvamTTSModel = SarvamTTSModel.BULBUL_V2
|
||||
# voice: SarvamVoice = SarvamVoice.ANUSHKA
|
||||
# language: SarvamLanguage = SarvamLanguage.HINDI
|
||||
# api_key: str
|
||||
|
||||
|
||||
TTSConfig = Annotated[
|
||||
Union[
|
||||
DeepgramTTSConfiguration,
|
||||
OpenAITTSService,
|
||||
ElevenlabsTTSConfiguration,
|
||||
DograhTTSService,
|
||||
# SarvamTTSConfiguration,
|
||||
],
|
||||
Field(discriminator="provider"),
|
||||
]
|
||||
|
|
@ -323,10 +277,45 @@ class DeepgramSTTModel(str, Enum):
|
|||
NOVA_3_GENERAL = "nova-3-general"
|
||||
|
||||
|
||||
class DeepgramLanguage(str, Enum):
|
||||
MULTI = "multi"
|
||||
ENGLISH = "en"
|
||||
ENGLISH_US = "en-US"
|
||||
ENGLISH_GB = "en-GB"
|
||||
ENGLISH_AU = "en-AU"
|
||||
ENGLISH_IN = "en-IN"
|
||||
SPANISH = "es"
|
||||
SPANISH_LATAM = "es-419"
|
||||
FRENCH = "fr"
|
||||
FRENCH_CA = "fr-CA"
|
||||
GERMAN = "de"
|
||||
ITALIAN = "it"
|
||||
PORTUGUESE = "pt"
|
||||
PORTUGUESE_BR = "pt-BR"
|
||||
DUTCH = "nl"
|
||||
HINDI = "hi"
|
||||
JAPANESE = "ja"
|
||||
KOREAN = "ko"
|
||||
CHINESE_SIMPLIFIED = "zh-CN"
|
||||
CHINESE_TRADITIONAL = "zh-TW"
|
||||
RUSSIAN = "ru"
|
||||
POLISH = "pl"
|
||||
TURKISH = "tr"
|
||||
UKRAINIAN = "uk"
|
||||
VIETNAMESE = "vi"
|
||||
SWEDISH = "sv"
|
||||
DANISH = "da"
|
||||
NORWEGIAN = "no"
|
||||
FINNISH = "fi"
|
||||
INDONESIAN = "id"
|
||||
THAI = "th"
|
||||
|
||||
|
||||
@register_stt
|
||||
class DeepgramSTTConfiguration(BaseSTTConfiguration):
|
||||
provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
|
||||
model: DeepgramSTTModel = DeepgramSTTModel.NOVA_3_GENERAL
|
||||
language: DeepgramLanguage = DeepgramLanguage.MULTI
|
||||
api_key: str
|
||||
|
||||
|
||||
|
|
@ -359,8 +348,27 @@ class DograhSTTService(BaseSTTConfiguration):
|
|||
api_key: str
|
||||
|
||||
|
||||
# Sarvam STT Service
|
||||
class SarvamSTTModel(str, Enum):
|
||||
SAARIKA_V2_5 = "saarika:v2.5"
|
||||
SAARAS_V2 = "saaras:v2" # STT-Translate model (auto-detects language)
|
||||
|
||||
|
||||
# @register_stt
|
||||
# class SarvamSTTConfiguration(BaseSTTConfiguration):
|
||||
# provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
|
||||
# model: SarvamSTTModel = SarvamSTTModel.SAARIKA_V2_5
|
||||
# language: SarvamLanguage = SarvamLanguage.HINDI
|
||||
# api_key: str
|
||||
|
||||
|
||||
STTConfig = Annotated[
|
||||
Union[DeepgramSTTConfiguration, OpenAISTTConfiguration, DograhSTTService],
|
||||
Union[
|
||||
DeepgramSTTConfiguration,
|
||||
OpenAISTTConfiguration,
|
||||
DograhSTTService,
|
||||
# SarvamSTTConfiguration,
|
||||
],
|
||||
Field(discriminator="provider"),
|
||||
]
|
||||
|
||||
|
|
|
|||
|
|
@ -285,6 +285,44 @@ class MPSServiceKeyClient:
|
|||
response=response,
|
||||
)
|
||||
|
||||
async def get_voices(
|
||||
self,
|
||||
provider: str,
|
||||
organization_id: Optional[int] = None,
|
||||
created_by: Optional[str] = None,
|
||||
) -> dict:
|
||||
"""
|
||||
Get available voices for a TTS provider from MPS.
|
||||
|
||||
Args:
|
||||
provider: TTS provider name (elevenlabs, deepgram, sarvam, cartesia)
|
||||
organization_id: Organization ID (for authenticated mode)
|
||||
created_by: User provider ID (for OSS mode)
|
||||
|
||||
Returns:
|
||||
Dictionary containing provider name and list of voices
|
||||
|
||||
Raises:
|
||||
httpx.HTTPStatusError: If MPS responds with a non-200 status code
|
||||
"""
|
||||
async with httpx.AsyncClient(timeout=self.timeout) as client:
|
||||
response = await client.get(
|
||||
f"{self.base_url}/api/v1/voice-proxy/{provider}/voices",
|
||||
headers=self._get_headers(organization_id, created_by),
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
return response.json()
|
||||
else:
|
||||
logger.error(
|
||||
f"Failed to get voices for {provider}: {response.status_code} - {response.text}"
|
||||
)
|
||||
raise httpx.HTTPStatusError(
|
||||
f"Failed to get voices: {response.text}",
|
||||
request=response.request,
|
||||
response=response,
|
||||
)
|
||||
|
||||
async def call_workflow_api(
|
||||
self,
|
||||
call_type: str,
|
||||
|
|
|
|||
|
|
@ -17,6 +17,9 @@ from pipecat.services.groq.llm import GroqLLMService
|
|||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.openai.stt import OpenAISTTService
|
||||
from pipecat.services.openai.tts import OpenAITTSService
|
||||
from pipecat.services.sarvam.stt import SarvamSTTService
|
||||
from pipecat.services.sarvam.tts import SarvamTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.text.xml_function_tag_filter import XMLFunctionTagFilter
|
||||
|
||||
if TYPE_CHECKING:
|
||||
|
|
@ -26,8 +29,13 @@ if TYPE_CHECKING:
|
|||
def create_stt_service(user_config):
|
||||
"""Create and return appropriate STT service based on user configuration"""
|
||||
if user_config.stt.provider == ServiceProviders.DEEPGRAM.value:
|
||||
# Use language from user config, defaulting to "multi" for multilingual support
|
||||
language = getattr(user_config.stt, "language", None)
|
||||
language_value = (
|
||||
language.value if hasattr(language, "value") else (language or "multi")
|
||||
)
|
||||
live_options = LiveOptions(
|
||||
language="multi", profanity_filter=False, endpointing=100
|
||||
language=language_value, profanity_filter=False, endpointing=100
|
||||
)
|
||||
return DeepgramSTTService(
|
||||
live_options=live_options,
|
||||
|
|
@ -53,6 +61,32 @@ def create_stt_service(user_config):
|
|||
model=user_config.stt.model.value,
|
||||
audio_passthrough=False, # Disable passthrough since audio is buffered separately
|
||||
)
|
||||
elif user_config.stt.provider == ServiceProviders.SARVAM.value:
|
||||
# Map Sarvam language code to pipecat Language enum
|
||||
language_mapping = {
|
||||
"bn-IN": Language.BN_IN,
|
||||
"gu-IN": Language.GU_IN,
|
||||
"hi-IN": Language.HI_IN,
|
||||
"kn-IN": Language.KN_IN,
|
||||
"ml-IN": Language.ML_IN,
|
||||
"mr-IN": Language.MR_IN,
|
||||
"ta-IN": Language.TA_IN,
|
||||
"te-IN": Language.TE_IN,
|
||||
"pa-IN": Language.PA_IN,
|
||||
"od-IN": Language.OR_IN,
|
||||
"en-IN": Language.EN_IN,
|
||||
"as-IN": Language.AS_IN,
|
||||
}
|
||||
language = getattr(user_config.stt, "language", None)
|
||||
language_value = language.value if hasattr(language, "value") else language
|
||||
pipecat_language = language_mapping.get(language_value, Language.HI_IN)
|
||||
|
||||
return SarvamSTTService(
|
||||
api_key=user_config.stt.api_key,
|
||||
model=user_config.stt.model.value,
|
||||
params=SarvamSTTService.InputParams(language=pipecat_language),
|
||||
audio_passthrough=False,
|
||||
)
|
||||
else:
|
||||
raise HTTPException(
|
||||
status_code=400, detail=f"Invalid STT provider {user_config.stt.provider}"
|
||||
|
|
@ -81,7 +115,12 @@ def create_tts_service(user_config, audio_config: "AudioConfig"):
|
|||
text_filters=[xml_function_tag_filter],
|
||||
)
|
||||
elif user_config.tts.provider == ServiceProviders.ELEVENLABS.value:
|
||||
voice_id = user_config.tts.voice.split(" - ")[1]
|
||||
# Backward compatible with older configuration "Name - voice_id"
|
||||
try:
|
||||
voice_id = user_config.tts.voice.split(" - ")[1]
|
||||
except IndexError:
|
||||
voice_id = user_config.tts.voice
|
||||
|
||||
return ElevenLabsTTSService(
|
||||
reconnect_on_error=False,
|
||||
api_key=user_config.tts.api_key,
|
||||
|
|
@ -103,6 +142,35 @@ def create_tts_service(user_config, audio_config: "AudioConfig"):
|
|||
voice=user_config.tts.voice.value,
|
||||
text_filters=[xml_function_tag_filter],
|
||||
)
|
||||
elif user_config.tts.provider == ServiceProviders.SARVAM.value:
|
||||
# Map Sarvam language code to pipecat Language enum for TTS
|
||||
language_mapping = {
|
||||
"bn-IN": Language.BN,
|
||||
"en-IN": Language.EN,
|
||||
"gu-IN": Language.GU,
|
||||
"hi-IN": Language.HI,
|
||||
"kn-IN": Language.KN,
|
||||
"ml-IN": Language.ML,
|
||||
"mr-IN": Language.MR,
|
||||
"od-IN": Language.OR,
|
||||
"pa-IN": Language.PA,
|
||||
"ta-IN": Language.TA,
|
||||
"te-IN": Language.TE,
|
||||
}
|
||||
language = getattr(user_config.tts, "language", None)
|
||||
language_value = language.value if hasattr(language, "value") else language
|
||||
pipecat_language = language_mapping.get(language_value, Language.HI)
|
||||
|
||||
voice = getattr(user_config.tts, "voice", None)
|
||||
voice_value = voice.value if hasattr(voice, "value") else (voice or "anushka")
|
||||
|
||||
return SarvamTTSService(
|
||||
api_key=user_config.tts.api_key,
|
||||
model=user_config.tts.model.value,
|
||||
voice_id=voice_value,
|
||||
params=SarvamTTSService.InputParams(language=pipecat_language),
|
||||
text_filters=[xml_function_tag_filter],
|
||||
)
|
||||
else:
|
||||
raise HTTPException(
|
||||
status_code=400, detail=f"Invalid TTS provider {user_config.tts.provider}"
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@ from api.schemas.user_configuration import UserConfiguration
|
|||
from api.services.configuration.masking import is_mask_of, mask_key, mask_user_config
|
||||
from api.services.configuration.merge import merge_user_configurations
|
||||
from api.services.configuration.registry import (
|
||||
GroqModel,
|
||||
OpenAILLMService,
|
||||
)
|
||||
|
||||
|
|
@ -70,7 +69,7 @@ def test_merge_drops_old_key_when_provider_changes():
|
|||
incoming_partial = {
|
||||
"llm": {
|
||||
"provider": "groq",
|
||||
"model": GroqModel.LLAMA_3_3_70B,
|
||||
"model": "llama-3.3-70b-versatile",
|
||||
# api_key intentionally absent – should NOT inherit old key
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,18 +1,17 @@
|
|||
---
|
||||
title: "Inference Provider"
|
||||
description: "Dograh ships with its own inferencing engine, which is hosted at https://services.dograh.com. The inference service provides LLM, TTS and STT services. In this document you can see how you can configure the inferencing engine to your favourite provider, like OpenAI, Gemini etc."
|
||||
title: "Model Configurations"
|
||||
description: "Voice Agents need AI Models to work, like LLM (Large Language Model), TTS (Voice) and STT (Transcriber). You can use any of your favourite providers with Dograh Platform to run your Voice Agent."
|
||||
---
|
||||
|
||||
## Configure Inference Provider
|
||||
## Configure Models
|
||||
Dograh Platform ships with its own models by default. When you sign up on https://app.dograh.com or set up the platform on your self-hosted infrastructure, you get some Dograh model credits by default.
|
||||
|
||||
You can go to `https://app.dograh.com/model-configurations` if you are on hosted version of Dograh or go to `http://localhost:3010/model-configurations` if you are running Dograh locally.
|
||||
If you wish to change the models to a provider of your own choice, you can go to `https://app.dograh.com/model-configurations` if you are on the hosted version of Dograh, or to `http://localhost:3010/model-configurations` if you are running Dograh locally.
|
||||
|
||||
You can see the configuration for the inference provider in the following screenshot.
|
||||
|
||||

|
||||
|
||||
You can select the provider from the dropdown and configure the API key, model, etc. You can see [API Keys](api-keys) documentation for instructions on how to create Service Keys to be used in Model Configuration.
|
||||
You can select the provider from the dropdown and configure the API key, model, etc. For Dograh, see the [Service Keys](api-keys) documentation for instructions on how to create Service Keys to be used in Model Configuration.
|
||||
|
||||
## Next Steps
|
||||
|
||||
You can see how to configure the telephony provider in [Telephony Integrations](/telephony/twilio).
|
||||
The following pages document in depth the various AI models that you can configure.
|
||||
12
docs/configurations/llm.mdx
Normal file
12
docs/configurations/llm.mdx
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
---
|
||||
title: "LLM"
|
||||
description: "Voice Agents use LLM (Large Language Models), which are trained to understand the conversational context, and respond to users."
|
||||
---
|
||||
|
||||
You can currently use OpenAI, Google, Groq, Azure and Dograh LLMs in the LLM configuration. Some models are provided by default for you to choose from the dropdown.
|
||||
|
||||

|
||||
|
||||
If you don't find a model in the drop down, you can always add a model manually.
|
||||
|
||||

|
||||
8
docs/configurations/transcriber.mdx
Normal file
8
docs/configurations/transcriber.mdx
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
---
|
||||
title: "Transcriber"
|
||||
description: "Voice Agents use STT (Speech to Text) to transcribe what the user speaks. The transcribed text is passed to an LLM to generate the response that gets played out to the user."
|
||||
---
|
||||
|
||||
Dograh platform ships with Deepgram, Cartesia, OpenAI and Dograh transcribers by default. Refer to the provider's documentation to decide which language to select for your language requirements.
|
||||
|
||||
Example: Deepgram has their language support documentation at https://developers.deepgram.com/docs/models-languages-overview#nova-3
|
||||
10
docs/configurations/voice.mdx
Normal file
10
docs/configurations/voice.mdx
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
---
|
||||
title: "Voice"
|
||||
description: "Voice Agents use TTS (Text to Speech), which converts the text that LLMs generate during the course of a conversation into audio. This is the audio that the end user having the conversation listens to."
|
||||
---
|
||||
|
||||
Dograh platform ships with Elevenlabs, Deepgram, OpenAI and Dograh TTS engines by default. There are some voices from the providers that we ship by default. You can refer to the provider's API documentation to select a voice ID that's most relevant for your language requirement.
|
||||
|
||||
If you don't find your favourite voice, you can always add the voice ID manually.
|
||||
|
||||

|
||||
|
|
@ -30,6 +30,9 @@
|
|||
"group": "Configurations",
|
||||
"pages": [
|
||||
"configurations/inference-providers",
|
||||
"configurations/llm",
|
||||
"configurations/voice",
|
||||
"configurations/transcriber",
|
||||
"configurations/api-keys"
|
||||
]
|
||||
},
|
||||
|
|
|
|||
BIN
docs/images/add_model_manually.png
Normal file
BIN
docs/images/add_model_manually.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 105 KiB |
BIN
docs/images/add_tts_manually.png
Normal file
BIN
docs/images/add_tts_manually.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 210 KiB |
BIN
docs/images/models_dropdown.png
Normal file
BIN
docs/images/models_dropdown.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 110 KiB |
|
|
@ -460,7 +460,7 @@ export default function APIKeysPage() {
|
|||
{showServiceArchived ? 'Hide' : 'Show'} Archived
|
||||
</Button>
|
||||
)}
|
||||
{canCreateServiceKey && (
|
||||
{canCreateServiceKey ? (
|
||||
<Button
|
||||
onClick={() => setIsCreateServiceDialogOpen(true)}
|
||||
size="sm"
|
||||
|
|
@ -468,6 +468,10 @@ export default function APIKeysPage() {
|
|||
<Plus className="w-4 h-4 mr-2" />
|
||||
Create Service Key
|
||||
</Button>
|
||||
) : (
|
||||
<span className="text">
|
||||
To generate additional service keys, <a href="https://app.dograh.com" target="_blank" rel="noopener noreferrer" className="text-primary hover:underline">Sign up on app.dograh.com</a>
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -650,6 +650,21 @@ export type VobizConfigurationResponse = {
|
|||
from_numbers: Array<string>;
|
||||
};
|
||||
|
||||
/**
 * One voice entry in a `VoicesResponse`.
 *
 * Only `voice_id` and `name` are always present; every other field may be
 * omitted or `null`.
 */
export type VoiceInfo = {
    /** Identifier of the voice; this is the value stored as the selected voice. */
    voice_id: string;
    /** Human-readable voice name. */
    name: string;
    description?: null | string;
    accent?: null | string;
    gender?: null | string;
    language?: null | string;
    /** URL of an audio sample for the voice, when one is available. */
    preview_url?: null | string;
};
|
||||
|
||||
/**
 * Payload of the voices endpoint: the provider the list belongs to and its voices.
 */
export type VoicesResponse = {
    provider: string;
    voices: VoiceInfo[];
};
|
||||
|
||||
/**
|
||||
* Request schema for Vonage configuration.
|
||||
*/
|
||||
|
|
@ -1828,6 +1843,40 @@ export type ReactivateApiKeyApiV1UserApiKeysApiKeyIdReactivatePutResponses = {
|
|||
|
||||
export type ReactivateApiKeyApiV1UserApiKeysApiKeyIdReactivatePutResponse = ReactivateApiKeyApiV1UserApiKeysApiKeyIdReactivatePutResponses[keyof ReactivateApiKeyApiV1UserApiKeysApiKeyIdReactivatePutResponses];
|
||||
|
||||
/**
 * Request options for the voices endpoint
 * (`/api/v1/user/configurations/voices/{provider}`).
 *
 * The call takes no body and no query string; the provider is a path parameter
 * restricted to the listed literals.
 */
export type GetVoicesApiV1UserConfigurationsVoicesProviderGetData = {
    body?: never;
    headers?: {
        authorization?: null | string;
    };
    path: {
        provider: 'elevenlabs' | 'deepgram' | 'sarvam' | 'cartesia' | 'dograh';
    };
    query?: never;
    url: '/api/v1/user/configurations/voices/{provider}';
};
|
||||
|
||||
/**
 * Error payloads of the voices endpoint, keyed by HTTP status code.
 */
export type GetVoicesApiV1UserConfigurationsVoicesProviderGetErrors = {
    /** Not found */
    404: unknown;
    /** Validation Error */
    422: HttpValidationError;
};
|
||||
|
||||
/** Union of every error payload the voices endpoint declares (one member per status code). */
export type GetVoicesApiV1UserConfigurationsVoicesProviderGetError = GetVoicesApiV1UserConfigurationsVoicesProviderGetErrors[keyof GetVoicesApiV1UserConfigurationsVoicesProviderGetErrors];
|
||||
|
||||
/**
 * Success payloads of the voices endpoint, keyed by HTTP status code.
 */
export type GetVoicesApiV1UserConfigurationsVoicesProviderGetResponses = {
    /** Successful Response */
    200: VoicesResponse;
};
|
||||
|
||||
/** Union of every success payload the voices endpoint declares (one member per status code). */
export type GetVoicesApiV1UserConfigurationsVoicesProviderGetResponse = GetVoicesApiV1UserConfigurationsVoicesProviderGetResponses[keyof GetVoicesApiV1UserConfigurationsVoicesProviderGetResponses];
|
||||
|
||||
export type CreateCampaignApiV1CampaignCreatePostData = {
|
||||
body: CreateCampaignRequest;
|
||||
headers?: {
|
||||
|
|
|
|||
|
|
@ -6,10 +6,12 @@ import { useForm } from "react-hook-form";
|
|||
import { getDefaultConfigurationsApiV1UserConfigurationsDefaultsGet } from '@/client/sdk.gen';
|
||||
import { Button } from "@/components/ui/button";
|
||||
import { Card, CardContent } from "@/components/ui/card";
|
||||
import { Checkbox } from "@/components/ui/checkbox";
|
||||
import { Input } from "@/components/ui/input";
|
||||
import { Label } from "@/components/ui/label";
|
||||
import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select";
|
||||
import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
|
||||
import { VoiceSelector } from "@/components/VoiceSelector";
|
||||
import { useUserConfig } from "@/context/UserConfigContext";
|
||||
|
||||
type ServiceSegment = "llm" | "tts" | "stt";
|
||||
|
|
@ -18,6 +20,7 @@ interface SchemaProperty {
|
|||
type?: string;
|
||||
default?: string | number | boolean;
|
||||
enum?: string[];
|
||||
examples?: string[];
|
||||
$ref?: string;
|
||||
description?: string;
|
||||
format?: string;
|
||||
|
|
@ -40,6 +43,65 @@ const TAB_CONFIG: { key: ServiceSegment; label: string }[] = [
|
|||
{ key: "stt", label: "Transcriber" },
|
||||
];
|
||||
|
||||
// Human-readable labels for language codes (Deepgram + Sarvam).
// Codes missing from this table are shown as-is by the caller.
const LANGUAGE_DISPLAY_NAMES: Record<string, string> = {
    // Deepgram languages
    multi: "Multilingual (Auto-detect)",
    en: "English",
    "en-US": "English (US)",
    "en-GB": "English (UK)",
    "en-AU": "English (Australia)",
    "en-IN": "English (India)",
    es: "Spanish",
    "es-419": "Spanish (Latin America)",
    fr: "French",
    "fr-CA": "French (Canada)",
    de: "German",
    it: "Italian",
    pt: "Portuguese",
    "pt-BR": "Portuguese (Brazil)",
    nl: "Dutch",
    hi: "Hindi",
    ja: "Japanese",
    ko: "Korean",
    "zh-CN": "Chinese (Simplified)",
    "zh-TW": "Chinese (Traditional)",
    ru: "Russian",
    pl: "Polish",
    tr: "Turkish",
    uk: "Ukrainian",
    vi: "Vietnamese",
    sv: "Swedish",
    da: "Danish",
    no: "Norwegian",
    fi: "Finnish",
    id: "Indonesian",
    th: "Thai",
    // Sarvam Indian languages
    "bn-IN": "Bengali",
    "gu-IN": "Gujarati",
    "hi-IN": "Hindi",
    "kn-IN": "Kannada",
    "ml-IN": "Malayalam",
    "mr-IN": "Marathi",
    "od-IN": "Odia",
    "pa-IN": "Punjabi",
    "ta-IN": "Tamil",
    "te-IN": "Telugu",
    "as-IN": "Assamese",
};
|
||||
|
||||
// Human-readable labels for Sarvam voice IDs.
// IDs missing from this table are shown as-is by the caller.
const VOICE_DISPLAY_NAMES: Record<string, string> = {
    anushka: "Anushka (Female)",
    manisha: "Manisha (Female)",
    vidya: "Vidya (Female)",
    arya: "Arya (Female)",
    abhilash: "Abhilash (Male)",
    karun: "Karun (Male)",
    hitesh: "Hitesh (Male)",
};
|
||||
|
||||
export default function ServiceConfiguration() {
|
||||
const [apiError, setApiError] = useState<string | null>(null);
|
||||
const [isSaving, setIsSaving] = useState(false);
|
||||
|
|
@ -54,6 +116,8 @@ export default function ServiceConfiguration() {
|
|||
tts: "",
|
||||
stt: ""
|
||||
});
|
||||
const [isManualModelInput, setIsManualModelInput] = useState(false);
|
||||
const [hasCheckedManualMode, setHasCheckedManualMode] = useState(false);
|
||||
|
||||
const {
|
||||
register,
|
||||
|
|
@ -119,6 +183,29 @@ export default function ServiceConfiguration() {
|
|||
fetchConfigurations();
|
||||
}, [reset, userConfig]);
|
||||
|
||||
// Check if the saved LLM model is not in the suggested options (custom model)
|
||||
useEffect(() => {
|
||||
if (hasCheckedManualMode) return;
|
||||
|
||||
const currentProvider = serviceProviders.llm;
|
||||
const providerSchema = schemas?.llm?.[currentProvider];
|
||||
if (!providerSchema) return;
|
||||
|
||||
const modelSchema = providerSchema.properties.model;
|
||||
const actualModelSchema = modelSchema?.$ref && providerSchema.$defs
|
||||
? providerSchema.$defs[modelSchema.$ref.split('/').pop() || '']
|
||||
: modelSchema;
|
||||
|
||||
if (actualModelSchema?.examples && userConfig?.llm?.model) {
|
||||
const savedModel = userConfig.llm.model as string;
|
||||
const isInOptions = actualModelSchema.examples.includes(savedModel);
|
||||
if (!isInOptions) {
|
||||
setIsManualModelInput(true);
|
||||
}
|
||||
setHasCheckedManualMode(true);
|
||||
}
|
||||
}, [schemas, serviceProviders.llm, userConfig?.llm?.model, hasCheckedManualMode]);
|
||||
|
||||
const handleProviderChange = (service: ServiceSegment, providerName: string) => {
|
||||
if (!providerName) {
|
||||
return;
|
||||
|
|
@ -147,6 +234,11 @@ export default function ServiceConfiguration() {
|
|||
preservedValues[`${service}_provider`] = providerName;
|
||||
reset(preservedValues);
|
||||
setServiceProviders(prev => ({ ...prev, [service]: providerName }));
|
||||
|
||||
// Reset manual model input when LLM provider changes
|
||||
if (service === "llm") {
|
||||
setIsManualModelInput(false);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -266,7 +358,7 @@ export default function ServiceConfiguration() {
|
|||
<div className="space-y-2">
|
||||
<Label>API Key</Label>
|
||||
<Input
|
||||
type="password"
|
||||
type="text"
|
||||
placeholder="Enter API key"
|
||||
{...register(`${service}_api_key`, {
|
||||
required: providerSchema.required?.includes("api_key"),
|
||||
|
|
@ -291,7 +383,113 @@ export default function ServiceConfiguration() {
|
|||
? providerSchema.$defs[schema.$ref.split('/').pop() || '']
|
||||
: schema;
|
||||
|
||||
// Use VoiceSelector for voice field in TTS service (except Sarvam which uses enum)
|
||||
if (service === "tts" && field === "voice") {
|
||||
const currentProvider = serviceProviders.tts;
|
||||
// Sarvam uses enum-based voice selection, not VoiceSelector
|
||||
if (currentProvider !== "sarvam" && !actualSchema?.enum) {
|
||||
return (
|
||||
<VoiceSelector
|
||||
provider={currentProvider}
|
||||
value={watch(`${service}_${field}`) as string || ""}
|
||||
onChange={(voiceId) => {
|
||||
setValue(`${service}_${field}`, voiceId, { shouldDirty: true });
|
||||
}}
|
||||
/>
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle LLM model field with manual input toggle (uses examples from schema)
|
||||
if (service === "llm" && field === "model" && actualSchema?.examples) {
|
||||
const currentValue = watch(`${service}_${field}`) as string || "";
|
||||
const modelOptions = actualSchema.examples;
|
||||
|
||||
if (isManualModelInput) {
|
||||
return (
|
||||
<div className="space-y-2">
|
||||
<Input
|
||||
type="text"
|
||||
placeholder="Enter model name"
|
||||
value={currentValue}
|
||||
onChange={(e) => {
|
||||
setValue(`${service}_${field}`, e.target.value, { shouldDirty: true });
|
||||
}}
|
||||
/>
|
||||
<div className="flex items-center space-x-2">
|
||||
<Checkbox
|
||||
id="manual-model-input"
|
||||
checked={isManualModelInput}
|
||||
onCheckedChange={(checked) => {
|
||||
setIsManualModelInput(checked as boolean);
|
||||
if (!checked && modelOptions.length > 0) {
|
||||
// Reset to first option when switching back
|
||||
setValue(`${service}_${field}`, modelOptions[0], { shouldDirty: true });
|
||||
}
|
||||
}}
|
||||
/>
|
||||
<Label
|
||||
htmlFor="manual-model-input"
|
||||
className="text-sm font-normal cursor-pointer"
|
||||
>
|
||||
Add Model Manually
|
||||
</Label>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="space-y-2">
|
||||
<Select
|
||||
value={currentValue}
|
||||
onValueChange={(value) => {
|
||||
if (!value) return;
|
||||
setValue(`${service}_${field}`, value, { shouldDirty: true });
|
||||
}}
|
||||
>
|
||||
<SelectTrigger className="w-full">
|
||||
<SelectValue placeholder="Select model" />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
{modelOptions.map((value: string) => (
|
||||
<SelectItem key={value} value={value}>
|
||||
{value}
|
||||
</SelectItem>
|
||||
))}
|
||||
</SelectContent>
|
||||
</Select>
|
||||
<div className="flex items-center space-x-2">
|
||||
<Checkbox
|
||||
id="manual-model-input-dropdown"
|
||||
checked={isManualModelInput}
|
||||
onCheckedChange={(checked) => {
|
||||
setIsManualModelInput(checked as boolean);
|
||||
}}
|
||||
/>
|
||||
<Label
|
||||
htmlFor="manual-model-input-dropdown"
|
||||
className="text-sm font-normal cursor-pointer"
|
||||
>
|
||||
Add Model Manually
|
||||
</Label>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
if (actualSchema?.enum) {
|
||||
// Use friendly display names for language and voice fields
|
||||
const getDisplayName = (value: string) => {
|
||||
if (field === "language") {
|
||||
return LANGUAGE_DISPLAY_NAMES[value] || value;
|
||||
}
|
||||
if (field === "voice") {
|
||||
return VOICE_DISPLAY_NAMES[value] || value;
|
||||
}
|
||||
return value;
|
||||
};
|
||||
|
||||
return (
|
||||
<Select
|
||||
value={watch(`${service}_${field}`) as string || ""}
|
||||
|
|
@ -308,7 +506,7 @@ export default function ServiceConfiguration() {
|
|||
<SelectContent>
|
||||
{actualSchema.enum.map((value: string) => (
|
||||
<SelectItem key={value} value={value}>
|
||||
{value}
|
||||
{getDisplayName(value)}
|
||||
</SelectItem>
|
||||
))}
|
||||
</SelectContent>
|
||||
|
|
|
|||
384
ui/src/components/VoiceSelector.tsx
Normal file
384
ui/src/components/VoiceSelector.tsx
Normal file
|
|
@ -0,0 +1,384 @@
|
|||
"use client";
|
||||
|
||||
import { ChevronDown, Loader2, Search, Volume2 } from "lucide-react";
|
||||
import { useCallback, useEffect, useState } from "react";
|
||||
|
||||
import { getVoicesApiV1UserConfigurationsVoicesProviderGet } from "@/client/sdk.gen";
|
||||
import { VoiceInfo } from "@/client/types.gen";
|
||||
import { Button } from "@/components/ui/button";
|
||||
import { Checkbox } from "@/components/ui/checkbox";
|
||||
import { Input } from "@/components/ui/input";
|
||||
import { Label } from "@/components/ui/label";
|
||||
import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
|
||||
import { useUserConfig } from "@/context/UserConfigContext";
|
||||
import { cn } from "@/lib/utils";
|
||||
|
||||
// TTS providers that have an MPS voice endpoint; for these the voice list
// can be fetched instead of typing a voice ID by hand.
type TTSProviderWithVoices =
    | "elevenlabs"
    | "deepgram"
    | "sarvam"
    | "cartesia"
    | "dograh";

const MPS_VOICE_PROVIDERS: Array<TTSProviderWithVoices> = [
    "elevenlabs",
    "deepgram",
    "sarvam",
    "cartesia",
    "dograh",
];
|
||||
|
||||
/** Props accepted by `VoiceSelector`. */
interface VoiceSelectorProps {
    /** Name of the TTS provider whose voices should be offered (matched case-insensitively). */
    provider: string;
    /** Currently selected voice ID; an empty string means nothing is selected. */
    value: string;
    /** Called with the new voice ID whenever the user picks or types one. */
    onChange: (voiceId: string) => void;
    /** Extra class names merged onto the wrapper element. */
    className?: string;
}
|
||||
|
||||
/**
 * Voice picker for the TTS `voice` setting.
 *
 * Rendering modes:
 * - provider not listed in `MPS_VOICE_PROVIDERS`: a plain text input for the voice ID;
 * - manual mode: a text input plus a checkbox to go back to the list;
 * - otherwise: a searchable popover listing the fetched voices, each with an
 *   optional audio preview button.
 *
 * The voice list is re-fetched whenever the provider or the access token
 * changes. If the current `value` is not part of the list fetched for the
 * active provider, the component switches to manual mode so the custom ID
 * stays visible and editable.
 */
export const VoiceSelector: React.FC<VoiceSelectorProps> = ({
    provider,
    value,
    onChange,
    className,
}) => {
    const { accessToken } = useUserConfig();
    const [isOpen, setIsOpen] = useState(false);
    const [searchTerm, setSearchTerm] = useState("");
    const [isManualInput, setIsManualInput] = useState(false);
    const [manualVoiceId, setManualVoiceId] = useState(value || "");
    const [voices, setVoices] = useState<VoiceInfo[]>([]);
    // Provider the current `voices` list was fetched for. Lets the effects below
    // ignore a list that still belongs to a previously selected provider.
    const [voicesProvider, setVoicesProvider] = useState<TTSProviderWithVoices | null>(null);
    const [isLoading, setIsLoading] = useState(false);
    const [error, setError] = useState<string | null>(null);
    // The preview being played, kept as ONE piece of state so that a callback
    // from an older Audio element can never clear the state of a newer one.
    const [preview, setPreview] = useState<{ voiceId: string; audio: HTMLAudioElement } | null>(null);

    // Normalise the provider name; null means the provider has no voice endpoint.
    const getProviderKey = useCallback((providerName: string): TTSProviderWithVoices | null => {
        const normalized = (providerName || "").toLowerCase() as TTSProviderWithVoices;
        return MPS_VOICE_PROVIDERS.includes(normalized) ? normalized : null;
    }, []);
    const providerKey = getProviderKey(provider);

    // Fetch the voice list for the active provider.
    useEffect(() => {
        // Drop the previous provider's list right away: comparing the new
        // provider's value against the old list would wrongly look like a
        // custom (manual) voice ID.
        setVoices([]);
        setVoicesProvider(null);
        setError(null);

        if (!providerKey || !accessToken) {
            setIsLoading(false);
            return;
        }

        // Set by the cleanup below so a response that arrives after the
        // provider/token changed (or after unmount) is discarded.
        let cancelled = false;
        setIsLoading(true);

        const loadVoices = async () => {
            try {
                const response = await getVoicesApiV1UserConfigurationsVoicesProviderGet({
                    path: { provider: providerKey },
                    headers: {
                        Authorization: `Bearer ${accessToken}`,
                    },
                });
                if (cancelled) return;

                const fetchedVoices = response.data?.voices;
                if (!fetchedVoices) {
                    // A response without a voice list is a failure, not "no change".
                    throw new Error("Voices response did not contain a voice list");
                }
                setVoices(fetchedVoices);
                setVoicesProvider(providerKey);
            } catch (err) {
                if (cancelled) return;
                console.error("Failed to fetch voices:", err);
                setError("Failed to load voices");
            } finally {
                // A superseded request must not clear the newer request's spinner.
                if (!cancelled) {
                    setIsLoading(false);
                }
            }
        };
        loadVoices();

        return () => {
            cancelled = true;
        };
    }, [providerKey, accessToken]);

    // Manual mode belongs to the provider it was entered for; start every
    // provider in list mode. The effect below re-enables it when needed.
    useEffect(() => {
        setIsManualInput(false);
    }, [providerKey]);

    // Switch to manual mode when the current value is not in the list that was
    // fetched for the active provider.
    useEffect(() => {
        if (!value || voices.length === 0 || voicesProvider !== providerKey) {
            return;
        }
        if (!voices.some((v) => v.voice_id === value)) {
            setIsManualInput(true);
            setManualVoiceId(value);
        }
    }, [value, voices, voicesProvider, providerKey]);

    // Stop and reset the preview when the popover closes.
    useEffect(() => {
        if (!isOpen && preview) {
            preview.audio.pause();
            preview.audio.currentTime = 0;
            setPreview(null);
        }
    }, [isOpen, preview]);

    // Pause the audio whenever it is replaced and on unmount.
    useEffect(() => {
        const audio = preview?.audio;
        return () => {
            audio?.pause();
        };
    }, [preview]);

    const searchLower = searchTerm.toLowerCase();
    const filteredVoices = voices.filter((voice) =>
        [
            voice.name,
            voice.voice_id,
            voice.description,
            voice.accent,
            voice.gender,
            voice.language,
        ].some((field) => (field || "").toLowerCase().includes(searchLower))
    );

    const handleSelectVoice = (voiceId: string) => {
        onChange(voiceId);
        setIsOpen(false);
        setSearchTerm("");
    };

    const handleManualInputToggle = (checked: boolean) => {
        setIsManualInput(checked);
        if (checked) {
            setManualVoiceId(value || "");
        } else {
            // When switching back to the dropdown, keep the current value if it
            // is in the list; otherwise fall back to the first voice.
            const existingVoice = voices.find((v) => v.voice_id === value);
            if (!existingVoice && voices.length > 0) {
                onChange(voices[0].voice_id);
            }
        }
    };

    const handleManualVoiceIdChange = (newValue: string) => {
        setManualVoiceId(newValue);
        onChange(newValue);
    };

    const getSelectedVoiceName = () => {
        if (isManualInput && value) {
            return value;
        }
        const voice = voices.find((v) => v.voice_id === value);
        return voice?.name || value || "Select a voice";
    };

    const playPreview = (previewUrl: string, voiceId: string) => {
        const wasPlayingThisVoice = preview?.voiceId === voiceId;

        // Stop whatever is currently playing.
        if (preview) {
            preview.audio.pause();
            preview.audio.currentTime = 0;
        }

        // Clicking the voice that is already playing just stops it.
        if (wasPlayingThisVoice) {
            setPreview(null);
            return;
        }

        const audio = new Audio(previewUrl);
        // Clear the state only if this element is still the active preview.
        // Pausing an element whose play() promise is still pending rejects that
        // promise; without this check the rejection would wipe the state of the
        // preview that replaced it.
        const release = () => {
            setPreview((current) => (current?.audio === audio ? null : current));
        };
        audio.onended = release;
        audio.onerror = release;
        setPreview({ voiceId, audio });
        audio.play().catch(release);
    };

    // For providers without MPS voice endpoint, show simple input
    if (!providerKey) {
        return (
            <div className={cn("space-y-2", className)}>
                <Input
                    type="text"
                    placeholder="Enter voice ID"
                    value={value || ""}
                    onChange={(e) => onChange(e.target.value)}
                />
            </div>
        );
    }

    if (isManualInput) {
        return (
            <div className={cn("space-y-2", className)}>
                <Input
                    type="text"
                    placeholder="Enter voice ID"
                    value={manualVoiceId}
                    onChange={(e) => handleManualVoiceIdChange(e.target.value)}
                />
                <div className="flex items-center space-x-2">
                    <Checkbox
                        id="manual-voice-input"
                        checked={isManualInput}
                        // The checkbox can also report "indeterminate"; only a
                        // literal true counts as checked.
                        onCheckedChange={(checked) => handleManualInputToggle(checked === true)}
                    />
                    <Label
                        htmlFor="manual-voice-input"
                        className="text-sm font-normal cursor-pointer"
                    >
                        Add Voice ID Manually
                    </Label>
                </div>
            </div>
        );
    }

    return (
        <div className={cn("space-y-2", className)}>
            <Popover open={isOpen} onOpenChange={setIsOpen}>
                <PopoverTrigger asChild>
                    <Button
                        variant="outline"
                        role="combobox"
                        aria-expanded={isOpen}
                        className={cn(
                            "w-full justify-between",
                            !value && "text-muted-foreground"
                        )}
                        disabled={isLoading}
                    >
                        <span className="truncate">
                            {isLoading ? "Loading voices..." : getSelectedVoiceName()}
                        </span>
                        {isLoading ? (
                            <Loader2 className="ml-2 h-4 w-4 shrink-0 animate-spin" />
                        ) : (
                            <ChevronDown className="ml-2 h-4 w-4 shrink-0 opacity-50" />
                        )}
                    </Button>
                </PopoverTrigger>
                <PopoverContent className="w-[400px] p-0" align="start">
                    <div className="p-2 space-y-2">
                        <div className="relative">
                            <Search className="absolute left-2 top-2.5 h-4 w-4 text-muted-foreground" />
                            <Input
                                placeholder="Search voices..."
                                value={searchTerm}
                                onChange={(e) => setSearchTerm(e.target.value)}
                                className="pl-8"
                            />
                        </div>

                        <div className="max-h-[300px] overflow-auto space-y-1">
                            {error ? (
                                <p className="text-sm text-red-500 text-center py-4">
                                    {error}
                                </p>
                            ) : isLoading ? (
                                <div className="flex items-center justify-center py-4">
                                    <Loader2 className="h-6 w-6 animate-spin text-muted-foreground" />
                                </div>
                            ) : filteredVoices.length === 0 ? (
                                <p className="text-sm text-muted-foreground text-center py-4">
                                    No voices found
                                </p>
                            ) : (
                                filteredVoices.map((voice) => (
                                    <div
                                        key={voice.voice_id}
                                        className={cn(
                                            "flex items-start space-x-3 p-2 hover:bg-accent rounded-sm cursor-pointer",
                                            value === voice.voice_id && "bg-accent"
                                        )}
                                        onClick={() => handleSelectVoice(voice.voice_id)}
                                    >
                                        <div className="flex-1 min-w-0">
                                            <div className="flex items-center gap-2">
                                                <p className="text-sm font-medium truncate">
                                                    {voice.name}
                                                </p>
                                                {voice.gender && (
                                                    <span className="text-xs text-muted-foreground capitalize">
                                                        {voice.gender}
                                                    </span>
                                                )}
                                            </div>
                                            {voice.description && (
                                                <p className="text-xs text-muted-foreground line-clamp-2">
                                                    {voice.description}
                                                </p>
                                            )}
                                            <div className="flex items-center gap-2 mt-1">
                                                {voice.accent && (
                                                    <span className="text-xs bg-secondary px-1.5 py-0.5 rounded capitalize">
                                                        {voice.accent}
                                                    </span>
                                                )}
                                                {voice.language && (
                                                    <span className="text-xs bg-secondary px-1.5 py-0.5 rounded uppercase">
                                                        {voice.language}
                                                    </span>
                                                )}
                                            </div>
                                        </div>
                                        {voice.preview_url && (
                                            <Button
                                                // Explicit type so this can never act as a
                                                // submit button if rendered inside a form.
                                                type="button"
                                                variant="ghost"
                                                size="sm"
                                                className="h-8 w-8 p-0 shrink-0"
                                                aria-label={`Preview ${voice.name}`}
                                                onClick={(e) => {
                                                    // Do not let the click select the row.
                                                    e.stopPropagation();
                                                    playPreview(voice.preview_url!, voice.voice_id);
                                                }}
                                            >
                                                <Volume2
                                                    className={cn(
                                                        "h-4 w-4",
                                                        preview?.voiceId === voice.voice_id &&
                                                            "text-primary animate-pulse"
                                                    )}
                                                />
                                            </Button>
                                        )}
                                    </div>
                                ))
                            )}
                        </div>

                        <div className="pt-2 border-t flex items-center justify-between">
                            <div className="flex items-center space-x-2">
                                <Checkbox
                                    id="manual-voice-input-popup"
                                    checked={isManualInput}
                                    onCheckedChange={(checked) => {
                                        const enableManual = checked === true;
                                        handleManualInputToggle(enableManual);
                                        if (enableManual) {
                                            setIsOpen(false);
                                        }
                                    }}
                                />
                                <Label
                                    htmlFor="manual-voice-input-popup"
                                    className="text-sm font-normal cursor-pointer"
                                >
                                    Add Voice ID Manually
                                </Label>
                            </div>
                            <p className="text-xs text-muted-foreground">
                                {voices.length} voices available
                            </p>
                        </div>
                    </div>
                </PopoverContent>
            </Popover>
        </div>
    );
};
|
||||
Loading…
Add table
Add a link
Reference in a new issue