From e83f3a36d2b422171113ecdf6432d8aaf9c63789 Mon Sep 17 00:00:00 2001 From: Abhishek Kumar Date: Fri, 26 Dec 2025 16:00:02 +0530 Subject: [PATCH] fix: change type definition from enum to str for consistency --- api/services/auth/depends.py | 14 +- api/services/configuration/registry.py | 207 +++++++++++---------- api/services/pipecat/service_factory.py | 63 +++---- ui/src/components/ServiceConfiguration.tsx | 13 +- 4 files changed, 147 insertions(+), 150 deletions(-) diff --git a/api/services/auth/depends.py b/api/services/auth/depends.py index d995ade..6d38b33 100644 --- a/api/services/auth/depends.py +++ b/api/services/auth/depends.py @@ -10,11 +10,7 @@ from api.db import db_client from api.db.models import UserModel from api.schemas.user_configuration import UserConfiguration from api.services.auth.stack_auth import stackauth -from api.services.configuration.registry import ( - DograhSTTModel, - DograhTTSModel, - ServiceProviders, -) +from api.services.configuration.registry import ServiceProviders async def get_user( @@ -242,18 +238,18 @@ async def create_user_configuration_with_mps_key( "llm": { "provider": ServiceProviders.DOGRAH.value, "api_key": service_key, - "model": "default", # Default model + "model": "default", }, "tts": { "provider": ServiceProviders.DOGRAH.value, "api_key": service_key, - "model": DograhTTSModel.DEFAULT.value, # Default model - "voice": "default", # Default voice + "model": "default", + "voice": "default", }, "stt": { "provider": ServiceProviders.DOGRAH.value, "api_key": service_key, - "model": DograhSTTModel.DEFAULT.value, # Default model + "model": "default", }, } user_config = UserConfiguration(**configuration) diff --git a/api/services/configuration/registry.py b/api/services/configuration/registry.py index c6f0324..6db0131 100644 --- a/api/services/configuration/registry.py +++ b/api/services/configuration/registry.py @@ -95,8 +95,21 @@ def register_stt(cls: Type[BaseSTTConfiguration]): ###################################################### LLM ######################################################################## # Suggested models for each provider (used for UI dropdown) -OPENAI_MODELS = ["gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "gpt-5", "gpt-5-mini", "gpt-5-nano", "gpt-3.5-turbo"] -GOOGLE_MODELS = ["gemini-2.0-flash", "gemini-2.0-flash-lite", "gemini-2.5-flash", "gemini-2.5-flash-lite"] +OPENAI_MODELS = [ + "gpt-4.1", + "gpt-4.1-mini", + "gpt-4.1-nano", + "gpt-5", + "gpt-5-mini", + "gpt-5-nano", + "gpt-3.5-turbo", +] +GOOGLE_MODELS = [ + "gemini-2.0-flash", + "gemini-2.0-flash-lite", + "gemini-2.5-flash", + "gemini-2.5-flash-lite", +] GROQ_MODELS = [ "llama-3.3-70b-versatile", "deepseek-r1-distill-llama-70b", @@ -121,21 +134,27 @@ class OpenAILLMService(BaseLLMConfiguration): @register_llm class GoogleLLMService(BaseLLMConfiguration): provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE - model: str = Field(default="gemini-2.0-flash", json_schema_extra={"examples": GOOGLE_MODELS}) + model: str = Field( + default="gemini-2.0-flash", json_schema_extra={"examples": GOOGLE_MODELS} + ) api_key: str @register_llm class GroqLLMService(BaseLLMConfiguration): provider: Literal[ServiceProviders.GROQ] = ServiceProviders.GROQ - model: str = Field(default="llama-3.3-70b-versatile", json_schema_extra={"examples": GROQ_MODELS}) + model: str = Field( + default="llama-3.3-70b-versatile", json_schema_extra={"examples": GROQ_MODELS} + ) api_key: str @register_llm class AzureLLMService(BaseLLMConfiguration): provider: Literal[ServiceProviders.AZURE] = ServiceProviders.AZURE - model: str = Field(default="gpt-4.1-mini", json_schema_extra={"examples": AZURE_MODELS}) + model: str = Field( + default="gpt-4.1-mini", json_schema_extra={"examples": AZURE_MODELS} + ) api_key: str endpoint: str @@ -143,7 +162,9 @@ class AzureLLMService(BaseLLMConfiguration): @register_llm class DograhLLMService(BaseLLMConfiguration): provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH - model: str = Field(default="default", json_schema_extra={"examples": DOGRAH_LLM_MODELS}) + model: str = Field( + default="default", json_schema_extra={"examples": DOGRAH_LLM_MODELS} + ) api_key: str @@ -181,8 +202,7 @@ class DeepgramTTSConfiguration(BaseServiceConfiguration): return "aura-2" -class ElevenlabsModel(str, Enum): - FLASH_2 = "eleven_flash_v2_5" +ELEVENLABS_TTS_MODELS = ["eleven_flash_v2_5"] @register_tts @@ -190,72 +210,63 @@ class ElevenlabsTTSConfiguration(BaseServiceConfiguration): provider: Literal[ServiceProviders.ELEVENLABS] = ServiceProviders.ELEVENLABS voice: str = "21m00Tcm4TlvDq8ikWAM" # Rachel voice ID speed: float = Field(default=1.0, ge=0.1, le=2.0, description="Speed of the voice") - model: ElevenlabsModel = ElevenlabsModel.FLASH_2 + model: str = Field( + default="eleven_flash_v2_5", + json_schema_extra={"examples": ELEVENLABS_TTS_MODELS}, + ) api_key: str -class OpenAITTSModel(str, Enum): - GPT_4o_MINI = "gpt-4o-mini-tts" +OPENAI_TTS_MODELS = ["gpt-4o-mini-tts"] @register_tts class OpenAITTSService(BaseTTSConfiguration): provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI - model: OpenAITTSModel = OpenAITTSModel.GPT_4o_MINI + model: str = Field( + default="gpt-4o-mini-tts", json_schema_extra={"examples": OPENAI_TTS_MODELS} + ) voice: str = "alloy" api_key: str -class DograhTTSModel(str, Enum): - DEFAULT = "default" +DOGRAH_TTS_MODELS = ["default"] @register_tts class DograhTTSService(BaseTTSConfiguration): provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH - model: DograhTTSModel = DograhTTSModel.DEFAULT + model: str = Field( + default="default", json_schema_extra={"examples": DOGRAH_TTS_MODELS} + ) voice: str = "default" api_key: str -class SarvamTTSModel(str, Enum): - BULBUL_V2 = "bulbul:v2" - BULBUL_V3 = "bulbul:v3" - - -class SarvamVoice(str, Enum): - # Female voices - ANUSHKA = "anushka" - MANISHA = "manisha" - VIDYA = "vidya" - ARYA = "arya" - # Male voices - ABHILASH = "abhilash" - KARUN = "karun" - HITESH = "hitesh" - - -class SarvamLanguage(str, Enum): - BENGALI = "bn-IN" - ENGLISH_INDIA = "en-IN" - GUJARATI = "gu-IN" - HINDI = "hi-IN" - KANNADA = "kn-IN" - MALAYALAM = "ml-IN" - MARATHI = "mr-IN" - ODIA = "od-IN" - PUNJABI = "pa-IN" - TAMIL = "ta-IN" - TELUGU = "te-IN" - ASSAMESE = "as-IN" +SARVAM_TTS_MODELS = ["bulbul:v2", "bulbul:v3"] +SARVAM_VOICES = ["anushka", "manisha", "vidya", "arya", "abhilash", "karun", "hitesh"] +SARVAM_LANGUAGES = [ + "bn-IN", + "en-IN", + "gu-IN", + "hi-IN", + "kn-IN", + "ml-IN", + "mr-IN", + "od-IN", + "pa-IN", + "ta-IN", + "te-IN", + "as-IN", +] # @register_tts # class SarvamTTSConfiguration(BaseTTSConfiguration): # provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM -# model: SarvamTTSModel = SarvamTTSModel.BULBUL_V2 -# voice: SarvamVoice = SarvamVoice.ANUSHKA -# language: SarvamLanguage = SarvamLanguage.HINDI +# model: str = Field(default="bulbul:v2", json_schema_extra={"examples": SARVAM_TTS_MODELS}) +# voice: str = Field(default="anushka", json_schema_extra={"examples": SARVAM_VOICES}) +# language: str = Field(default="hi-IN", json_schema_extra={"examples": SARVAM_LANGUAGES}) # api_key: str @@ -273,49 +284,51 @@ TTSConfig = Annotated[ ###################################################### STT ######################################################################## -class DeepgramSTTModel(str, Enum): - NOVA_3_GENERAL = "nova-3-general" - - -class DeepgramLanguage(str, Enum): - MULTI = "multi" - ENGLISH = "en" - ENGLISH_US = "en-US" - ENGLISH_GB = "en-GB" - ENGLISH_AU = "en-AU" - ENGLISH_IN = "en-IN" - SPANISH = "es" - SPANISH_LATAM = "es-419" - FRENCH = "fr" - FRENCH_CA = "fr-CA" - GERMAN = "de" - ITALIAN = "it" - PORTUGUESE = "pt" - PORTUGUESE_BR = "pt-BR" - DUTCH = "nl" - HINDI = "hi" - JAPANESE = "ja" - KOREAN = "ko" - CHINESE_SIMPLIFIED = "zh-CN" - CHINESE_TRADITIONAL = "zh-TW" - RUSSIAN = "ru" - POLISH = "pl" - TURKISH = "tr" - UKRAINIAN = "uk" - VIETNAMESE = "vi" - SWEDISH = "sv" - DANISH = "da" - NORWEGIAN = "no" - FINNISH = "fi" - INDONESIAN = "id" - THAI = "th" +DEEPGRAM_STT_MODELS = ["nova-3-general"] +DEEPGRAM_LANGUAGES = [ + "multi", + "en", + "en-US", + "en-GB", + "en-AU", + "en-IN", + "es", + "es-419", + "fr", + "fr-CA", + "de", + "it", + "pt", + "pt-BR", + "nl", + "hi", + "ja", + "ko", + "zh-CN", + "zh-TW", + "ru", + "pl", + "tr", + "uk", + "vi", + "sv", + "da", + "no", + "fi", + "id", + "th", +] @register_stt class DeepgramSTTConfiguration(BaseSTTConfiguration): provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM - model: DeepgramSTTModel = DeepgramSTTModel.NOVA_3_GENERAL - language: DeepgramLanguage = DeepgramLanguage.MULTI + model: str = Field( + default="nova-3-general", json_schema_extra={"examples": DEEPGRAM_STT_MODELS} + ) + language: str = Field( + default="multi", json_schema_extra={"examples": DEEPGRAM_LANGUAGES} + ) api_key: str @@ -325,40 +338,40 @@ class CartesiaSTTConfiguration(BaseSTTConfiguration): api_key: str -class OpenAISTTModel(str, Enum): - GPT_4o_TRANSCRIBE = "gpt-4o-transcribe" +OPENAI_STT_MODELS = ["gpt-4o-transcribe"] @register_stt class OpenAISTTConfiguration(BaseSTTConfiguration): provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI - model: OpenAISTTModel = OpenAISTTModel.GPT_4o_TRANSCRIBE + model: str = Field( + default="gpt-4o-transcribe", json_schema_extra={"examples": OPENAI_STT_MODELS} + ) api_key: str # Dograh STT Service -class DograhSTTModel(str, Enum): - DEFAULT = "default" +DOGRAH_STT_MODELS = ["default"] @register_stt class DograhSTTService(BaseSTTConfiguration): provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH - model: DograhSTTModel = DograhSTTModel.DEFAULT + model: str = Field( + default="default", json_schema_extra={"examples": DOGRAH_STT_MODELS} + ) api_key: str # Sarvam STT Service -class SarvamSTTModel(str, Enum): - SAARIKA_V2_5 = "saarika:v2.5" - SAARAS_V2 = "saaras:v2" # STT-Translate model (auto-detects language) +SARVAM_STT_MODELS = ["saarika:v2.5", "saaras:v2"] # @register_stt # class SarvamSTTConfiguration(BaseSTTConfiguration): # provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM -# model: SarvamSTTModel = SarvamSTTModel.SAARIKA_V2_5 -# language: SarvamLanguage = SarvamLanguage.HINDI +# model: str = Field(default="saarika:v2.5", json_schema_extra={"examples": SARVAM_STT_MODELS}) +# language: str = Field(default="hi-IN", json_schema_extra={"examples": SARVAM_LANGUAGES}) # api_key: str diff --git a/api/services/pipecat/service_factory.py b/api/services/pipecat/service_factory.py index 9869e78..948fbdd 100644 --- a/api/services/pipecat/service_factory.py +++ b/api/services/pipecat/service_factory.py @@ -30,12 +30,9 @@ def create_stt_service(user_config): """Create and return appropriate STT service based on user configuration""" if user_config.stt.provider == ServiceProviders.DEEPGRAM.value: # Use language from user config, defaulting to "multi" for multilingual support - language = getattr(user_config.stt, "language", None) - language_value = ( - language.value if hasattr(language, "value") else (language or "multi") - ) + language = getattr(user_config.stt, "language", None) or "multi" live_options = LiveOptions( - language=language_value, profanity_filter=False, endpointing=100 + language=language, profanity_filter=False, endpointing=100 ) return DeepgramSTTService( live_options=live_options, @@ -45,7 +42,7 @@ def create_stt_service(user_config): elif user_config.stt.provider == ServiceProviders.OPENAI.value: return OpenAISTTService( api_key=user_config.stt.api_key, - model=user_config.stt.model.value, + model=user_config.stt.model, audio_passthrough=False, # Disable passthrough since audio is buffered separately ) elif user_config.stt.provider == ServiceProviders.CARTESIA.value: @@ -58,7 +55,7 @@ def create_stt_service(user_config): return DograhSTTService( base_url=base_url, api_key=user_config.stt.api_key, - model=user_config.stt.model.value, + model=user_config.stt.model, audio_passthrough=False, # Disable passthrough since audio is buffered separately ) elif user_config.stt.provider == ServiceProviders.SARVAM.value: @@ -78,12 +75,10 @@ def create_stt_service(user_config): "as-IN": Language.AS_IN, } language = getattr(user_config.stt, "language", None) - language_value = language.value if hasattr(language, "value") else language - pipecat_language = language_mapping.get(language_value, Language.HI_IN) - + pipecat_language = language_mapping.get(language, Language.HI_IN) return SarvamSTTService( api_key=user_config.stt.api_key, - model=user_config.stt.model.value, + model=user_config.stt.model, params=SarvamSTTService.InputParams(language=pipecat_language), audio_passthrough=False, ) @@ -105,13 +100,13 @@ def create_tts_service(user_config, audio_config: "AudioConfig"): if user_config.tts.provider == ServiceProviders.DEEPGRAM.value: return DeepgramTTSService( api_key=user_config.tts.api_key, - voice=user_config.tts.voice.value, + voice=user_config.tts.voice, text_filters=[xml_function_tag_filter], ) elif user_config.tts.provider == ServiceProviders.OPENAI.value: return OpenAITTSService( api_key=user_config.tts.api_key, - model=user_config.tts.model.value, + model=user_config.tts.model, text_filters=[xml_function_tag_filter], ) elif user_config.tts.provider == ServiceProviders.ELEVENLABS.value: @@ -120,12 +115,11 @@ def create_tts_service(user_config, audio_config: "AudioConfig"): voice_id = user_config.tts.voice.split(" - ")[1] except IndexError: voice_id = user_config.tts.voice - return ElevenLabsTTSService( reconnect_on_error=False, api_key=user_config.tts.api_key, voice_id=voice_id, - model=user_config.tts.model.value, + model=user_config.tts.model, params=ElevenLabsTTSService.InputParams( stability=0.8, speed=user_config.tts.speed, similarity_boost=0.75 ), @@ -134,12 +128,11 @@ def create_tts_service(user_config, audio_config: "AudioConfig"): elif user_config.tts.provider == ServiceProviders.DOGRAH.value: # Convert HTTP URL to WebSocket URL for TTS base_url = MPS_API_URL.replace("http://", "ws://").replace("https://", "wss://") - # Handle both enum and string values for model and voice return DograhTTSService( base_url=base_url, api_key=user_config.tts.api_key, - model=user_config.tts.model.value, - voice=user_config.tts.voice.value, + model=user_config.tts.model, + voice=user_config.tts.voice, text_filters=[xml_function_tag_filter], ) elif user_config.tts.provider == ServiceProviders.SARVAM.value: @@ -158,16 +151,13 @@ def create_tts_service(user_config, audio_config: "AudioConfig"): "te-IN": Language.TE, } language = getattr(user_config.tts, "language", None) - language_value = language.value if hasattr(language, "value") else language - pipecat_language = language_mapping.get(language_value, Language.HI) - - voice = getattr(user_config.tts, "voice", None) - voice_value = voice.value if hasattr(voice, "value") else (voice or "anushka") + pipecat_language = language_mapping.get(language, Language.HI) + voice = getattr(user_config.tts, "voice", None) or "anushka" return SarvamTTSService( api_key=user_config.tts.api_key, - model=user_config.tts.model.value, - voice_id=voice_value, + model=user_config.tts.model, + voice_id=voice, params=SarvamTTSService.InputParams(language=pipecat_language), text_filters=[xml_function_tag_filter], ) @@ -179,17 +169,12 @@ def create_tts_service(user_config, audio_config: "AudioConfig"): def create_llm_service(user_config): """Create and return appropriate LLM service based on user configuration""" - # Handle both enum and string values for model - model_value = ( - user_config.llm.model.value - if hasattr(user_config.llm.model, "value") - else user_config.llm.model - ) + model = user_config.llm.model if user_config.llm.provider == ServiceProviders.OPENAI.value: - if "gpt-5" in model_value: + if "gpt-5" in model: return OpenAILLMService( api_key=user_config.llm.api_key, - model=model_value, + model=model, params=OpenAILLMService.InputParams( reasoning_effort="minimal", verbosity="low" ), @@ -197,16 +182,16 @@ def create_llm_service(user_config): else: return OpenAILLMService( api_key=user_config.llm.api_key, - model=model_value, + model=model, params=OpenAILLMService.InputParams(temperature=0.1), ) elif user_config.llm.provider == ServiceProviders.GROQ.value: print( - f"Creating Groq LLM service with API key: {user_config.llm.api_key} and model: {model_value}" + f"Creating Groq LLM service with API key: {user_config.llm.api_key} and model: {model}" ) return GroqLLMService( api_key=user_config.llm.api_key, - model=model_value, + model=model, params=OpenAILLMService.InputParams(temperature=0.1), ) elif user_config.llm.provider == ServiceProviders.GOOGLE.value: @@ -214,21 +199,21 @@ def create_llm_service(user_config): # NOT_GIVEN sentinels that break Pydantic validation in GoogleLLMService. return GoogleLLMService( api_key=user_config.llm.api_key, - model=model_value, + model=model, params=GoogleLLMService.InputParams(temperature=0.1), ) elif user_config.llm.provider == ServiceProviders.AZURE.value: return AzureLLMService( api_key=user_config.llm.api_key, endpoint=user_config.llm.endpoint, - model=model_value, # Azure uses deployment name as model + model=model, # Azure uses deployment name as model params=AzureLLMService.InputParams(temperature=0.1), ) elif user_config.llm.provider == ServiceProviders.DOGRAH.value: return DograhLLMService( base_url=f"{MPS_API_URL}/api/v1/llm", api_key=user_config.llm.api_key, - model=model_value, + model=model, ) else: raise HTTPException(status_code=400, detail="Invalid LLM provider") diff --git a/ui/src/components/ServiceConfiguration.tsx b/ui/src/components/ServiceConfiguration.tsx index 63c37a8..c2d88ff 100644 --- a/ui/src/components/ServiceConfiguration.tsx +++ b/ui/src/components/ServiceConfiguration.tsx @@ -383,11 +383,12 @@ export default function ServiceConfiguration() { ? providerSchema.$defs[schema.$ref.split('/').pop() || ''] : schema; - // Use VoiceSelector for voice field in TTS service (except Sarvam which uses enum) + // Use VoiceSelector for voice field in TTS service (except Sarvam which uses predefined options) if (service === "tts" && field === "voice") { const currentProvider = serviceProviders.tts; - // Sarvam uses enum-based voice selection, not VoiceSelector - if (currentProvider !== "sarvam" && !actualSchema?.enum) { + // Sarvam uses predefined voice options, not VoiceSelector + const hasVoiceOptions = actualSchema?.enum || actualSchema?.examples; + if (currentProvider !== "sarvam" && !hasVoiceOptions) { return ( 0) { // Use friendly display names for language and voice fields const getDisplayName = (value: string) => { if (field === "language") { @@ -504,7 +507,7 @@ export default function ServiceConfiguration() { - {actualSchema.enum.map((value: string) => ( + {dropdownOptions.map((value: string) => ( {getDisplayName(value)}