feat: add custom sarvam tts voice (#449)

* feat: add custom sarvam tts voice

* chore: refactor registry and add deepgram multi

---------

Co-authored-by: Abhishek Kumar <abhishek@a6k.me>
2026-06-19 08:28:10 +02:00 · 2026-06-18 12:33:21 +05:30 · 2026-06-18 12:33:21 +05:30 · 951e73a645
commit 951e73a645
parent 344c8220de
9 changed files with 268 additions and 69 deletions
--- a/api/services/configuration/options/__init__.py
+++ b/api/services/configuration/options/__init__.py
@@ -9,7 +9,13 @@ from .azure import (
    AZURE_SPEECH_TTS_LANGUAGES,
    AZURE_SPEECH_TTS_VOICES,
 )
-from .deepgram import DEEPGRAM_LANGUAGES, DEEPGRAM_STT_MODELS
+from .deepgram import (
+    DEEPGRAM_FLUX_MODELS,
+    DEEPGRAM_FLUX_MULTILINGUAL_LANGUAGE_OPTIONS,
+    DEEPGRAM_FLUX_MULTILINGUAL_LANGUAGES,
+    DEEPGRAM_LANGUAGES,
+    DEEPGRAM_STT_MODELS,
+)
 from .gladia import GLADIA_STT_LANGUAGES, GLADIA_STT_MODELS
 from .google import (
    GOOGLE_MODELS,
@@ -35,6 +41,11 @@ from .sarvam import (
    SARVAM_V2_VOICES,
    SARVAM_V3_VOICES,
 )
+from .smallest import (
+    SMALLEST_TTS_LANGUAGES,
+    SMALLEST_TTS_MODELS,
+    SMALLEST_TTS_VOICES,
+)
 from .speechmatics import SPEECHMATICS_STT_LANGUAGES

 __all__ = [
@@ -47,6 +58,9 @@ __all__ = [
    "AZURE_SPEECH_STT_LANGUAGES",
    "AZURE_SPEECH_TTS_LANGUAGES",
    "AZURE_SPEECH_TTS_VOICES",
+    "DEEPGRAM_FLUX_MODELS",
+    "DEEPGRAM_FLUX_MULTILINGUAL_LANGUAGES",
+    "DEEPGRAM_FLUX_MULTILINGUAL_LANGUAGE_OPTIONS",
    "DEEPGRAM_LANGUAGES",
    "DEEPGRAM_STT_MODELS",
    "GLADIA_STT_LANGUAGES",
@@ -71,5 +85,8 @@ __all__ = [
    "SARVAM_TTS_MODELS",
    "SARVAM_V2_VOICES",
    "SARVAM_V3_VOICES",
+    "SMALLEST_TTS_LANGUAGES",
+    "SMALLEST_TTS_MODELS",
+    "SMALLEST_TTS_VOICES",
    "SPEECHMATICS_STT_LANGUAGES",
 ]
--- a/api/services/configuration/options/deepgram.py
+++ b/api/services/configuration/options/deepgram.py
@@ -1,4 +1,21 @@
-DEEPGRAM_STT_MODELS = ("nova-3-general", "flux-general-en", "flux-general-multi")
+DEEPGRAM_FLUX_MODELS = ("flux-general-en", "flux-general-multi")
+DEEPGRAM_FLUX_MULTILINGUAL_LANGUAGES = (
+    "de",
+    "en",
+    "es",
+    "fr",
+    "hi",
+    "it",
+    "ja",
+    "nl",
+    "pt",
+    "ru",
+)
+DEEPGRAM_FLUX_MULTILINGUAL_LANGUAGE_OPTIONS = (
+    "multi",
+    *DEEPGRAM_FLUX_MULTILINGUAL_LANGUAGES,
+)
+DEEPGRAM_STT_MODELS = ("nova-3-general", *DEEPGRAM_FLUX_MODELS)
 DEEPGRAM_LANGUAGES = (
    "multi",
    "ar",
--- a/api/services/configuration/options/smallest.py
+++ b/api/services/configuration/options/smallest.py
@@ -0,0 +1,36 @@
+SMALLEST_TTS_MODELS = ("lightning_v3.1", "lightning_v3.1_pro")
+SMALLEST_TTS_VOICES = (
+    "sophia",
+    "avery",
+    "liam",
+    "lucas",
+    "olivia",
+    "ryan",
+    "freya",
+    "william",
+    "devansh",
+    "arjun",
+    "niharika",
+    "maya",
+    "dhruv",
+    "mia",
+    "maithili",
+)
+SMALLEST_TTS_LANGUAGES = (
+    "en",
+    "hi",
+    "fr",
+    "de",
+    "es",
+    "it",
+    "nl",
+    "pl",
+    "ru",
+    "ar",
+    "bn",
+    "gu",
+    "he",
+    "kn",
+    "mr",
+    "ta",
+)
--- a/api/services/configuration/registry.py
+++ b/api/services/configuration/registry.py
@@ -14,6 +14,7 @@ from api.services.configuration.options import (
    AZURE_SPEECH_STT_LANGUAGES,
    AZURE_SPEECH_TTS_LANGUAGES,
    AZURE_SPEECH_TTS_VOICES,
+    DEEPGRAM_FLUX_MULTILINGUAL_LANGUAGE_OPTIONS,
    DEEPGRAM_LANGUAGES,
    DEEPGRAM_STT_MODELS,
    GLADIA_STT_LANGUAGES,
@@ -38,6 +39,9 @@ from api.services.configuration.options import (
    SARVAM_TTS_MODELS,
    SARVAM_V2_VOICES,
    SARVAM_V3_VOICES,
+    SMALLEST_TTS_LANGUAGES,
+    SMALLEST_TTS_MODELS,
+    SMALLEST_TTS_VOICES,
    SPEECHMATICS_STT_LANGUAGES,
 )
 from api.services.configuration.options.google import GOOGLE_VERTEX_MODELS
@@ -987,9 +991,10 @@ class SarvamTTSConfiguration(BaseTTSConfiguration):
    )
    voice: str = Field(
        default="anushka",
-        description="Sarvam voice name; must match the selected model's voice list.",
+        description="Sarvam voice name or custom voice ID.",
        json_schema_extra={
            "examples": SARVAM_V2_VOICES,
+            "allow_custom_input": True,
            "model_options": {
                "bulbul:v2": SARVAM_V2_VOICES,
                "bulbul:v3": SARVAM_V3_VOICES,
@@ -1172,43 +1177,6 @@ SMALLEST_PROVIDER_MODEL_CONFIG = provider_model_config(
    provider_docs_url="https://smallest.ai/docs",
 )

-SMALLEST_TTS_MODELS = ["lightning_v3.1", "lightning_v3.1_pro"]
-SMALLEST_TTS_VOICES = [
-    "sophia",
-    "avery",
-    "liam",
-    "lucas",
-    "olivia",
-    "ryan",
-    "freya",
-    "william",
-    "devansh",
-    "arjun",
-    "niharika",
-    "maya",
-    "dhruv",
-    "mia",
-    "maithili",
-]
-SMALLEST_TTS_LANGUAGES = [
-    "en",
-    "hi",
-    "fr",
-    "de",
-    "es",
-    "it",
-    "nl",
-    "pl",
-    "ru",
-    "ar",
-    "bn",
-    "gu",
-    "he",
-    "kn",
-    "mr",
-    "ta",
-]
-

@register_tts
 class SmallestAITTSConfiguration(BaseTTSConfiguration):
@@ -1273,12 +1241,16 @@ class DeepgramSTTConfiguration(BaseSTTConfiguration):
    )
    language: str = Field(
        default="multi",
-        description="Language code; 'multi' enables auto-detect (Nova-3 only).",
+        description=(
+            "Language code. 'multi' enables Nova-3 auto-detect and omits "
+            "language hints for Flux multilingual auto-detect."
+        ),
        json_schema_extra={
            "examples": DEEPGRAM_LANGUAGES,
            "model_options": {
                "nova-3-general": DEEPGRAM_LANGUAGES,
                "flux-general-en": ("en",),
+                "flux-general-multi": DEEPGRAM_FLUX_MULTILINGUAL_LANGUAGE_OPTIONS,
            },
        },
    )
--- a/api/services/pipecat/run_pipeline.py
+++ b/api/services/pipecat/run_pipeline.py
@@ -6,6 +6,7 @@ from loguru import logger

 from api.db import db_client
 from api.enums import WorkflowRunMode
+from api.services.configuration.options import DEEPGRAM_FLUX_MODELS
 from api.services.configuration.registry import ServiceProviders
 from api.services.integrations import (
    IntegrationRuntimeContext,
@@ -626,7 +627,7 @@ async def _run_pipeline(
        # Other models use configurable turn detection strategy
        is_deepgram_flux = (
            user_config.stt.provider == ServiceProviders.DEEPGRAM.value
-            and user_config.stt.model == "flux-general-en"
+            and user_config.stt.model in DEEPGRAM_FLUX_MODELS
        )

        if is_deepgram_flux:
--- a/api/services/pipecat/service_factory.py
+++ b/api/services/pipecat/service_factory.py
@@ -6,6 +6,7 @@ from fastapi import HTTPException
 from loguru import logger

 from api.constants import MPS_API_URL
+from api.services.configuration.options import DEEPGRAM_FLUX_MODELS
 from api.services.configuration.registry import ServiceProviders
 from api.services.pipecat.minimax_tts import MiniMaxOwnedSessionTTSService
 from api.utils.url_security import validate_user_configured_service_url
@@ -78,6 +79,20 @@ if TYPE_CHECKING:
    from api.services.pipecat.audio_config import AudioConfig


+DEEPGRAM_FLUX_LANGUAGE_HINTS = {
+    "de": Language.DE,
+    "en": Language.EN,
+    "es": Language.ES,
+    "fr": Language.FR,
+    "hi": Language.HI,
+    "it": Language.IT,
+    "ja": Language.JA,
+    "nl": Language.NL,
+    "pt": Language.PT,
+    "ru": Language.RU,
+}
+
+
 def _validate_runtime_service_url(url: str, field_name: str) -> None:
    try:
        validate_user_configured_service_url(
@@ -104,17 +119,23 @@ def create_stt_service(
        f"Creating STT service: provider={user_config.stt.provider}, model={user_config.stt.model}"
    )
    if user_config.stt.provider == ServiceProviders.DEEPGRAM.value:
-        # Check if using Flux model (English-only, no language selection)
-        if user_config.stt.model == "flux-general-en":
+        if user_config.stt.model in DEEPGRAM_FLUX_MODELS:
+            settings_kwargs = {
+                "model": user_config.stt.model,
+                "eot_timeout_ms": 3000,
+                "eot_threshold": 0.7,
+                "eager_eot_threshold": 0.5,
+                "keyterm": keyterms or [],
+            }
+            if user_config.stt.model == "flux-general-multi":
+                language = getattr(user_config.stt, "language", None)
+                language_hint = DEEPGRAM_FLUX_LANGUAGE_HINTS.get(language)
+                if language_hint:
+                    settings_kwargs["language_hints"] = [language_hint]
+
            return DeepgramFluxSTTService(
                api_key=user_config.stt.api_key,
-                settings=DeepgramFluxSTTSettings(
-                    model=user_config.stt.model,
-                    eot_timeout_ms=3000,
-                    eot_threshold=0.7,
-                    eager_eot_threshold=0.5,
-                    keyterm=keyterms or [],
-                ),
+                settings=DeepgramFluxSTTSettings(**settings_kwargs),
                should_interrupt=False,  # Let UserAggregator take care of sending InterruptionFrame
                sample_rate=audio_config.transport_in_sample_rate,
            )
@@ -534,7 +555,9 @@ def create_tts_service(
        language = getattr(user_config.tts, "language", None)
        pipecat_language = language_mapping.get(language, Language.HI)

-        voice = getattr(user_config.tts, "voice", None) or "anushka"
+        voice = (
+            getattr(user_config.tts, "voice", None) or ""
+        ).strip().lower() or "anushka"
        speed = getattr(user_config.tts, "speed", None)
        settings_kwargs = {
            "model": user_config.tts.model,