feat: add google stt and tts. add folders to organize agents

This commit is contained in:
Abhishek Kumar 2026-05-22 14:36:50 +05:30
parent 21951eca18
commit ad2fa07058
52 changed files with 3412 additions and 621 deletions

View file

@ -0,0 +1,49 @@
from .deepgram import DEEPGRAM_LANGUAGES, DEEPGRAM_STT_MODELS
from .gladia import GLADIA_STT_LANGUAGES, GLADIA_STT_MODELS
from .google import (
GOOGLE_MODELS,
GOOGLE_REALTIME_LANGUAGES,
GOOGLE_REALTIME_MODELS,
GOOGLE_REALTIME_VOICES,
GOOGLE_STT_LANGUAGES,
GOOGLE_STT_MODELS,
GOOGLE_TTS_LANGUAGES,
GOOGLE_TTS_MODELS,
GOOGLE_TTS_VOICES,
GOOGLE_VERTEX_REALTIME_LANGUAGES,
GOOGLE_VERTEX_REALTIME_MODELS,
GOOGLE_VERTEX_REALTIME_VOICES,
)
from .sarvam import (
SARVAM_LANGUAGES,
SARVAM_STT_MODELS,
SARVAM_TTS_MODELS,
SARVAM_V2_VOICES,
SARVAM_V3_VOICES,
)
from .speechmatics import SPEECHMATICS_STT_LANGUAGES
# Public names re-exported by this package, grouped by the option module
# each one is imported from. The overall order is unchanged.
# fmt: off
__all__ = [
    # .deepgram
    "DEEPGRAM_LANGUAGES", "DEEPGRAM_STT_MODELS",
    # .gladia
    "GLADIA_STT_LANGUAGES", "GLADIA_STT_MODELS",
    # .google
    "GOOGLE_MODELS",
    "GOOGLE_REALTIME_LANGUAGES", "GOOGLE_REALTIME_MODELS", "GOOGLE_REALTIME_VOICES",
    "GOOGLE_STT_LANGUAGES", "GOOGLE_STT_MODELS",
    "GOOGLE_TTS_LANGUAGES", "GOOGLE_TTS_MODELS", "GOOGLE_TTS_VOICES",
    "GOOGLE_VERTEX_REALTIME_LANGUAGES",
    "GOOGLE_VERTEX_REALTIME_MODELS",
    "GOOGLE_VERTEX_REALTIME_VOICES",
    # .sarvam
    "SARVAM_LANGUAGES", "SARVAM_STT_MODELS", "SARVAM_TTS_MODELS",
    "SARVAM_V2_VOICES", "SARVAM_V3_VOICES",
    # .speechmatics
    "SPEECHMATICS_STT_LANGUAGES",
]
# fmt: on

View file

@ -0,0 +1,84 @@
# Deepgram speech-to-text model identifiers.
DEEPGRAM_STT_MODELS = (
    "nova-3-general",
    "flux-general-en",
    "flux-general-multi",
)

# Deepgram language codes. Rows group a base language with its regional
# variants; the element order is significant and must not be changed.
# fmt: off
DEEPGRAM_LANGUAGES = (
    "multi",
    "ar", "ar-AE", "ar-SA", "ar-QA", "ar-KW", "ar-SY", "ar-LB", "ar-PS", "ar-JO",
    "ar-EG", "ar-SD", "ar-TD", "ar-MA", "ar-DZ", "ar-TN", "ar-IQ", "ar-IR",
    "be", "bn", "bs", "bg", "ca", "cs",
    "da", "da-DK",
    "de", "de-CH",
    "el",
    "en", "en-US", "en-AU", "en-GB", "en-IN", "en-NZ",
    "es", "es-419",
    "et", "fa", "fi",
    "fr", "fr-CA",
    "he", "hi", "hr", "hu", "id", "it", "ja", "kn",
    "ko", "ko-KR",
    "lt", "lv", "mk", "mr", "ms",
    "nl", "nl-BE",
    "no", "pl",
    "pt", "pt-BR", "pt-PT",
    "ro", "ru", "sk", "sl", "sr",
    "sv", "sv-SE",
    "ta", "te", "th", "tl", "tr", "uk", "ur", "vi",
    "zh-CN", "zh-TW",
)
# fmt: on

View file

@ -0,0 +1,103 @@
# Gladia speech-to-text model identifiers.
GLADIA_STT_MODELS = (
    "solaria-1",
)

# Gladia language codes, one row per initial letter (alphabetical order).
# fmt: off
GLADIA_STT_LANGUAGES = (
    "af", "am", "ar", "as", "az",
    "ba", "be", "bg", "bn", "bo", "br", "bs",
    "ca", "cs", "cy",
    "da", "de",
    "el", "en", "es", "et", "eu",
    "fa", "fi", "fo", "fr",
    "gl", "gu",
    "ha", "haw", "he", "hi", "hr", "ht", "hu", "hy",
    "id", "is", "it",
    "ja", "jw",
    "ka", "kk", "km", "kn", "ko",
    "la", "lb", "ln", "lo", "lt", "lv",
    "mg", "mi", "mk", "ml", "mn", "mr", "ms", "mt", "my",
    "ne", "nl", "nn", "no",
    "oc",
    "pa", "pl", "ps", "pt",
    "ro", "ru",
    "sa", "sd", "si", "sk", "sl", "sn", "so", "sq", "sr", "su", "sv", "sw",
    "ta", "te", "tg", "th", "tk", "tl", "tr", "tt",
    "uk", "ur", "uz",
    "vi",
    "wo",
    "yi", "yo",
    "zh",
)
# fmt: on

View file

@ -0,0 +1,273 @@
# fmt: off
# Gemini model names for the Google LLM provider, grouped by version.
GOOGLE_MODELS = (
    "gemini-2.0-flash", "gemini-2.0-flash-lite",
    "gemini-2.5-flash", "gemini-2.5-flash-lite",
)

# Google realtime model, voice and language options.
GOOGLE_REALTIME_MODELS = (
    "gemini-3.1-flash-live-preview",
)
GOOGLE_REALTIME_VOICES = (
    "Puck",
    "Charon",
    "Kore",
    "Fenrir",
    "Aoede",
)
GOOGLE_REALTIME_LANGUAGES = (
    "ar", "bn", "de", "en", "es", "fr", "gu", "hi", "id", "it",
    "ja", "kn", "ko", "ml", "mr", "nl", "pl", "pt", "ru", "ta",
    "te", "th", "tr", "vi", "zh",
)

# Vertex realtime options. Voices and languages are aliases of the
# non-Vertex tuples above (the same objects, not copies).
GOOGLE_VERTEX_REALTIME_MODELS = (
    "google/gemini-live-2.5-flash-native-audio",
)
GOOGLE_VERTEX_REALTIME_VOICES = GOOGLE_REALTIME_VOICES
GOOGLE_VERTEX_REALTIME_LANGUAGES = GOOGLE_REALTIME_LANGUAGES
# fmt: on
# Google Cloud speech-to-text recognition model names.
GOOGLE_STT_MODELS = (
    "latest_long",
    "latest_short",
    "chirp_3",
)

# Language codes taken from the Google Cloud Speech-to-Text V2
# supported-languages documentation. Kept in ascending string order.
# fmt: off
GOOGLE_STT_LANGUAGES = (
    "af-ZA", "am-ET",
    "ar-AE", "ar-BH", "ar-DZ", "ar-EG", "ar-IL", "ar-IQ", "ar-JO",
    "ar-KW", "ar-LB", "ar-MA", "ar-MR", "ar-OM", "ar-PS", "ar-QA",
    "ar-SA", "ar-SY", "ar-TN", "ar-XA", "ar-YE",
    "as-IN", "ast-ES", "az-AZ", "be-BY", "bg-BG", "bn-BD", "bn-IN", "bs-BA",
    "ca-ES", "ceb-PH", "ckb-IQ", "cmn-Hans-CN", "cmn-Hant-TW", "cs-CZ", "cy-GB",
    "da-DK", "de-AT", "de-CH", "de-DE", "el-GR",
    "en-AU", "en-GB", "en-HK", "en-IE", "en-IN",
    "en-NZ", "en-PH", "en-PK", "en-SG", "en-US",
    "es-419", "es-AR", "es-BO", "es-CL", "es-CO", "es-CR", "es-DO",
    "es-EC", "es-ES", "es-GT", "es-HN", "es-MX", "es-NI", "es-PA",
    "es-PE", "es-PR", "es-SV", "es-US", "es-UY", "es-VE",
    "et-EE", "eu-ES", "fa-IR", "ff-SN", "fi-FI", "fil-PH",
    "fr-BE", "fr-CA", "fr-CH", "fr-FR",
    "ga-IE", "gl-ES", "gu-IN", "ha-NG", "hi-IN", "hr-HR", "hu-HU", "hy-AM",
    "id-ID", "ig-NG", "is-IS", "it-CH", "it-IT", "iw-IL", "ja-JP", "jv-ID",
    "ka-GE", "kam-KE", "kea-CV", "kk-KZ", "km-KH", "kn-IN", "ko-KR", "ky-KG",
    "lb-LU", "lg-UG", "ln-CD", "lo-LA", "lt-LT", "luo-KE", "lv-LV",
    "mi-NZ", "mk-MK", "ml-IN", "mn-MN", "mr-IN", "ms-MY", "mt-MT", "my-MM",
    "ne-NP", "nl-BE", "nl-NL", "no-NO", "nso-ZA", "ny-MW",
    "oc-FR", "om-ET", "or-IN",
    "pa-Guru-IN", "pl-PL", "ps-AF", "pt-BR", "pt-PT",
    "ro-RO", "ru-RU", "rup-BG", "rw-RW",
    "sd-IN", "si-LK", "sk-SK", "sl-SI", "sn-ZW", "so-SO", "sq-AL", "sr-RS",
    "ss-Latn-ZA", "st-ZA", "su-ID", "sv-SE", "sw", "sw-KE",
    "ta-IN", "te-IN", "tg-TJ", "th-TH", "tn-Latn-ZA", "tr-TR", "ts-ZA",
    "uk-UA", "umb-AO", "ur-PK", "uz-UZ",
    "ve-ZA", "vi-VN", "wo-SN", "xh-ZA", "yo-NG", "yue-Hant-HK", "zu-ZA",
)
# fmt: on
# Google Cloud text-to-speech model and voice names.
GOOGLE_TTS_MODELS = (
    "chirp_3_hd",
)
GOOGLE_TTS_VOICES = (
    "en-US-Chirp3-HD-Charon",
)

# Google Cloud text-to-speech language codes. The sequence is not sorted by
# code (it looks ordered by English language name - confirm before
# re-sorting); element order is preserved exactly.
# fmt: off
GOOGLE_TTS_LANGUAGES = (
    "ar-XA", "bn-IN", "bg-BG", "yue-HK", "hr-HR", "cs-CZ", "da-DK",
    "nl-BE", "nl-NL",
    "en-AU", "en-IN", "en-GB", "en-US",
    "et-EE", "fi-FI", "fr-CA", "fr-FR", "de-DE", "el-GR", "gu-IN",
    "he-IL", "hi-IN", "hu-HU", "id-ID", "it-IT", "ja-JP", "kn-IN",
    "ko-KR", "lv-LV", "lt-LT", "ml-IN", "cmn-CN", "mr-IN", "nb-NO",
    "pl-PL", "pt-BR", "pa-IN", "ro-RO", "ru-RU", "sr-RS", "sk-SK",
    "sl-SI", "es-ES", "es-US", "sw-KE", "sv-SE",
    "ta-IN", "te-IN", "th-TH", "tr-TR", "uk-UA", "ur-IN", "vi-VN",
)
# fmt: on

View file

@ -0,0 +1,66 @@
# Sarvam text-to-speech model identifiers.
SARVAM_TTS_MODELS = (
    "bulbul:v2",
    "bulbul:v3",
)

# Sarvam V2 voice names.
# fmt: off
SARVAM_V2_VOICES = (
    "anushka", "manisha", "vidya", "arya",
    "abhilash", "karun", "hitesh",
)
# fmt: on
# Sarvam V3 voice names. The sequence is not alphabetical; element order is
# preserved exactly.
# fmt: off
SARVAM_V3_VOICES = (
    "shubh", "aditya", "ritu", "priya", "neha", "rahul", "pooja", "rohan",
    "simran", "kavya", "amit", "dev", "ishita", "shreya", "ratan", "varun",
    "manan", "sumit", "roopa", "kabir", "aayan", "ashutosh", "advait", "amelia",
    "sophia", "anand", "tanya", "tarun", "sunny", "mani", "gokul", "vijay",
    "shruti", "suhani", "mohit", "kavitha", "rehan", "soham", "rupali",
)
# fmt: on
# Sarvam language codes. "as-IN" intentionally stays last, outside the
# otherwise alphabetical run, to keep the original element order.
# fmt: off
SARVAM_LANGUAGES = (
    "bn-IN", "en-IN", "gu-IN", "hi-IN", "kn-IN", "ml-IN",
    "mr-IN", "od-IN", "pa-IN", "ta-IN", "te-IN",
    "as-IN",
)
# fmt: on

# Sarvam speech-to-text model identifiers.
SARVAM_STT_MODELS = (
    "saarika:v2.5",
    "saaras:v2",
)

View file

@ -0,0 +1,63 @@
# Speechmatics speech-to-text language codes, including the underscore-joined
# multi-language entries (for example "cmn_en_ms_ta"). The sequence is not
# sorted by code; element order is preserved exactly.
# fmt: off
SPEECHMATICS_STT_LANGUAGES = (
    "ar", "ar_en", "ba", "eu", "be", "bn", "bg", "yue",
    "ca", "hr", "cs", "da", "nl", "en", "eo", "et",
    "fi", "fr", "gl", "de", "el", "he", "hi", "hu",
    "id", "ia", "ga", "it", "ja", "ko", "lv", "lt",
    "ms", "en_ms", "mt", "cmn", "cmn_en", "cmn_en_ms_ta",
    "mr", "mn", "no", "fa", "pl", "pt", "ro", "ru",
    "sk", "sl", "es", "sw", "sv", "tl", "ta", "en_ta",
    "th", "tr", "uk", "ur", "ug", "vi", "cy",
)
# fmt: on

View file

@ -2,7 +2,32 @@ import random
from enum import Enum, auto
from typing import Annotated, Dict, Literal, Type, TypeVar, Union
from pydantic import BaseModel, Field, computed_field, field_validator
from pydantic import BaseModel, ConfigDict, Field, computed_field, field_validator
from api.services.configuration.options import (
DEEPGRAM_LANGUAGES,
DEEPGRAM_STT_MODELS,
GLADIA_STT_LANGUAGES,
GLADIA_STT_MODELS,
GOOGLE_MODELS,
GOOGLE_REALTIME_LANGUAGES,
GOOGLE_REALTIME_MODELS,
GOOGLE_REALTIME_VOICES,
GOOGLE_STT_LANGUAGES,
GOOGLE_STT_MODELS,
GOOGLE_TTS_LANGUAGES,
GOOGLE_TTS_MODELS,
GOOGLE_TTS_VOICES,
GOOGLE_VERTEX_REALTIME_LANGUAGES,
GOOGLE_VERTEX_REALTIME_MODELS,
GOOGLE_VERTEX_REALTIME_VOICES,
SARVAM_LANGUAGES,
SARVAM_STT_MODELS,
SARVAM_TTS_MODELS,
SARVAM_V2_VOICES,
SARVAM_V3_VOICES,
SPEECHMATICS_STT_LANGUAGES,
)
class ServiceType(Enum):
@ -153,9 +178,56 @@ def register_embeddings(cls: Type[BaseEmbeddingsConfiguration]):
return register_service(ServiceType.EMBEDDINGS)(cls)
def provider_model_config(
    title: str,
    *,
    description: str | None = None,
    provider_docs_url: str | None = None,
) -> ConfigDict:
    """Build the ``ConfigDict`` used as a provider's ``model_config``.

    Args:
        title: Value stored under the config's ``title`` key.
        description: Optional text copied into ``json_schema_extra`` under
            the ``description`` key.
        provider_docs_url: Optional URL copied into ``json_schema_extra``
            under the ``provider_docs_url`` key.

    Returns:
        A ``ConfigDict`` containing ``title`` and, only when at least one
        optional argument was supplied, a ``json_schema_extra`` dict holding
        the supplied entries.
    """
    optional_entries = (
        ("description", description),
        ("provider_docs_url", provider_docs_url),
    )
    # Arguments left as None are omitted entirely rather than stored as nulls.
    schema_extra: dict[str, str] = {
        key: value for key, value in optional_entries if value is not None
    }
    if not schema_extra:
        return ConfigDict(title=title)
    return ConfigDict(title=title, json_schema_extra=schema_extra)
###################################################### LLM ########################################################################
# Per-provider model_config values built with provider_model_config(). The
# service configuration classes in this module assign these to `model_config`.
OPENAI_PROVIDER_MODEL_CONFIG = provider_model_config(title="OpenAI")
GOOGLE_PROVIDER_MODEL_CONFIG = provider_model_config(title="Google")
GROQ_PROVIDER_MODEL_CONFIG = provider_model_config(title="Groq")
OPENROUTER_PROVIDER_MODEL_CONFIG = provider_model_config(title="Open Router")
AZURE_OPENAI_PROVIDER_MODEL_CONFIG = provider_model_config(title="Azure OpenAI")
DOGRAH_PROVIDER_MODEL_CONFIG = provider_model_config(title="Dograh")
AWS_BEDROCK_PROVIDER_MODEL_CONFIG = provider_model_config(title="AWS Bedrock")
OPENAI_REALTIME_PROVIDER_MODEL_CONFIG = provider_model_config(
    title="OpenAI Realtime"
)
GOOGLE_REALTIME_PROVIDER_MODEL_CONFIG = provider_model_config(
    title="Google Realtime"
)
GOOGLE_VERTEX_REALTIME_PROVIDER_MODEL_CONFIG = provider_model_config(
    title="Google Vertex Realtime"
)
DEEPGRAM_PROVIDER_MODEL_CONFIG = provider_model_config(title="Deepgram")
ELEVENLABS_PROVIDER_MODEL_CONFIG = provider_model_config(title="ElevenLabs")
CARTESIA_PROVIDER_MODEL_CONFIG = provider_model_config(title="Cartesia")
SARVAM_PROVIDER_MODEL_CONFIG = provider_model_config(title="Sarvam")
CAMB_PROVIDER_MODEL_CONFIG = provider_model_config(title="Camb.ai")
RIME_PROVIDER_MODEL_CONFIG = provider_model_config(title="Rime")
GOOGLE_CLOUD_PROVIDER_MODEL_CONFIG = provider_model_config(title="Google Cloud")
SPEECHMATICS_PROVIDER_MODEL_CONFIG = provider_model_config(title="Speechmatics")
ASSEMBLYAI_PROVIDER_MODEL_CONFIG = provider_model_config(title="AssemblyAI")
GLADIA_PROVIDER_MODEL_CONFIG = provider_model_config(title="Gladia")
# Speaches is the only provider here that also supplies a description and a
# documentation link.
SPEACHES_PROVIDER_MODEL_CONFIG = provider_model_config(
    title="Local Models (Speaches)",
    description=(
        "Self-hosted OpenAI-compatible local models. "
        "See the Speaches project for setup and supported backends."
    ),
    provider_docs_url="https://github.com/speaches-ai/speaches",
)

# Suggested models for each provider (used for UI dropdown)
OPENAI_MODELS = [
"gpt-4.1",
"gpt-4.1-mini",
@ -165,12 +237,6 @@ OPENAI_MODELS = [
"gpt-5-nano",
"gpt-3.5-turbo",
]
GOOGLE_MODELS = [
"gemini-2.0-flash",
"gemini-2.0-flash-lite",
"gemini-2.5-flash",
"gemini-2.5-flash-lite",
]
GROQ_MODELS = [
"llama-3.3-70b-versatile",
"deepseek-r1-distill-llama-70b",
@ -204,6 +270,7 @@ AWS_BEDROCK_MODELS = [
@register_llm
class OpenAILLMService(BaseLLMConfiguration):
model_config = OPENAI_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
model: str = Field(
default="gpt-4.1",
@ -214,6 +281,7 @@ class OpenAILLMService(BaseLLMConfiguration):
@register_llm
class GoogleLLMService(BaseLLMConfiguration):
model_config = GOOGLE_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE
model: str = Field(
default="gemini-2.0-flash",
@ -224,6 +292,7 @@ class GoogleLLMService(BaseLLMConfiguration):
@register_llm
class GroqLLMService(BaseLLMConfiguration):
model_config = GROQ_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.GROQ] = ServiceProviders.GROQ
model: str = Field(
default="llama-3.3-70b-versatile",
@ -234,6 +303,7 @@ class GroqLLMService(BaseLLMConfiguration):
@register_llm
class OpenRouterLLMConfiguration(BaseLLMConfiguration):
model_config = OPENROUTER_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.OPENROUTER] = ServiceProviders.OPENROUTER
model: str = Field(
default="openai/gpt-4.1",
@ -249,6 +319,7 @@ class OpenRouterLLMConfiguration(BaseLLMConfiguration):
@register_llm
class AzureLLMService(BaseLLMConfiguration):
model_config = AZURE_OPENAI_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.AZURE] = ServiceProviders.AZURE
model: str = Field(
default="gpt-4.1-mini",
@ -263,6 +334,7 @@ class AzureLLMService(BaseLLMConfiguration):
@register_llm
class DograhLLMService(BaseLLMConfiguration):
model_config = DOGRAH_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
model: str = Field(
default="default",
@ -273,6 +345,7 @@ class DograhLLMService(BaseLLMConfiguration):
@register_llm
class AWSBedrockLLMConfiguration(BaseLLMConfiguration):
model_config = AWS_BEDROCK_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.AWS_BEDROCK] = ServiceProviders.AWS_BEDROCK
model: str = Field(
default="us.amazon.nova-pro-v1:0",
@ -302,6 +375,7 @@ SPEACHES_LLM_MODELS = ["llama3", "mistral", "phi3", "qwen2", "gemma2", "deepseek
@register_llm
class SpeachesLLMConfiguration(BaseLLMConfiguration):
model_config = SPEACHES_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
model: str = Field(
default="llama3",
@ -336,6 +410,7 @@ OPENAI_REALTIME_VOICES = [
@register_service(ServiceType.REALTIME)
class OpenAIRealtimeLLMConfiguration(BaseLLMConfiguration):
model_config = OPENAI_REALTIME_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.OPENAI_REALTIME] = (
ServiceProviders.OPENAI_REALTIME
)
@ -357,39 +432,9 @@ class OpenAIRealtimeLLMConfiguration(BaseLLMConfiguration):
)
GOOGLE_REALTIME_MODELS = ["gemini-3.1-flash-live-preview"]
GOOGLE_REALTIME_VOICES = ["Puck", "Charon", "Kore", "Fenrir", "Aoede"]
GOOGLE_REALTIME_LANGUAGES = [
"ar",
"bn",
"de",
"en",
"es",
"fr",
"gu",
"hi",
"id",
"it",
"ja",
"kn",
"ko",
"ml",
"mr",
"nl",
"pl",
"pt",
"ru",
"ta",
"te",
"th",
"tr",
"vi",
"zh",
]
@register_service(ServiceType.REALTIME)
class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration):
model_config = GOOGLE_REALTIME_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.GOOGLE_REALTIME] = (
ServiceProviders.GOOGLE_REALTIME
)
@ -419,15 +464,9 @@ class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration):
)
GOOGLE_VERTEX_REALTIME_MODELS = [
"google/gemini-live-2.5-flash-native-audio",
]
GOOGLE_VERTEX_REALTIME_VOICES = GOOGLE_REALTIME_VOICES
GOOGLE_VERTEX_REALTIME_LANGUAGES = GOOGLE_REALTIME_LANGUAGES
@register_service(ServiceType.REALTIME)
class GoogleVertexRealtimeLLMConfiguration(BaseLLMConfiguration):
model_config = GOOGLE_VERTEX_REALTIME_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.GOOGLE_VERTEX_REALTIME] = (
ServiceProviders.GOOGLE_VERTEX_REALTIME
)
@ -512,6 +551,7 @@ RealtimeConfig = Annotated[
@register_tts
class DeepgramTTSConfiguration(BaseServiceConfiguration):
model_config = DEEPGRAM_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
voice: str = Field(
default="aura-2-helena-en",
@ -537,6 +577,7 @@ ELEVENLABS_TTS_MODELS = ["eleven_flash_v2_5"]
@register_tts
class ElevenlabsTTSConfiguration(BaseServiceConfiguration):
model_config = ELEVENLABS_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.ELEVENLABS] = ServiceProviders.ELEVENLABS
voice: str = Field(
default="21m00Tcm4TlvDq8ikWAM",
@ -558,11 +599,70 @@ class ElevenlabsTTSConfiguration(BaseServiceConfiguration):
)
@register_tts
class GoogleTTSConfiguration(BaseTTSConfiguration):
    # Text-to-speech settings for the Google provider, registered through
    # @register_tts.
    # NOTE(review): documented with comments instead of a class docstring,
    # since a docstring may surface as the model's JSON-schema description -
    # confirm before converting.
    model_config = GOOGLE_CLOUD_PROVIDER_MODEL_CONFIG
    provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE

    # Model, voice and language each carry a list of examples and set
    # "allow_custom_input" in their schema extras.
    model: str = Field(
        default="chirp_3_hd",
        description=(
            "Google Cloud low-latency TTS engine. "
            "Dograh maps this to Pipecat's streaming Google TTS service "
            "for Chirp 3 HD and Journey voices."
        ),
        json_schema_extra={
            "examples": GOOGLE_TTS_MODELS,
            "allow_custom_input": True,
        },
    )
    voice: str = Field(
        default="en-US-Chirp3-HD-Charon",
        description=(
            "Google Cloud voice name. "
            "Use a Chirp 3 HD or Journey voice for streaming TTS."
        ),
        json_schema_extra={
            "examples": GOOGLE_TTS_VOICES,
            "allow_custom_input": True,
        },
    )
    language: str = Field(
        default="en-US",
        description="BCP-47 language code for synthesis.",
        json_schema_extra={
            "examples": GOOGLE_TTS_LANGUAGES,
            "allow_custom_input": True,
        },
    )

    # Speed is validated to the closed range [0.25, 2.0].
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=2.0,
        description="Speech speed multiplier for Google streaming TTS.",
    )

    # Optional regional endpoint; None means no region was configured.
    location: str | None = Field(
        default=None,
        description=(
            "Optional Google Cloud regional Text-to-Speech endpoint "
            "(for example 'us-central1'). "
            "Leave blank to use the default endpoint."
        ),
    )

    # Service-account JSON pasted as text; flagged "multiline" in the schema.
    credentials: str | None = Field(
        default=None,
        description=(
            "Paste the entire Google Cloud service-account JSON. "
            "If omitted, the server falls back to "
            "Application Default Credentials (ADC)."
        ),
        json_schema_extra={"multiline": True},
    )

    # Declared with a None default; the description marks it as unused here.
    api_key: str | list[str] | None = Field(
        default=None,
        description="Not used for Google Cloud TTS. Leave blank.",
    )
OPENAI_TTS_MODELS = ["gpt-4o-mini-tts"]
@register_tts
class OpenAITTSService(BaseTTSConfiguration):
model_config = OPENAI_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
model: str = Field(
default="gpt-4o-mini-tts",
@ -580,6 +680,7 @@ DOGRAH_TTS_MODELS = ["default"]
@register_tts
class DograhTTSService(BaseTTSConfiguration):
model_config = DOGRAH_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
model: str = Field(
default="default",
@ -598,6 +699,7 @@ CARTESIA_TTS_MODELS = ["sonic-3"]
@register_tts
class CartesiaTTSConfiguration(BaseTTSConfiguration):
model_config = CARTESIA_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.CARTESIA] = ServiceProviders.CARTESIA
model: str = Field(
default="sonic-3",
@ -617,75 +719,9 @@ class CartesiaTTSConfiguration(BaseTTSConfiguration):
)
SARVAM_TTS_MODELS = ["bulbul:v2", "bulbul:v3"]
SARVAM_V2_VOICES = [
"anushka",
"manisha",
"vidya",
"arya",
"abhilash",
"karun",
"hitesh",
]
SARVAM_V3_VOICES = [
"shubh",
"aditya",
"ritu",
"priya",
"neha",
"rahul",
"pooja",
"rohan",
"simran",
"kavya",
"amit",
"dev",
"ishita",
"shreya",
"ratan",
"varun",
"manan",
"sumit",
"roopa",
"kabir",
"aayan",
"ashutosh",
"advait",
"amelia",
"sophia",
"anand",
"tanya",
"tarun",
"sunny",
"mani",
"gokul",
"vijay",
"shruti",
"suhani",
"mohit",
"kavitha",
"rehan",
"soham",
"rupali",
]
SARVAM_LANGUAGES = [
"bn-IN",
"en-IN",
"gu-IN",
"hi-IN",
"kn-IN",
"ml-IN",
"mr-IN",
"od-IN",
"pa-IN",
"ta-IN",
"te-IN",
"as-IN",
]
@register_tts
class SarvamTTSConfiguration(BaseTTSConfiguration):
model_config = SARVAM_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
model: str = Field(
default="bulbul:v2",
@ -715,6 +751,7 @@ CAMB_TTS_MODELS = ["mars-flash", "mars-pro", "mars-instruct"]
@register_tts
class CambTTSConfiguration(BaseTTSConfiguration):
model_config = CAMB_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.CAMB] = ServiceProviders.CAMB
model: str = Field(
default="mars-flash",
@ -731,6 +768,7 @@ RIME_TTS_LANGUAGES = ["en", "de", "fr", "es", "hi"]
@register_tts
class RimeTTSConfiguration(BaseTTSConfiguration):
model_config = RIME_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.RIME] = ServiceProviders.RIME
model: str = Field(
default="arcana",
@ -756,6 +794,7 @@ SPEACHES_TTS_MODELS = ["hexgrad/Kokoro-82M"]
@register_tts
class SpeachesTTSConfiguration(BaseTTSConfiguration):
model_config = SPEACHES_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
model: str = Field(
default="kokoro",
@ -786,6 +825,7 @@ class SpeachesTTSConfiguration(BaseTTSConfiguration):
TTSConfig = Annotated[
Union[
DeepgramTTSConfiguration,
GoogleTTSConfiguration,
OpenAITTSService,
ElevenlabsTTSConfiguration,
CartesiaTTSConfiguration,
@ -801,94 +841,9 @@ TTSConfig = Annotated[
###################################################### STT ########################################################################
DEEPGRAM_STT_MODELS = ["nova-3-general", "flux-general-en", "flux-general-multi"]
DEEPGRAM_LANGUAGES = [
"multi",
"ar",
"ar-AE",
"ar-SA",
"ar-QA",
"ar-KW",
"ar-SY",
"ar-LB",
"ar-PS",
"ar-JO",
"ar-EG",
"ar-SD",
"ar-TD",
"ar-MA",
"ar-DZ",
"ar-TN",
"ar-IQ",
"ar-IR",
"be",
"bn",
"bs",
"bg",
"ca",
"cs",
"da",
"da-DK",
"de",
"de-CH",
"el",
"en",
"en-US",
"en-AU",
"en-GB",
"en-IN",
"en-NZ",
"es",
"es-419",
"et",
"fa",
"fi",
"fr",
"fr-CA",
"he",
"hi",
"hr",
"hu",
"id",
"it",
"ja",
"kn",
"ko",
"ko-KR",
"lt",
"lv",
"mk",
"mr",
"ms",
"nl",
"nl-BE",
"no",
"pl",
"pt",
"pt-BR",
"pt-PT",
"ro",
"ru",
"sk",
"sl",
"sr",
"sv",
"sv-SE",
"ta",
"te",
"th",
"tl",
"tr",
"uk",
"ur",
"vi",
"zh-CN",
"zh-TW",
]
@register_stt
class DeepgramSTTConfiguration(BaseSTTConfiguration):
model_config = DEEPGRAM_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
model: str = Field(
default="nova-3-general",
@ -902,7 +857,7 @@ class DeepgramSTTConfiguration(BaseSTTConfiguration):
"examples": DEEPGRAM_LANGUAGES,
"model_options": {
"nova-3-general": DEEPGRAM_LANGUAGES,
"flux-general-en": ["en"],
"flux-general-en": ("en",),
},
},
)
@ -913,6 +868,7 @@ CARTESIA_STT_MODELS = ["ink-whisper"]
@register_stt
class CartesiaSTTConfiguration(BaseSTTConfiguration):
model_config = CARTESIA_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.CARTESIA] = ServiceProviders.CARTESIA
model: str = Field(
default="ink-whisper",
@ -926,6 +882,7 @@ OPENAI_STT_MODELS = ["gpt-4o-transcribe"]
@register_stt
class OpenAISTTConfiguration(BaseSTTConfiguration):
model_config = OPENAI_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
model: str = Field(
default="gpt-4o-transcribe",
@ -934,6 +891,45 @@ class OpenAISTTConfiguration(BaseSTTConfiguration):
)
@register_stt
class GoogleSTTConfiguration(BaseSTTConfiguration):
    # Speech-to-text settings for the Google provider, registered through
    # @register_stt.
    # NOTE(review): documented with comments instead of a class docstring,
    # since a docstring may surface as the model's JSON-schema description -
    # confirm before converting.
    model_config = GOOGLE_CLOUD_PROVIDER_MODEL_CONFIG
    provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE

    # Model and language each carry a list of examples and set
    # "allow_custom_input" in their schema extras.
    model: str = Field(
        default="latest_long",
        description="Google Cloud Speech-to-Text V2 recognition model.",
        json_schema_extra={
            "examples": GOOGLE_STT_MODELS,
            "allow_custom_input": True,
        },
    )
    language: str = Field(
        default="en-US",
        description="Primary BCP-47 language code for recognition.",
        json_schema_extra={
            "examples": GOOGLE_STT_LANGUAGES,
            "allow_custom_input": True,
            "docs_url": "https://docs.cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages",
        },
    )

    # Region is a required string here and defaults to "global".
    location: str = Field(
        default="global",
        description=(
            "Google Cloud Speech-to-Text region "
            "(for example 'global' or 'us-central1')."
        ),
    )

    # Service-account JSON pasted as text; flagged "multiline" in the schema.
    credentials: str | None = Field(
        default=None,
        description=(
            "Paste the entire Google Cloud service-account JSON. "
            "If omitted, the server falls back to "
            "Application Default Credentials (ADC)."
        ),
        json_schema_extra={"multiline": True},
    )

    # Declared with a None default; the description marks it as unused here.
    api_key: str | list[str] | None = Field(
        default=None,
        description="Not used for Google Cloud STT. Leave blank.",
    )
# Dograh STT Service
DOGRAH_STT_MODELS = ["default"]
DOGRAH_STT_LANGUAGES = DEEPGRAM_LANGUAGES
@ -941,6 +937,7 @@ DOGRAH_STT_LANGUAGES = DEEPGRAM_LANGUAGES
@register_stt
class DograhSTTService(BaseSTTConfiguration):
model_config = DOGRAH_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
model: str = Field(
default="default",
@ -954,12 +951,9 @@ class DograhSTTService(BaseSTTConfiguration):
)
# Sarvam STT Service
SARVAM_STT_MODELS = ["saarika:v2.5", "saaras:v2"]
@register_stt
class SarvamSTTConfiguration(BaseSTTConfiguration):
model_config = SARVAM_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
model: str = Field(
default="saarika:v2.5",
@ -973,74 +967,9 @@ class SarvamSTTConfiguration(BaseSTTConfiguration):
)
# Speechmatics STT Service
SPEECHMATICS_STT_LANGUAGES = [
"ar",
"ar_en",
"ba",
"eu",
"be",
"bn",
"bg",
"yue",
"ca",
"hr",
"cs",
"da",
"nl",
"en",
"eo",
"et",
"fi",
"fr",
"gl",
"de",
"el",
"he",
"hi",
"hu",
"id",
"ia",
"ga",
"it",
"ja",
"ko",
"lv",
"lt",
"ms",
"en_ms",
"mt",
"cmn",
"cmn_en",
"cmn_en_ms_ta",
"mr",
"mn",
"no",
"fa",
"pl",
"pt",
"ro",
"ru",
"sk",
"sl",
"es",
"sw",
"sv",
"tl",
"ta",
"en_ta",
"th",
"tr",
"uk",
"ur",
"ug",
"vi",
"cy",
]
@register_stt
class SpeechmaticsSTTConfiguration(BaseSTTConfiguration):
model_config = SPEECHMATICS_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.SPEECHMATICS] = ServiceProviders.SPEECHMATICS
model: str = Field(
default="enhanced",
@ -1062,6 +991,7 @@ SPEACHES_STT_LANGUAGES = ["en", "ar", "nl", "fr", "de", "hi", "it", "pt", "es"]
@register_stt
class SpeachesSTTConfiguration(BaseSTTConfiguration):
model_config = SPEACHES_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
model: str = Field(
default="Systran/faster-distil-whisper-small.en",
@ -1095,6 +1025,7 @@ ASSEMBLYAI_STT_LANGUAGES = ["en", "es", "de", "fr", "pt", "it"]
@register_stt
class AssemblyAISTTConfiguration(BaseSTTConfiguration):
model_config = ASSEMBLYAI_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.ASSEMBLYAI] = ServiceProviders.ASSEMBLYAI
model: str = Field(
default="u3-rt-pro",
@ -1108,113 +1039,9 @@ class AssemblyAISTTConfiguration(BaseSTTConfiguration):
)
GLADIA_STT_MODELS = ["solaria-1"]
GLADIA_STT_LANGUAGES = [
"af",
"am",
"ar",
"as",
"az",
"ba",
"be",
"bg",
"bn",
"bo",
"br",
"bs",
"ca",
"cs",
"cy",
"da",
"de",
"el",
"en",
"es",
"et",
"eu",
"fa",
"fi",
"fo",
"fr",
"gl",
"gu",
"ha",
"haw",
"he",
"hi",
"hr",
"ht",
"hu",
"hy",
"id",
"is",
"it",
"ja",
"jw",
"ka",
"kk",
"km",
"kn",
"ko",
"la",
"lb",
"ln",
"lo",
"lt",
"lv",
"mg",
"mi",
"mk",
"ml",
"mn",
"mr",
"ms",
"mt",
"my",
"ne",
"nl",
"nn",
"no",
"oc",
"pa",
"pl",
"ps",
"pt",
"ro",
"ru",
"sa",
"sd",
"si",
"sk",
"sl",
"sn",
"so",
"sq",
"sr",
"su",
"sv",
"sw",
"ta",
"te",
"tg",
"th",
"tk",
"tl",
"tr",
"tt",
"uk",
"ur",
"uz",
"vi",
"wo",
"yi",
"yo",
"zh",
]
@register_stt
class GladiaSTTConfiguration(BaseSTTConfiguration):
model_config = GLADIA_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.GLADIA] = ServiceProviders.GLADIA
model: str = Field(
default="solaria-1",
@ -1233,6 +1060,7 @@ STTConfig = Annotated[
DeepgramSTTConfiguration,
CartesiaSTTConfiguration,
OpenAISTTConfiguration,
GoogleSTTConfiguration,
DograhSTTService,
SpeechmaticsSTTConfiguration,
SarvamSTTConfiguration,
@ -1250,6 +1078,7 @@ OPENAI_EMBEDDING_MODELS = ["text-embedding-3-small"]
@register_embeddings
class OpenAIEmbeddingsConfiguration(BaseEmbeddingsConfiguration):
model_config = OPENAI_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
model: str = Field(
default="text-embedding-3-small",
@ -1263,6 +1092,7 @@ OPENROUTER_EMBEDDING_MODELS = ["openai/text-embedding-3-small"]
@register_embeddings
class OpenRouterEmbeddingsConfiguration(BaseEmbeddingsConfiguration):
model_config = OPENROUTER_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.OPENROUTER] = ServiceProviders.OPENROUTER
model: str = Field(
default="openai/text-embedding-3-small",