feat: add google stt and tts. add folders to organize agents

This commit is contained in:
Abhishek Kumar 2026-05-22 14:36:50 +05:30
parent 21951eca18
commit ad2fa07058
52 changed files with 3412 additions and 621 deletions

View file

@ -0,0 +1,49 @@
from .deepgram import DEEPGRAM_LANGUAGES, DEEPGRAM_STT_MODELS
from .gladia import GLADIA_STT_LANGUAGES, GLADIA_STT_MODELS
from .google import (
GOOGLE_MODELS,
GOOGLE_REALTIME_LANGUAGES,
GOOGLE_REALTIME_MODELS,
GOOGLE_REALTIME_VOICES,
GOOGLE_STT_LANGUAGES,
GOOGLE_STT_MODELS,
GOOGLE_TTS_LANGUAGES,
GOOGLE_TTS_MODELS,
GOOGLE_TTS_VOICES,
GOOGLE_VERTEX_REALTIME_LANGUAGES,
GOOGLE_VERTEX_REALTIME_MODELS,
GOOGLE_VERTEX_REALTIME_VOICES,
)
from .sarvam import (
SARVAM_LANGUAGES,
SARVAM_STT_MODELS,
SARVAM_TTS_MODELS,
SARVAM_V2_VOICES,
SARVAM_V3_VOICES,
)
from .speechmatics import SPEECHMATICS_STT_LANGUAGES
__all__ = [
"DEEPGRAM_LANGUAGES",
"DEEPGRAM_STT_MODELS",
"GLADIA_STT_LANGUAGES",
"GLADIA_STT_MODELS",
"GOOGLE_MODELS",
"GOOGLE_REALTIME_LANGUAGES",
"GOOGLE_REALTIME_MODELS",
"GOOGLE_REALTIME_VOICES",
"GOOGLE_STT_LANGUAGES",
"GOOGLE_STT_MODELS",
"GOOGLE_TTS_LANGUAGES",
"GOOGLE_TTS_MODELS",
"GOOGLE_TTS_VOICES",
"GOOGLE_VERTEX_REALTIME_LANGUAGES",
"GOOGLE_VERTEX_REALTIME_MODELS",
"GOOGLE_VERTEX_REALTIME_VOICES",
"SARVAM_LANGUAGES",
"SARVAM_STT_MODELS",
"SARVAM_TTS_MODELS",
"SARVAM_V2_VOICES",
"SARVAM_V3_VOICES",
"SPEECHMATICS_STT_LANGUAGES",
]

View file

@ -0,0 +1,84 @@
DEEPGRAM_STT_MODELS = ("nova-3-general", "flux-general-en", "flux-general-multi")
DEEPGRAM_LANGUAGES = (
"multi",
"ar",
"ar-AE",
"ar-SA",
"ar-QA",
"ar-KW",
"ar-SY",
"ar-LB",
"ar-PS",
"ar-JO",
"ar-EG",
"ar-SD",
"ar-TD",
"ar-MA",
"ar-DZ",
"ar-TN",
"ar-IQ",
"ar-IR",
"be",
"bn",
"bs",
"bg",
"ca",
"cs",
"da",
"da-DK",
"de",
"de-CH",
"el",
"en",
"en-US",
"en-AU",
"en-GB",
"en-IN",
"en-NZ",
"es",
"es-419",
"et",
"fa",
"fi",
"fr",
"fr-CA",
"he",
"hi",
"hr",
"hu",
"id",
"it",
"ja",
"kn",
"ko",
"ko-KR",
"lt",
"lv",
"mk",
"mr",
"ms",
"nl",
"nl-BE",
"no",
"pl",
"pt",
"pt-BR",
"pt-PT",
"ro",
"ru",
"sk",
"sl",
"sr",
"sv",
"sv-SE",
"ta",
"te",
"th",
"tl",
"tr",
"uk",
"ur",
"vi",
"zh-CN",
"zh-TW",
)

View file

@ -0,0 +1,103 @@
GLADIA_STT_MODELS = ("solaria-1",)
GLADIA_STT_LANGUAGES = (
"af",
"am",
"ar",
"as",
"az",
"ba",
"be",
"bg",
"bn",
"bo",
"br",
"bs",
"ca",
"cs",
"cy",
"da",
"de",
"el",
"en",
"es",
"et",
"eu",
"fa",
"fi",
"fo",
"fr",
"gl",
"gu",
"ha",
"haw",
"he",
"hi",
"hr",
"ht",
"hu",
"hy",
"id",
"is",
"it",
"ja",
"jw",
"ka",
"kk",
"km",
"kn",
"ko",
"la",
"lb",
"ln",
"lo",
"lt",
"lv",
"mg",
"mi",
"mk",
"ml",
"mn",
"mr",
"ms",
"mt",
"my",
"ne",
"nl",
"nn",
"no",
"oc",
"pa",
"pl",
"ps",
"pt",
"ro",
"ru",
"sa",
"sd",
"si",
"sk",
"sl",
"sn",
"so",
"sq",
"sr",
"su",
"sv",
"sw",
"ta",
"te",
"tg",
"th",
"tk",
"tl",
"tr",
"tt",
"uk",
"ur",
"uz",
"vi",
"wo",
"yi",
"yo",
"zh",
)

View file

@ -0,0 +1,273 @@
GOOGLE_MODELS = (
"gemini-2.0-flash",
"gemini-2.0-flash-lite",
"gemini-2.5-flash",
"gemini-2.5-flash-lite",
)
GOOGLE_REALTIME_MODELS = ("gemini-3.1-flash-live-preview",)
GOOGLE_REALTIME_VOICES = ("Puck", "Charon", "Kore", "Fenrir", "Aoede")
GOOGLE_REALTIME_LANGUAGES = (
"ar",
"bn",
"de",
"en",
"es",
"fr",
"gu",
"hi",
"id",
"it",
"ja",
"kn",
"ko",
"ml",
"mr",
"nl",
"pl",
"pt",
"ru",
"ta",
"te",
"th",
"tr",
"vi",
"zh",
)
GOOGLE_VERTEX_REALTIME_MODELS = ("google/gemini-live-2.5-flash-native-audio",)
GOOGLE_VERTEX_REALTIME_VOICES = GOOGLE_REALTIME_VOICES
GOOGLE_VERTEX_REALTIME_LANGUAGES = GOOGLE_REALTIME_LANGUAGES
GOOGLE_STT_MODELS = ("latest_long", "latest_short", "chirp_3")
# Docs-derived from Google Cloud Speech-to-Text V2 supported languages.
GOOGLE_STT_LANGUAGES = (
"af-ZA",
"am-ET",
"ar-AE",
"ar-BH",
"ar-DZ",
"ar-EG",
"ar-IL",
"ar-IQ",
"ar-JO",
"ar-KW",
"ar-LB",
"ar-MA",
"ar-MR",
"ar-OM",
"ar-PS",
"ar-QA",
"ar-SA",
"ar-SY",
"ar-TN",
"ar-XA",
"ar-YE",
"as-IN",
"ast-ES",
"az-AZ",
"be-BY",
"bg-BG",
"bn-BD",
"bn-IN",
"bs-BA",
"ca-ES",
"ceb-PH",
"ckb-IQ",
"cmn-Hans-CN",
"cmn-Hant-TW",
"cs-CZ",
"cy-GB",
"da-DK",
"de-AT",
"de-CH",
"de-DE",
"el-GR",
"en-AU",
"en-GB",
"en-HK",
"en-IE",
"en-IN",
"en-NZ",
"en-PH",
"en-PK",
"en-SG",
"en-US",
"es-419",
"es-AR",
"es-BO",
"es-CL",
"es-CO",
"es-CR",
"es-DO",
"es-EC",
"es-ES",
"es-GT",
"es-HN",
"es-MX",
"es-NI",
"es-PA",
"es-PE",
"es-PR",
"es-SV",
"es-US",
"es-UY",
"es-VE",
"et-EE",
"eu-ES",
"fa-IR",
"ff-SN",
"fi-FI",
"fil-PH",
"fr-BE",
"fr-CA",
"fr-CH",
"fr-FR",
"ga-IE",
"gl-ES",
"gu-IN",
"ha-NG",
"hi-IN",
"hr-HR",
"hu-HU",
"hy-AM",
"id-ID",
"ig-NG",
"is-IS",
"it-CH",
"it-IT",
"iw-IL",
"ja-JP",
"jv-ID",
"ka-GE",
"kam-KE",
"kea-CV",
"kk-KZ",
"km-KH",
"kn-IN",
"ko-KR",
"ky-KG",
"lb-LU",
"lg-UG",
"ln-CD",
"lo-LA",
"lt-LT",
"luo-KE",
"lv-LV",
"mi-NZ",
"mk-MK",
"ml-IN",
"mn-MN",
"mr-IN",
"ms-MY",
"mt-MT",
"my-MM",
"ne-NP",
"nl-BE",
"nl-NL",
"no-NO",
"nso-ZA",
"ny-MW",
"oc-FR",
"om-ET",
"or-IN",
"pa-Guru-IN",
"pl-PL",
"ps-AF",
"pt-BR",
"pt-PT",
"ro-RO",
"ru-RU",
"rup-BG",
"rw-RW",
"sd-IN",
"si-LK",
"sk-SK",
"sl-SI",
"sn-ZW",
"so-SO",
"sq-AL",
"sr-RS",
"ss-Latn-ZA",
"st-ZA",
"su-ID",
"sv-SE",
"sw",
"sw-KE",
"ta-IN",
"te-IN",
"tg-TJ",
"th-TH",
"tn-Latn-ZA",
"tr-TR",
"ts-ZA",
"uk-UA",
"umb-AO",
"ur-PK",
"uz-UZ",
"ve-ZA",
"vi-VN",
"wo-SN",
"xh-ZA",
"yo-NG",
"yue-Hant-HK",
"zu-ZA",
)
GOOGLE_TTS_MODELS = ("chirp_3_hd",)
GOOGLE_TTS_VOICES = ("en-US-Chirp3-HD-Charon",)
GOOGLE_TTS_LANGUAGES = (
"ar-XA",
"bn-IN",
"bg-BG",
"yue-HK",
"hr-HR",
"cs-CZ",
"da-DK",
"nl-BE",
"nl-NL",
"en-AU",
"en-IN",
"en-GB",
"en-US",
"et-EE",
"fi-FI",
"fr-CA",
"fr-FR",
"de-DE",
"el-GR",
"gu-IN",
"he-IL",
"hi-IN",
"hu-HU",
"id-ID",
"it-IT",
"ja-JP",
"kn-IN",
"ko-KR",
"lv-LV",
"lt-LT",
"ml-IN",
"cmn-CN",
"mr-IN",
"nb-NO",
"pl-PL",
"pt-BR",
"pa-IN",
"ro-RO",
"ru-RU",
"sr-RS",
"sk-SK",
"sl-SI",
"es-ES",
"es-US",
"sw-KE",
"sv-SE",
"ta-IN",
"te-IN",
"th-TH",
"tr-TR",
"uk-UA",
"ur-IN",
"vi-VN",
)

View file

@ -0,0 +1,66 @@
SARVAM_TTS_MODELS = ("bulbul:v2", "bulbul:v3")
SARVAM_V2_VOICES = (
"anushka",
"manisha",
"vidya",
"arya",
"abhilash",
"karun",
"hitesh",
)
SARVAM_V3_VOICES = (
"shubh",
"aditya",
"ritu",
"priya",
"neha",
"rahul",
"pooja",
"rohan",
"simran",
"kavya",
"amit",
"dev",
"ishita",
"shreya",
"ratan",
"varun",
"manan",
"sumit",
"roopa",
"kabir",
"aayan",
"ashutosh",
"advait",
"amelia",
"sophia",
"anand",
"tanya",
"tarun",
"sunny",
"mani",
"gokul",
"vijay",
"shruti",
"suhani",
"mohit",
"kavitha",
"rehan",
"soham",
"rupali",
)
SARVAM_LANGUAGES = (
"bn-IN",
"en-IN",
"gu-IN",
"hi-IN",
"kn-IN",
"ml-IN",
"mr-IN",
"od-IN",
"pa-IN",
"ta-IN",
"te-IN",
"as-IN",
)
SARVAM_STT_MODELS = ("saarika:v2.5", "saaras:v2")

View file

@ -0,0 +1,63 @@
SPEECHMATICS_STT_LANGUAGES = (
"ar",
"ar_en",
"ba",
"eu",
"be",
"bn",
"bg",
"yue",
"ca",
"hr",
"cs",
"da",
"nl",
"en",
"eo",
"et",
"fi",
"fr",
"gl",
"de",
"el",
"he",
"hi",
"hu",
"id",
"ia",
"ga",
"it",
"ja",
"ko",
"lv",
"lt",
"ms",
"en_ms",
"mt",
"cmn",
"cmn_en",
"cmn_en_ms_ta",
"mr",
"mn",
"no",
"fa",
"pl",
"pt",
"ro",
"ru",
"sk",
"sl",
"es",
"sw",
"sv",
"tl",
"ta",
"en_ta",
"th",
"tr",
"uk",
"ur",
"ug",
"vi",
"cy",
)

View file

@ -2,7 +2,32 @@ import random
from enum import Enum, auto
from typing import Annotated, Dict, Literal, Type, TypeVar, Union
from pydantic import BaseModel, Field, computed_field, field_validator
from pydantic import BaseModel, ConfigDict, Field, computed_field, field_validator
from api.services.configuration.options import (
DEEPGRAM_LANGUAGES,
DEEPGRAM_STT_MODELS,
GLADIA_STT_LANGUAGES,
GLADIA_STT_MODELS,
GOOGLE_MODELS,
GOOGLE_REALTIME_LANGUAGES,
GOOGLE_REALTIME_MODELS,
GOOGLE_REALTIME_VOICES,
GOOGLE_STT_LANGUAGES,
GOOGLE_STT_MODELS,
GOOGLE_TTS_LANGUAGES,
GOOGLE_TTS_MODELS,
GOOGLE_TTS_VOICES,
GOOGLE_VERTEX_REALTIME_LANGUAGES,
GOOGLE_VERTEX_REALTIME_MODELS,
GOOGLE_VERTEX_REALTIME_VOICES,
SARVAM_LANGUAGES,
SARVAM_STT_MODELS,
SARVAM_TTS_MODELS,
SARVAM_V2_VOICES,
SARVAM_V3_VOICES,
SPEECHMATICS_STT_LANGUAGES,
)
class ServiceType(Enum):
@ -153,9 +178,56 @@ def register_embeddings(cls: Type[BaseEmbeddingsConfiguration]):
return register_service(ServiceType.EMBEDDINGS)(cls)
def provider_model_config(
title: str,
*,
description: str | None = None,
provider_docs_url: str | None = None,
) -> ConfigDict:
json_schema_extra: dict[str, str] = {}
if description is not None:
json_schema_extra["description"] = description
if provider_docs_url is not None:
json_schema_extra["provider_docs_url"] = provider_docs_url
if json_schema_extra:
return ConfigDict(title=title, json_schema_extra=json_schema_extra)
return ConfigDict(title=title)
###################################################### LLM ########################################################################
# Suggested models for each provider (used for UI dropdown)
OPENAI_PROVIDER_MODEL_CONFIG = provider_model_config("OpenAI")
GOOGLE_PROVIDER_MODEL_CONFIG = provider_model_config("Google")
GROQ_PROVIDER_MODEL_CONFIG = provider_model_config("Groq")
OPENROUTER_PROVIDER_MODEL_CONFIG = provider_model_config("Open Router")
AZURE_OPENAI_PROVIDER_MODEL_CONFIG = provider_model_config("Azure OpenAI")
DOGRAH_PROVIDER_MODEL_CONFIG = provider_model_config("Dograh")
AWS_BEDROCK_PROVIDER_MODEL_CONFIG = provider_model_config("AWS Bedrock")
OPENAI_REALTIME_PROVIDER_MODEL_CONFIG = provider_model_config("OpenAI Realtime")
GOOGLE_REALTIME_PROVIDER_MODEL_CONFIG = provider_model_config("Google Realtime")
GOOGLE_VERTEX_REALTIME_PROVIDER_MODEL_CONFIG = provider_model_config(
"Google Vertex Realtime"
)
DEEPGRAM_PROVIDER_MODEL_CONFIG = provider_model_config("Deepgram")
ELEVENLABS_PROVIDER_MODEL_CONFIG = provider_model_config("ElevenLabs")
CARTESIA_PROVIDER_MODEL_CONFIG = provider_model_config("Cartesia")
SARVAM_PROVIDER_MODEL_CONFIG = provider_model_config("Sarvam")
CAMB_PROVIDER_MODEL_CONFIG = provider_model_config("Camb.ai")
RIME_PROVIDER_MODEL_CONFIG = provider_model_config("Rime")
GOOGLE_CLOUD_PROVIDER_MODEL_CONFIG = provider_model_config("Google Cloud")
SPEECHMATICS_PROVIDER_MODEL_CONFIG = provider_model_config("Speechmatics")
ASSEMBLYAI_PROVIDER_MODEL_CONFIG = provider_model_config("AssemblyAI")
GLADIA_PROVIDER_MODEL_CONFIG = provider_model_config("Gladia")
SPEACHES_PROVIDER_MODEL_CONFIG = provider_model_config(
"Local Models (Speaches)",
description=(
"Self-hosted OpenAI-compatible local models. See the Speaches project "
"for setup and supported backends."
),
provider_docs_url="https://github.com/speaches-ai/speaches",
)
OPENAI_MODELS = [
"gpt-4.1",
"gpt-4.1-mini",
@ -165,12 +237,6 @@ OPENAI_MODELS = [
"gpt-5-nano",
"gpt-3.5-turbo",
]
GOOGLE_MODELS = [
"gemini-2.0-flash",
"gemini-2.0-flash-lite",
"gemini-2.5-flash",
"gemini-2.5-flash-lite",
]
GROQ_MODELS = [
"llama-3.3-70b-versatile",
"deepseek-r1-distill-llama-70b",
@ -204,6 +270,7 @@ AWS_BEDROCK_MODELS = [
@register_llm
class OpenAILLMService(BaseLLMConfiguration):
model_config = OPENAI_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
model: str = Field(
default="gpt-4.1",
@ -214,6 +281,7 @@ class OpenAILLMService(BaseLLMConfiguration):
@register_llm
class GoogleLLMService(BaseLLMConfiguration):
model_config = GOOGLE_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE
model: str = Field(
default="gemini-2.0-flash",
@ -224,6 +292,7 @@ class GoogleLLMService(BaseLLMConfiguration):
@register_llm
class GroqLLMService(BaseLLMConfiguration):
model_config = GROQ_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.GROQ] = ServiceProviders.GROQ
model: str = Field(
default="llama-3.3-70b-versatile",
@ -234,6 +303,7 @@ class GroqLLMService(BaseLLMConfiguration):
@register_llm
class OpenRouterLLMConfiguration(BaseLLMConfiguration):
model_config = OPENROUTER_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.OPENROUTER] = ServiceProviders.OPENROUTER
model: str = Field(
default="openai/gpt-4.1",
@ -249,6 +319,7 @@ class OpenRouterLLMConfiguration(BaseLLMConfiguration):
@register_llm
class AzureLLMService(BaseLLMConfiguration):
model_config = AZURE_OPENAI_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.AZURE] = ServiceProviders.AZURE
model: str = Field(
default="gpt-4.1-mini",
@ -263,6 +334,7 @@ class AzureLLMService(BaseLLMConfiguration):
@register_llm
class DograhLLMService(BaseLLMConfiguration):
model_config = DOGRAH_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
model: str = Field(
default="default",
@ -273,6 +345,7 @@ class DograhLLMService(BaseLLMConfiguration):
@register_llm
class AWSBedrockLLMConfiguration(BaseLLMConfiguration):
model_config = AWS_BEDROCK_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.AWS_BEDROCK] = ServiceProviders.AWS_BEDROCK
model: str = Field(
default="us.amazon.nova-pro-v1:0",
@ -302,6 +375,7 @@ SPEACHES_LLM_MODELS = ["llama3", "mistral", "phi3", "qwen2", "gemma2", "deepseek
@register_llm
class SpeachesLLMConfiguration(BaseLLMConfiguration):
model_config = SPEACHES_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
model: str = Field(
default="llama3",
@ -336,6 +410,7 @@ OPENAI_REALTIME_VOICES = [
@register_service(ServiceType.REALTIME)
class OpenAIRealtimeLLMConfiguration(BaseLLMConfiguration):
model_config = OPENAI_REALTIME_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.OPENAI_REALTIME] = (
ServiceProviders.OPENAI_REALTIME
)
@ -357,39 +432,9 @@ class OpenAIRealtimeLLMConfiguration(BaseLLMConfiguration):
)
GOOGLE_REALTIME_MODELS = ["gemini-3.1-flash-live-preview"]
GOOGLE_REALTIME_VOICES = ["Puck", "Charon", "Kore", "Fenrir", "Aoede"]
GOOGLE_REALTIME_LANGUAGES = [
"ar",
"bn",
"de",
"en",
"es",
"fr",
"gu",
"hi",
"id",
"it",
"ja",
"kn",
"ko",
"ml",
"mr",
"nl",
"pl",
"pt",
"ru",
"ta",
"te",
"th",
"tr",
"vi",
"zh",
]
@register_service(ServiceType.REALTIME)
class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration):
model_config = GOOGLE_REALTIME_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.GOOGLE_REALTIME] = (
ServiceProviders.GOOGLE_REALTIME
)
@ -419,15 +464,9 @@ class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration):
)
GOOGLE_VERTEX_REALTIME_MODELS = [
"google/gemini-live-2.5-flash-native-audio",
]
GOOGLE_VERTEX_REALTIME_VOICES = GOOGLE_REALTIME_VOICES
GOOGLE_VERTEX_REALTIME_LANGUAGES = GOOGLE_REALTIME_LANGUAGES
@register_service(ServiceType.REALTIME)
class GoogleVertexRealtimeLLMConfiguration(BaseLLMConfiguration):
model_config = GOOGLE_VERTEX_REALTIME_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.GOOGLE_VERTEX_REALTIME] = (
ServiceProviders.GOOGLE_VERTEX_REALTIME
)
@ -512,6 +551,7 @@ RealtimeConfig = Annotated[
@register_tts
class DeepgramTTSConfiguration(BaseServiceConfiguration):
model_config = DEEPGRAM_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
voice: str = Field(
default="aura-2-helena-en",
@ -537,6 +577,7 @@ ELEVENLABS_TTS_MODELS = ["eleven_flash_v2_5"]
@register_tts
class ElevenlabsTTSConfiguration(BaseServiceConfiguration):
model_config = ELEVENLABS_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.ELEVENLABS] = ServiceProviders.ELEVENLABS
voice: str = Field(
default="21m00Tcm4TlvDq8ikWAM",
@ -558,11 +599,70 @@ class ElevenlabsTTSConfiguration(BaseServiceConfiguration):
)
@register_tts
class GoogleTTSConfiguration(BaseTTSConfiguration):
model_config = GOOGLE_CLOUD_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE
model: str = Field(
default="chirp_3_hd",
description=(
"Google Cloud low-latency TTS engine. Dograh maps this to Pipecat's "
"streaming Google TTS service for Chirp 3 HD and Journey voices."
),
json_schema_extra={
"examples": GOOGLE_TTS_MODELS,
"allow_custom_input": True,
},
)
voice: str = Field(
default="en-US-Chirp3-HD-Charon",
description="Google Cloud voice name. Use a Chirp 3 HD or Journey voice for streaming TTS.",
json_schema_extra={
"examples": GOOGLE_TTS_VOICES,
"allow_custom_input": True,
},
)
language: str = Field(
default="en-US",
description="BCP-47 language code for synthesis.",
json_schema_extra={
"examples": GOOGLE_TTS_LANGUAGES,
"allow_custom_input": True,
},
)
speed: float = Field(
default=1.0,
ge=0.25,
le=2.0,
description="Speech speed multiplier for Google streaming TTS.",
)
location: str | None = Field(
default=None,
description=(
"Optional Google Cloud regional Text-to-Speech endpoint (for example "
"'us-central1'). Leave blank to use the default endpoint."
),
)
credentials: str | None = Field(
default=None,
description=(
"Paste the entire Google Cloud service-account JSON. If omitted, "
"the server falls back to Application Default Credentials (ADC)."
),
json_schema_extra={"multiline": True},
)
api_key: str | list[str] | None = Field(
default=None,
description="Not used for Google Cloud TTS. Leave blank.",
)
OPENAI_TTS_MODELS = ["gpt-4o-mini-tts"]
@register_tts
class OpenAITTSService(BaseTTSConfiguration):
model_config = OPENAI_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
model: str = Field(
default="gpt-4o-mini-tts",
@ -580,6 +680,7 @@ DOGRAH_TTS_MODELS = ["default"]
@register_tts
class DograhTTSService(BaseTTSConfiguration):
model_config = DOGRAH_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
model: str = Field(
default="default",
@ -598,6 +699,7 @@ CARTESIA_TTS_MODELS = ["sonic-3"]
@register_tts
class CartesiaTTSConfiguration(BaseTTSConfiguration):
model_config = CARTESIA_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.CARTESIA] = ServiceProviders.CARTESIA
model: str = Field(
default="sonic-3",
@ -617,75 +719,9 @@ class CartesiaTTSConfiguration(BaseTTSConfiguration):
)
SARVAM_TTS_MODELS = ["bulbul:v2", "bulbul:v3"]
SARVAM_V2_VOICES = [
"anushka",
"manisha",
"vidya",
"arya",
"abhilash",
"karun",
"hitesh",
]
SARVAM_V3_VOICES = [
"shubh",
"aditya",
"ritu",
"priya",
"neha",
"rahul",
"pooja",
"rohan",
"simran",
"kavya",
"amit",
"dev",
"ishita",
"shreya",
"ratan",
"varun",
"manan",
"sumit",
"roopa",
"kabir",
"aayan",
"ashutosh",
"advait",
"amelia",
"sophia",
"anand",
"tanya",
"tarun",
"sunny",
"mani",
"gokul",
"vijay",
"shruti",
"suhani",
"mohit",
"kavitha",
"rehan",
"soham",
"rupali",
]
SARVAM_LANGUAGES = [
"bn-IN",
"en-IN",
"gu-IN",
"hi-IN",
"kn-IN",
"ml-IN",
"mr-IN",
"od-IN",
"pa-IN",
"ta-IN",
"te-IN",
"as-IN",
]
@register_tts
class SarvamTTSConfiguration(BaseTTSConfiguration):
model_config = SARVAM_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
model: str = Field(
default="bulbul:v2",
@ -715,6 +751,7 @@ CAMB_TTS_MODELS = ["mars-flash", "mars-pro", "mars-instruct"]
@register_tts
class CambTTSConfiguration(BaseTTSConfiguration):
model_config = CAMB_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.CAMB] = ServiceProviders.CAMB
model: str = Field(
default="mars-flash",
@ -731,6 +768,7 @@ RIME_TTS_LANGUAGES = ["en", "de", "fr", "es", "hi"]
@register_tts
class RimeTTSConfiguration(BaseTTSConfiguration):
model_config = RIME_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.RIME] = ServiceProviders.RIME
model: str = Field(
default="arcana",
@ -756,6 +794,7 @@ SPEACHES_TTS_MODELS = ["hexgrad/Kokoro-82M"]
@register_tts
class SpeachesTTSConfiguration(BaseTTSConfiguration):
model_config = SPEACHES_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
model: str = Field(
default="kokoro",
@ -786,6 +825,7 @@ class SpeachesTTSConfiguration(BaseTTSConfiguration):
TTSConfig = Annotated[
Union[
DeepgramTTSConfiguration,
GoogleTTSConfiguration,
OpenAITTSService,
ElevenlabsTTSConfiguration,
CartesiaTTSConfiguration,
@ -801,94 +841,9 @@ TTSConfig = Annotated[
###################################################### STT ########################################################################
DEEPGRAM_STT_MODELS = ["nova-3-general", "flux-general-en", "flux-general-multi"]
DEEPGRAM_LANGUAGES = [
"multi",
"ar",
"ar-AE",
"ar-SA",
"ar-QA",
"ar-KW",
"ar-SY",
"ar-LB",
"ar-PS",
"ar-JO",
"ar-EG",
"ar-SD",
"ar-TD",
"ar-MA",
"ar-DZ",
"ar-TN",
"ar-IQ",
"ar-IR",
"be",
"bn",
"bs",
"bg",
"ca",
"cs",
"da",
"da-DK",
"de",
"de-CH",
"el",
"en",
"en-US",
"en-AU",
"en-GB",
"en-IN",
"en-NZ",
"es",
"es-419",
"et",
"fa",
"fi",
"fr",
"fr-CA",
"he",
"hi",
"hr",
"hu",
"id",
"it",
"ja",
"kn",
"ko",
"ko-KR",
"lt",
"lv",
"mk",
"mr",
"ms",
"nl",
"nl-BE",
"no",
"pl",
"pt",
"pt-BR",
"pt-PT",
"ro",
"ru",
"sk",
"sl",
"sr",
"sv",
"sv-SE",
"ta",
"te",
"th",
"tl",
"tr",
"uk",
"ur",
"vi",
"zh-CN",
"zh-TW",
]
@register_stt
class DeepgramSTTConfiguration(BaseSTTConfiguration):
model_config = DEEPGRAM_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
model: str = Field(
default="nova-3-general",
@ -902,7 +857,7 @@ class DeepgramSTTConfiguration(BaseSTTConfiguration):
"examples": DEEPGRAM_LANGUAGES,
"model_options": {
"nova-3-general": DEEPGRAM_LANGUAGES,
"flux-general-en": ["en"],
"flux-general-en": ("en",),
},
},
)
@ -913,6 +868,7 @@ CARTESIA_STT_MODELS = ["ink-whisper"]
@register_stt
class CartesiaSTTConfiguration(BaseSTTConfiguration):
model_config = CARTESIA_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.CARTESIA] = ServiceProviders.CARTESIA
model: str = Field(
default="ink-whisper",
@ -926,6 +882,7 @@ OPENAI_STT_MODELS = ["gpt-4o-transcribe"]
@register_stt
class OpenAISTTConfiguration(BaseSTTConfiguration):
model_config = OPENAI_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
model: str = Field(
default="gpt-4o-transcribe",
@ -934,6 +891,45 @@ class OpenAISTTConfiguration(BaseSTTConfiguration):
)
@register_stt
class GoogleSTTConfiguration(BaseSTTConfiguration):
model_config = GOOGLE_CLOUD_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE
model: str = Field(
default="latest_long",
description="Google Cloud Speech-to-Text V2 recognition model.",
json_schema_extra={
"examples": GOOGLE_STT_MODELS,
"allow_custom_input": True,
},
)
language: str = Field(
default="en-US",
description="Primary BCP-47 language code for recognition.",
json_schema_extra={
"examples": GOOGLE_STT_LANGUAGES,
"allow_custom_input": True,
"docs_url": "https://docs.cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages",
},
)
location: str = Field(
default="global",
description="Google Cloud Speech-to-Text region (for example 'global' or 'us-central1').",
)
credentials: str | None = Field(
default=None,
description=(
"Paste the entire Google Cloud service-account JSON. If omitted, "
"the server falls back to Application Default Credentials (ADC)."
),
json_schema_extra={"multiline": True},
)
api_key: str | list[str] | None = Field(
default=None,
description="Not used for Google Cloud STT. Leave blank.",
)
# Dograh STT Service
DOGRAH_STT_MODELS = ["default"]
DOGRAH_STT_LANGUAGES = DEEPGRAM_LANGUAGES
@ -941,6 +937,7 @@ DOGRAH_STT_LANGUAGES = DEEPGRAM_LANGUAGES
@register_stt
class DograhSTTService(BaseSTTConfiguration):
model_config = DOGRAH_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
model: str = Field(
default="default",
@ -954,12 +951,9 @@ class DograhSTTService(BaseSTTConfiguration):
)
# Sarvam STT Service
SARVAM_STT_MODELS = ["saarika:v2.5", "saaras:v2"]
@register_stt
class SarvamSTTConfiguration(BaseSTTConfiguration):
model_config = SARVAM_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
model: str = Field(
default="saarika:v2.5",
@ -973,74 +967,9 @@ class SarvamSTTConfiguration(BaseSTTConfiguration):
)
# Speechmatics STT Service
SPEECHMATICS_STT_LANGUAGES = [
"ar",
"ar_en",
"ba",
"eu",
"be",
"bn",
"bg",
"yue",
"ca",
"hr",
"cs",
"da",
"nl",
"en",
"eo",
"et",
"fi",
"fr",
"gl",
"de",
"el",
"he",
"hi",
"hu",
"id",
"ia",
"ga",
"it",
"ja",
"ko",
"lv",
"lt",
"ms",
"en_ms",
"mt",
"cmn",
"cmn_en",
"cmn_en_ms_ta",
"mr",
"mn",
"no",
"fa",
"pl",
"pt",
"ro",
"ru",
"sk",
"sl",
"es",
"sw",
"sv",
"tl",
"ta",
"en_ta",
"th",
"tr",
"uk",
"ur",
"ug",
"vi",
"cy",
]
@register_stt
class SpeechmaticsSTTConfiguration(BaseSTTConfiguration):
model_config = SPEECHMATICS_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.SPEECHMATICS] = ServiceProviders.SPEECHMATICS
model: str = Field(
default="enhanced",
@ -1062,6 +991,7 @@ SPEACHES_STT_LANGUAGES = ["en", "ar", "nl", "fr", "de", "hi", "it", "pt", "es"]
@register_stt
class SpeachesSTTConfiguration(BaseSTTConfiguration):
model_config = SPEACHES_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
model: str = Field(
default="Systran/faster-distil-whisper-small.en",
@ -1095,6 +1025,7 @@ ASSEMBLYAI_STT_LANGUAGES = ["en", "es", "de", "fr", "pt", "it"]
@register_stt
class AssemblyAISTTConfiguration(BaseSTTConfiguration):
model_config = ASSEMBLYAI_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.ASSEMBLYAI] = ServiceProviders.ASSEMBLYAI
model: str = Field(
default="u3-rt-pro",
@ -1108,113 +1039,9 @@ class AssemblyAISTTConfiguration(BaseSTTConfiguration):
)
GLADIA_STT_MODELS = ["solaria-1"]
GLADIA_STT_LANGUAGES = [
"af",
"am",
"ar",
"as",
"az",
"ba",
"be",
"bg",
"bn",
"bo",
"br",
"bs",
"ca",
"cs",
"cy",
"da",
"de",
"el",
"en",
"es",
"et",
"eu",
"fa",
"fi",
"fo",
"fr",
"gl",
"gu",
"ha",
"haw",
"he",
"hi",
"hr",
"ht",
"hu",
"hy",
"id",
"is",
"it",
"ja",
"jw",
"ka",
"kk",
"km",
"kn",
"ko",
"la",
"lb",
"ln",
"lo",
"lt",
"lv",
"mg",
"mi",
"mk",
"ml",
"mn",
"mr",
"ms",
"mt",
"my",
"ne",
"nl",
"nn",
"no",
"oc",
"pa",
"pl",
"ps",
"pt",
"ro",
"ru",
"sa",
"sd",
"si",
"sk",
"sl",
"sn",
"so",
"sq",
"sr",
"su",
"sv",
"sw",
"ta",
"te",
"tg",
"th",
"tk",
"tl",
"tr",
"tt",
"uk",
"ur",
"uz",
"vi",
"wo",
"yi",
"yo",
"zh",
]
@register_stt
class GladiaSTTConfiguration(BaseSTTConfiguration):
model_config = GLADIA_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.GLADIA] = ServiceProviders.GLADIA
model: str = Field(
default="solaria-1",
@ -1233,6 +1060,7 @@ STTConfig = Annotated[
DeepgramSTTConfiguration,
CartesiaSTTConfiguration,
OpenAISTTConfiguration,
GoogleSTTConfiguration,
DograhSTTService,
SpeechmaticsSTTConfiguration,
SarvamSTTConfiguration,
@ -1250,6 +1078,7 @@ OPENAI_EMBEDDING_MODELS = ["text-embedding-3-small"]
@register_embeddings
class OpenAIEmbeddingsConfiguration(BaseEmbeddingsConfiguration):
model_config = OPENAI_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
model: str = Field(
default="text-embedding-3-small",
@ -1263,6 +1092,7 @@ OPENROUTER_EMBEDDING_MODELS = ["openai/text-embedding-3-small"]
@register_embeddings
class OpenRouterEmbeddingsConfiguration(BaseEmbeddingsConfiguration):
model_config = OPENROUTER_PROVIDER_MODEL_CONFIG
provider: Literal[ServiceProviders.OPENROUTER] = ServiceProviders.OPENROUTER
model: str = Field(
default="openai/text-embedding-3-small",

View file

@ -26,6 +26,8 @@ from pipecat.services.dograh.tts import DograhTTSService, DograhTTSSettings
from pipecat.services.elevenlabs.tts import ElevenLabsTTSService, ElevenLabsTTSSettings
from pipecat.services.gladia.stt import GladiaSTTService, GladiaSTTSettings
from pipecat.services.google.llm import GoogleLLMService, GoogleLLMSettings
from pipecat.services.google.stt import GoogleSTTService, GoogleSTTSettings
from pipecat.services.google.tts import GoogleTTSService, GoogleTTSSettings
from pipecat.services.groq.llm import GroqLLMService, GroqLLMSettings
from pipecat.services.openai.base_llm import OpenAILLMSettings
from pipecat.services.openai.llm import OpenAILLMService
@ -101,6 +103,23 @@ def create_stt_service(
api_key=user_config.stt.api_key,
settings=OpenAISTTSettings(model=user_config.stt.model),
)
elif user_config.stt.provider == ServiceProviders.GOOGLE.value:
language = getattr(user_config.stt, "language", None) or "en-US"
location = getattr(user_config.stt, "location", None) or "global"
credentials = getattr(user_config.stt, "credentials", None)
settings_kwargs = {"model": user_config.stt.model}
try:
settings_kwargs["languages"] = [Language(language)]
except ValueError:
settings_kwargs["language_codes"] = [language]
return GoogleSTTService(
credentials=credentials,
location=location,
settings=GoogleSTTSettings(**settings_kwargs),
sample_rate=audio_config.transport_in_sample_rate,
)
elif user_config.stt.provider == ServiceProviders.CARTESIA.value:
return CartesiaSTTService(
api_key=user_config.stt.api_key,
@ -241,6 +260,30 @@ def create_tts_service(user_config, audio_config: "AudioConfig"):
skip_aggregator_types=["recording_router", "recording"],
silence_time_s=1.0,
)
elif user_config.tts.provider == ServiceProviders.GOOGLE.value:
model = getattr(user_config.tts, "model", None) or "chirp_3_hd"
language = getattr(user_config.tts, "language", None) or "en-US"
voice = getattr(user_config.tts, "voice", None) or "en-US-Chirp3-HD-Charon"
speed = getattr(user_config.tts, "speed", None)
location = getattr(user_config.tts, "location", None) or None
credentials = getattr(user_config.tts, "credentials", None)
settings_kwargs = {
"model": model,
"voice": voice,
"language": language,
}
if speed is not None and speed != 1.0:
settings_kwargs["speaking_rate"] = speed
return GoogleTTSService(
credentials=credentials,
location=location,
settings=GoogleTTSSettings(**settings_kwargs),
text_filters=[xml_function_tag_filter],
skip_aggregator_types=["recording_router", "recording"],
silence_time_s=1.0,
)
elif user_config.tts.provider == ServiceProviders.ELEVENLABS.value:
# Backward compatible with older configuration "Name - voice_id"
try:

View file

@ -610,13 +610,14 @@ class GlobalNodeData(BaseNodeData, _PromptedNodeDataMixin):
"trigger_path": {
"display_name": "Trigger Path",
"description": (
"Auto-generated UUID-style path segment that uniquely identifies "
"Path segment that uniquely identifies "
"this trigger. Used in both URLs:\n"
" • Production: `/api/v1/public/agent/<trigger_path>` — executes "
"the published agent.\n"
" • Test: `/api/v1/public/agent/test/<trigger_path>` — executes "
"the latest draft.\n"
"Do not edit manually."
"Can be customized to a descriptive value up to 36 characters "
"using letters, numbers, hyphens, or underscores."
),
},
},

View file

@ -0,0 +1,142 @@
import copy
import re
import uuid
from dataclasses import dataclass
from typing import Optional
TRIGGER_PATH_MAX_LENGTH = 36
TRIGGER_PATH_PATTERN = re.compile(r"^[A-Za-z0-9_-]+$")
@dataclass(frozen=True)
class TriggerPathIssue:
node_id: str | None
trigger_path: str
message: str
def extract_trigger_paths(workflow_definition: Optional[dict]) -> list[str]:
"""Extract trigger paths from a workflow definition."""
if not workflow_definition:
return []
trigger_paths: list[str] = []
for node in workflow_definition.get("nodes") or []:
if node.get("type") != "trigger":
continue
trigger_path = (node.get("data") or {}).get("trigger_path")
if isinstance(trigger_path, str) and trigger_path:
trigger_paths.append(trigger_path)
return trigger_paths
def trigger_path_to_node_id(workflow_definition: Optional[dict]) -> dict[str, str]:
"""Map each trigger node's trigger_path to its node id."""
if not workflow_definition:
return {}
out: dict[str, str] = {}
for node in workflow_definition.get("nodes") or []:
if node.get("type") != "trigger":
continue
trigger_path = (node.get("data") or {}).get("trigger_path")
if isinstance(trigger_path, str) and trigger_path:
out[trigger_path] = node.get("id")
return out
def regenerate_trigger_uuids(workflow_definition: Optional[dict]) -> Optional[dict]:
"""Regenerate UUIDs for all trigger nodes in a workflow definition."""
if not workflow_definition:
return workflow_definition
updated_definition = copy.deepcopy(workflow_definition)
for node in updated_definition.get("nodes") or []:
if node.get("type") != "trigger":
continue
data = node.setdefault("data", {})
data["trigger_path"] = str(uuid.uuid4())
return updated_definition
def ensure_trigger_paths(workflow_definition: Optional[dict]) -> Optional[dict]:
"""Mint UUIDs for trigger nodes that do not already have a path."""
if not workflow_definition:
return workflow_definition
out = copy.deepcopy(workflow_definition)
for node in out.get("nodes") or []:
if node.get("type") != "trigger":
continue
data = node.setdefault("data", {})
if not data.get("trigger_path"):
data["trigger_path"] = str(uuid.uuid4())
return out
def validate_trigger_paths(
workflow_definition: Optional[dict],
) -> list[TriggerPathIssue]:
"""Validate custom trigger paths before they reach persistence/runtime."""
if not workflow_definition:
return []
issues: list[TriggerPathIssue] = []
seen_paths: dict[str, str | None] = {}
for node in workflow_definition.get("nodes") or []:
if node.get("type") != "trigger":
continue
node_id = node.get("id")
trigger_path = (node.get("data") or {}).get("trigger_path")
if not trigger_path:
continue
if not isinstance(trigger_path, str):
issues.append(
TriggerPathIssue(
node_id=node_id,
trigger_path=repr(trigger_path),
message="Trigger path must be a string.",
)
)
continue
if len(trigger_path) > TRIGGER_PATH_MAX_LENGTH:
issues.append(
TriggerPathIssue(
node_id=node_id,
trigger_path=trigger_path,
message=(
f"Trigger path must be {TRIGGER_PATH_MAX_LENGTH} "
"characters or fewer."
),
)
)
if not TRIGGER_PATH_PATTERN.fullmatch(trigger_path):
issues.append(
TriggerPathIssue(
node_id=node_id,
trigger_path=trigger_path,
message=(
"Trigger path must be a single URL path segment using "
"only letters, numbers, hyphens, and underscores."
),
)
)
first_node_id = seen_paths.get(trigger_path)
if first_node_id is None:
seen_paths[trigger_path] = node_id
else:
issues.append(
TriggerPathIssue(
node_id=node_id,
trigger_path=trigger_path,
message="Trigger path is duplicated in this workflow.",
)
)
return issues