From 0b005dad58a385c4a14cbe5a4edb987597245d94 Mon Sep 17 00:00:00 2001 From: Abhishek Kumar Date: Sat, 16 May 2026 17:44:49 +0530 Subject: [PATCH] fix: sampling rate fix for openai realtime --- api/services/configuration/check_validity.py | 24 +++ api/services/configuration/registry.py | 204 ++++++++++++++---- docs/configurations/inference-providers.mdx | 64 ++++++ pipecat | 2 +- .../components/ServiceConfigurationForm.tsx | 56 ++++- 5 files changed, 296 insertions(+), 54 deletions(-) diff --git a/api/services/configuration/check_validity.py b/api/services/configuration/check_validity.py index a78434d..ca46cfc 100644 --- a/api/services/configuration/check_validity.py +++ b/api/services/configuration/check_validity.py @@ -49,6 +49,7 @@ class UserConfigurationValidator: ServiceProviders.SPEACHES.value: self._check_speaches_api_key, ServiceProviders.OPENAI_REALTIME.value: self._check_openai_api_key, ServiceProviders.GOOGLE_REALTIME.value: self._check_google_api_key, + ServiceProviders.GOOGLE_VERTEX_REALTIME.value: self._check_google_vertex_realtime_api_key, ServiceProviders.ASSEMBLYAI.value: self._check_assemblyai_api_key, ServiceProviders.GLADIA.value: self._check_gladia_api_key, ServiceProviders.RIME.value: self._check_rime_api_key, @@ -116,6 +117,22 @@ class UserConfigurationValidator: return [{"model": service_name, "message": str(e)}] return [] + # Vertex Realtime uses service-account credentials (or ADC) instead of api_key + if provider == ServiceProviders.GOOGLE_VERTEX_REALTIME.value: + try: + if not self._check_google_vertex_realtime_api_key( + provider, service_config + ): + return [ + { + "model": service_name, + "message": f"Invalid {provider} configuration", + } + ] + except ValueError as e: + return [{"model": service_name, "message": str(e)}] + return [] + # AWS Bedrock uses AWS credentials instead of api_key if provider == ServiceProviders.AWS_BEDROCK.value: try: @@ -216,6 +233,13 @@ class UserConfigurationValidator: raise ValueError("base_url is required for Speaches services") return True + def _check_google_vertex_realtime_api_key(self, model: str, service_config) -> bool: + if not getattr(service_config, "project_id", None): + raise ValueError("project_id is required for Google Vertex Realtime") + if not getattr(service_config, "location", None): + raise ValueError("location is required for Google Vertex Realtime") + return True + def _check_aws_bedrock_api_key(self, model: str, service_config) -> bool: if not service_config.aws_access_key or not service_config.aws_secret_key: raise ValueError("AWS access key and secret key are required for Bedrock") diff --git a/api/services/configuration/registry.py b/api/services/configuration/registry.py index 441cd6e..3875fc7 100644 --- a/api/services/configuration/registry.py +++ b/api/services/configuration/registry.py @@ -207,6 +207,7 @@ class OpenAILLMService(BaseLLMConfiguration): provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI model: str = Field( default="gpt-4.1", + description="OpenAI chat model to use.", json_schema_extra={"examples": OPENAI_MODELS, "allow_custom_input": True}, ) @@ -216,6 +217,7 @@ class GoogleLLMService(BaseLLMConfiguration): provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE model: str = Field( default="gemini-2.0-flash", + description="Gemini model on Google AI Studio (not Vertex).", json_schema_extra={"examples": GOOGLE_MODELS, "allow_custom_input": True}, ) @@ -225,6 +227,7 @@ class GroqLLMService(BaseLLMConfiguration): provider: Literal[ServiceProviders.GROQ] = ServiceProviders.GROQ model: str = Field( default="llama-3.3-70b-versatile", + description="Groq-hosted model identifier.", json_schema_extra={"examples": GROQ_MODELS, "allow_custom_input": True}, ) @@ -234,10 +237,14 @@ class OpenRouterLLMConfiguration(BaseLLMConfiguration): provider: Literal[ServiceProviders.OPENROUTER] = ServiceProviders.OPENROUTER model: str = Field( default="openai/gpt-4.1", + description="OpenRouter model slug in 'vendor/model' form.", json_schema_extra={"examples": OPENROUTER_MODELS, "allow_custom_input": True}, ) - base_url: str = Field(default="https://openrouter.ai/api/v1") + base_url: str = Field( + default="https://openrouter.ai/api/v1", + description="Override only if proxying OpenRouter through your own gateway.", + ) @register_llm @@ -245,10 +252,13 @@ class AzureLLMService(BaseLLMConfiguration): provider: Literal[ServiceProviders.AZURE] = ServiceProviders.AZURE model: str = Field( default="gpt-4.1-mini", + description="Azure deployment name (not the upstream OpenAI model id).", json_schema_extra={"examples": AZURE_MODELS, "allow_custom_input": True}, ) - endpoint: str + endpoint: str = Field( + description="Azure OpenAI resource endpoint (e.g. https://.openai.azure.com).", + ) @register_llm @@ -256,6 +266,7 @@ class DograhLLMService(BaseLLMConfiguration): provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH model: str = Field( default="default", + description="Dograh-hosted model tier.", json_schema_extra={"examples": DOGRAH_LLM_MODELS, "allow_custom_input": True}, ) @@ -265,12 +276,25 @@ class AWSBedrockLLMConfiguration(BaseLLMConfiguration): provider: Literal[ServiceProviders.AWS_BEDROCK] = ServiceProviders.AWS_BEDROCK model: str = Field( default="us.amazon.nova-pro-v1:0", + description="Bedrock model ID — include the region inference-profile prefix (e.g. 'us.').", json_schema_extra={"examples": AWS_BEDROCK_MODELS, "allow_custom_input": True}, ) - aws_access_key: str = Field(default="") - aws_secret_key: str = Field(default="") - aws_region: str = Field(default="us-east-1") - api_key: str | list[str] | None = Field(default=None) + aws_access_key: str = Field( + default="", + description="AWS access key ID with bedrock:InvokeModel permission.", + ) + aws_secret_key: str = Field( + default="", + description="AWS secret access key paired with the access key ID.", + ) + aws_region: str = Field( + default="us-east-1", + description="AWS region where the Bedrock model is available.", + ) + api_key: str | list[str] | None = Field( + default=None, + description="Not used for Bedrock — authentication is via the AWS credentials above. Leave blank.", + ) SPEACHES_LLM_MODELS = ["llama3", "mistral", "phi3", "qwen2", "gemma2", "deepseek-r1"] @@ -281,6 +305,7 @@ class SpeachesLLMConfiguration(BaseLLMConfiguration): provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES model: str = Field( default="llama3", + description="Model name as exposed by your OpenAI-compatible server.", json_schema_extra={ "examples": SPEACHES_LLM_MODELS, "allow_custom_input": True, @@ -288,9 +313,12 @@ class SpeachesLLMConfiguration(BaseLLMConfiguration): ) base_url: str = Field( default="http://localhost:11434/v1", - description="OpenAI-compatible endpoint (Ollama, vLLM, etc.)", + description="OpenAI-compatible endpoint (Ollama, vLLM, etc.).", + ) + api_key: str | list[str] | None = Field( + default=None, + description="Usually not required for self-hosted endpoints. Leave blank unless your server enforces one.", ) - api_key: str | list[str] | None = Field(default=None) OPENAI_REALTIME_MODELS = ["gpt-realtime-2"] @@ -313,6 +341,7 @@ class OpenAIRealtimeLLMConfiguration(BaseLLMConfiguration): ) model: str = Field( default="gpt-realtime-2", + description="OpenAI realtime (speech-to-speech) model.", json_schema_extra={ "examples": OPENAI_REALTIME_MODELS, "allow_custom_input": True, @@ -320,6 +349,7 @@ class OpenAIRealtimeLLMConfiguration(BaseLLMConfiguration): ) voice: str = Field( default="alloy", + description="Voice the model speaks in.", json_schema_extra={ "examples": OPENAI_REALTIME_VOICES, "allow_custom_input": True, @@ -365,6 +395,7 @@ class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration): ) model: str = Field( default="gemini-3.1-flash-live-preview", + description="Gemini Live model on Google AI Studio (not Vertex).", json_schema_extra={ "examples": GOOGLE_REALTIME_MODELS, "allow_custom_input": True, @@ -372,6 +403,7 @@ class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration): ) voice: str = Field( default="Puck", + description="Voice the model speaks in.", json_schema_extra={ "examples": GOOGLE_REALTIME_VOICES, "allow_custom_input": True, @@ -379,6 +411,7 @@ class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration): ) language: str = Field( default="en", + description="ISO 639-1 language code.", json_schema_extra={ "examples": GOOGLE_REALTIME_LANGUAGES, "allow_custom_input": True, @@ -400,6 +433,7 @@ class GoogleVertexRealtimeLLMConfiguration(BaseLLMConfiguration): ) model: str = Field( default="google/gemini-live-2.5-flash-native-audio", + description="Vertex AI publisher/model identifier.", json_schema_extra={ "examples": GOOGLE_VERTEX_REALTIME_MODELS, "allow_custom_input": True, @@ -407,13 +441,15 @@ class GoogleVertexRealtimeLLMConfiguration(BaseLLMConfiguration): ) voice: str = Field( default="Charon", + description="Voice the model speaks in.", json_schema_extra={ "examples": GOOGLE_VERTEX_REALTIME_VOICES, "allow_custom_input": True, }, ) language: str = Field( - default="en-US", + default="en", + description="BCP-47 language code (e.g. 'en-US').", json_schema_extra={ "examples": GOOGLE_VERTEX_REALTIME_LANGUAGES, "allow_custom_input": True, @@ -427,11 +463,18 @@ class GoogleVertexRealtimeLLMConfiguration(BaseLLMConfiguration): credentials: str | None = Field( default=None, description=( - "Service account JSON credentials string. If omitted, falls back to " - "Application Default Credentials (ADC)." + "Paste the entire service-account JSON file contents. If omitted, " + "falls back to Application Default Credentials (ADC)." + ), + json_schema_extra={"multiline": True}, + ) + api_key: str | list[str] | None = Field( + default=None, + description=( + "Not used for Vertex AI — authentication is via the service account " + "in `credentials` (or ADC). Leave blank." ), ) - api_key: str | list[str] | None = Field(default=None) REALTIME_PROVIDERS = { @@ -470,7 +513,10 @@ RealtimeConfig = Annotated[ @register_tts class DeepgramTTSConfiguration(BaseServiceConfiguration): provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM - voice: str = "aura-2-helena-en" + voice: str = Field( + default="aura-2-helena-en", + description="Deepgram voice ID (model is inferred from the 'aura-N' prefix).", + ) @computed_field @property @@ -492,10 +538,14 @@ ELEVENLABS_TTS_MODELS = ["eleven_flash_v2_5"] @register_tts class ElevenlabsTTSConfiguration(BaseServiceConfiguration): provider: Literal[ServiceProviders.ELEVENLABS] = ServiceProviders.ELEVENLABS - voice: str = "21m00Tcm4TlvDq8ikWAM" # Rachel voice ID - speed: float = Field(default=1.0, ge=0.1, le=2.0, description="Speed of the voice") + voice: str = Field( + default="21m00Tcm4TlvDq8ikWAM", + description="ElevenLabs voice ID from your Voice Library.", + ) + speed: float = Field(default=1.0, ge=0.1, le=2.0, description="Speed of the voice.") model: str = Field( default="eleven_flash_v2_5", + description="ElevenLabs TTS model.", json_schema_extra={"examples": ELEVENLABS_TTS_MODELS}, ) base_url: str = Field( @@ -515,9 +565,14 @@ OPENAI_TTS_MODELS = ["gpt-4o-mini-tts"] class OpenAITTSService(BaseTTSConfiguration): provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI model: str = Field( - default="gpt-4o-mini-tts", json_schema_extra={"examples": OPENAI_TTS_MODELS} + default="gpt-4o-mini-tts", + description="OpenAI TTS model.", + json_schema_extra={"examples": OPENAI_TTS_MODELS}, + ) + voice: str = Field( + default="alloy", + description="OpenAI TTS voice name.", ) - voice: str = "alloy" DOGRAH_TTS_MODELS = ["default"] @@ -527,10 +582,15 @@ DOGRAH_TTS_MODELS = ["default"] class DograhTTSService(BaseTTSConfiguration): provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH model: str = Field( - default="default", json_schema_extra={"examples": DOGRAH_TTS_MODELS} + default="default", + description="Dograh TTS tier.", + json_schema_extra={"examples": DOGRAH_TTS_MODELS}, ) - voice: str = "default" - speed: float = Field(default=1.0, ge=0.5, le=2.0, description="Speed of the voice") + voice: str = Field( + default="default", + description="Voice preset.", + ) + speed: float = Field(default=1.0, ge=0.5, le=2.0, description="Speed of the voice.") CARTESIA_TTS_MODELS = ["sonic-3"] @@ -540,15 +600,20 @@ CARTESIA_TTS_MODELS = ["sonic-3"] class CartesiaTTSConfiguration(BaseTTSConfiguration): provider: Literal[ServiceProviders.CARTESIA] = ServiceProviders.CARTESIA model: str = Field( - default="sonic-3", json_schema_extra={"examples": CARTESIA_TTS_MODELS} + default="sonic-3", + description="Cartesia TTS model.", + json_schema_extra={"examples": CARTESIA_TTS_MODELS}, ) - voice: str = Field(default="3faa81ae-d3d8-4ab1-9e44-e50e46d33c30") - speed: float = Field(default=1.0, ge=0.6, le=1.5, description="Speed of the voice") + voice: str = Field( + default="3faa81ae-d3d8-4ab1-9e44-e50e46d33c30", + description="Cartesia voice UUID from your Cartesia dashboard.", + ) + speed: float = Field(default=1.0, ge=0.6, le=1.5, description="Speed of the voice.") volume: float = Field( default=1.0, ge=0.5, le=2.0, - description="Volume multiplier for generated speech", + description="Volume multiplier for generated speech.", ) @@ -623,10 +688,13 @@ SARVAM_LANGUAGES = [ class SarvamTTSConfiguration(BaseTTSConfiguration): provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM model: str = Field( - default="bulbul:v2", json_schema_extra={"examples": SARVAM_TTS_MODELS} + default="bulbul:v2", + description="Sarvam TTS model (voice list depends on this).", + json_schema_extra={"examples": SARVAM_TTS_MODELS}, ) voice: str = Field( default="anushka", + description="Sarvam voice name; must match the selected model's voice list.", json_schema_extra={ "examples": SARVAM_V2_VOICES, "model_options": { @@ -636,7 +704,9 @@ class SarvamTTSConfiguration(BaseTTSConfiguration): }, ) language: str = Field( - default="hi-IN", json_schema_extra={"examples": SARVAM_LANGUAGES} + default="hi-IN", + description="BCP-47 Indian-language code (e.g. hi-IN, en-IN).", + json_schema_extra={"examples": SARVAM_LANGUAGES}, ) @@ -647,10 +717,12 @@ CAMB_TTS_MODELS = ["mars-flash", "mars-pro", "mars-instruct"] class CambTTSConfiguration(BaseTTSConfiguration): provider: Literal[ServiceProviders.CAMB] = ServiceProviders.CAMB model: str = Field( - default="mars-flash", json_schema_extra={"examples": CAMB_TTS_MODELS} + default="mars-flash", + description="Camb.ai TTS model.", + json_schema_extra={"examples": CAMB_TTS_MODELS}, ) - voice: str = Field(default="147320", description="Camb.ai voice ID") - language: str = Field(default="en-us", description="BCP-47 language code") + voice: str = Field(default="147320", description="Camb.ai voice ID.") + language: str = Field(default="en-us", description="BCP-47 language code.") RIME_TTS_MODELS = ["arcana", "mistv3", "mistv2", "mist"] @@ -662,17 +734,19 @@ class RimeTTSConfiguration(BaseTTSConfiguration): provider: Literal[ServiceProviders.RIME] = ServiceProviders.RIME model: str = Field( default="arcana", + description="Rime TTS model.", json_schema_extra={"examples": RIME_TTS_MODELS, "allow_custom_input": True}, ) voice: str = Field( default="celeste", - description="Rime voice ID", + description="Rime voice ID.", ) speed: float = Field( - default=1.0, ge=0.5, le=2.0, description="Speech speed multiplier" + default=1.0, ge=0.5, le=2.0, description="Speech speed multiplier." ) language: str = Field( default="en", + description="ISO 639-1 language code.", json_schema_extra={"examples": RIME_TTS_LANGUAGES, "allow_custom_input": True}, ) @@ -685,6 +759,7 @@ class SpeachesTTSConfiguration(BaseTTSConfiguration): provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES model: str = Field( default="kokoro", + description="Model name as served by your TTS endpoint (e.g. Kokoro-FastAPI).", json_schema_extra={ "examples": SPEACHES_TTS_MODELS, "allow_custom_input": True, @@ -693,16 +768,19 @@ class SpeachesTTSConfiguration(BaseTTSConfiguration): voice: str = Field( default="af_heart", json_schema_extra={"allow_custom_input": True}, - description="Voice ID for the TTS engine", + description="Voice ID for the TTS engine.", ) base_url: str = Field( default="http://localhost:8000/v1", - description="OpenAI-compatible TTS endpoint (Kokoro-FastAPI, etc.)", + description="OpenAI-compatible TTS endpoint (Kokoro-FastAPI, etc.).", ) speed: float = Field( - default=1.0, ge=0.25, le=4.0, description="Speech speed (0.25 to 4.0)" + default=1.0, ge=0.25, le=4.0, description="Speech speed (0.25 to 4.0)." + ) + api_key: str | list[str] | None = Field( + default=None, + description="Usually not required for self-hosted TTS. Leave blank unless enforced.", ) - api_key: str | list[str] | None = Field(default=None) TTSConfig = Annotated[ @@ -813,10 +891,13 @@ DEEPGRAM_LANGUAGES = [ class DeepgramSTTConfiguration(BaseSTTConfiguration): provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM model: str = Field( - default="nova-3-general", json_schema_extra={"examples": DEEPGRAM_STT_MODELS} + default="nova-3-general", + description="Deepgram STT model.", + json_schema_extra={"examples": DEEPGRAM_STT_MODELS}, ) language: str = Field( default="multi", + description="Language code; 'multi' enables auto-detect (Nova-3 only).", json_schema_extra={ "examples": DEEPGRAM_LANGUAGES, "model_options": { @@ -834,7 +915,9 @@ CARTESIA_STT_MODELS = ["ink-whisper"] class CartesiaSTTConfiguration(BaseSTTConfiguration): provider: Literal[ServiceProviders.CARTESIA] = ServiceProviders.CARTESIA model: str = Field( - default="ink-whisper", json_schema_extra={"examples": CARTESIA_STT_MODELS} + default="ink-whisper", + description="Cartesia STT model.", + json_schema_extra={"examples": CARTESIA_STT_MODELS}, ) @@ -845,7 +928,9 @@ OPENAI_STT_MODELS = ["gpt-4o-transcribe"] class OpenAISTTConfiguration(BaseSTTConfiguration): provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI model: str = Field( - default="gpt-4o-transcribe", json_schema_extra={"examples": OPENAI_STT_MODELS} + default="gpt-4o-transcribe", + description="OpenAI transcription model.", + json_schema_extra={"examples": OPENAI_STT_MODELS}, ) @@ -858,10 +943,14 @@ DOGRAH_STT_LANGUAGES = DEEPGRAM_LANGUAGES class DograhSTTService(BaseSTTConfiguration): provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH model: str = Field( - default="default", json_schema_extra={"examples": DOGRAH_STT_MODELS} + default="default", + description="Dograh STT tier.", + json_schema_extra={"examples": DOGRAH_STT_MODELS}, ) language: str = Field( - default="multi", json_schema_extra={"examples": DOGRAH_STT_LANGUAGES} + default="multi", + description="Language code; use 'multi' for auto-detect.", + json_schema_extra={"examples": DOGRAH_STT_LANGUAGES}, ) @@ -873,10 +962,14 @@ SARVAM_STT_MODELS = ["saarika:v2.5", "saaras:v2"] class SarvamSTTConfiguration(BaseSTTConfiguration): provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM model: str = Field( - default="saarika:v2.5", json_schema_extra={"examples": SARVAM_STT_MODELS} + default="saarika:v2.5", + description="Sarvam STT model.", + json_schema_extra={"examples": SARVAM_STT_MODELS}, ) language: str = Field( - default="hi-IN", json_schema_extra={"examples": SARVAM_LANGUAGES} + default="hi-IN", + description="BCP-47 Indian-language code.", + json_schema_extra={"examples": SARVAM_LANGUAGES}, ) @@ -912,10 +1005,13 @@ SPEECHMATICS_STT_LANGUAGES = [ class SpeechmaticsSTTConfiguration(BaseSTTConfiguration): provider: Literal[ServiceProviders.SPEECHMATICS] = ServiceProviders.SPEECHMATICS model: str = Field( - default="enhanced", description="Operating point: standard or enhanced" + default="enhanced", + description="Speechmatics operating point: 'standard' or 'enhanced'.", ) language: str = Field( - default="en", json_schema_extra={"examples": SPEECHMATICS_STT_LANGUAGES} + default="en", + description="ISO 639-1 language code.", + json_schema_extra={"examples": SPEECHMATICS_STT_LANGUAGES}, ) @@ -931,6 +1027,7 @@ class SpeachesSTTConfiguration(BaseSTTConfiguration): provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES model: str = Field( default="Systran/faster-distil-whisper-small.en", + description="Whisper model identifier as served by your STT endpoint.", json_schema_extra={ "examples": SPEACHES_STT_MODELS, "allow_custom_input": True, @@ -938,6 +1035,7 @@ class SpeachesSTTConfiguration(BaseSTTConfiguration): ) language: str = Field( default="en", + description="ISO 639-1 language code.", json_schema_extra={ "examples": SPEACHES_STT_LANGUAGES, "allow_custom_input": True, @@ -945,9 +1043,12 @@ class SpeachesSTTConfiguration(BaseSTTConfiguration): ) base_url: str = Field( default="http://localhost:8000/v1", - description="OpenAI-compatible STT endpoint (Speaches, etc.)", + description="OpenAI-compatible STT endpoint (Speaches, etc.).", + ) + api_key: str | list[str] | None = Field( + default=None, + description="Usually not required for self-hosted STT. Leave blank unless enforced.", ) - api_key: str | list[str] | None = Field(default=None) ASSEMBLYAI_STT_MODELS = ["u3-rt-pro"] @@ -959,10 +1060,12 @@ class AssemblyAISTTConfiguration(BaseSTTConfiguration): provider: Literal[ServiceProviders.ASSEMBLYAI] = ServiceProviders.ASSEMBLYAI model: str = Field( default="u3-rt-pro", + description="AssemblyAI realtime STT model.", json_schema_extra={"examples": ASSEMBLYAI_STT_MODELS}, ) language: str = Field( default="en", + description="ISO 639-1 language code.", json_schema_extra={"examples": ASSEMBLYAI_STT_LANGUAGES}, ) @@ -1077,10 +1180,12 @@ class GladiaSTTConfiguration(BaseSTTConfiguration): provider: Literal[ServiceProviders.GLADIA] = ServiceProviders.GLADIA model: str = Field( default="solaria-1", + description="Gladia STT model.", json_schema_extra={"examples": GLADIA_STT_MODELS}, ) language: str = Field( default="en", + description="ISO 639-1 language code.", json_schema_extra={"examples": GLADIA_STT_LANGUAGES}, ) @@ -1110,6 +1215,7 @@ class OpenAIEmbeddingsConfiguration(BaseEmbeddingsConfiguration): provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI model: str = Field( default="text-embedding-3-small", + description="OpenAI embedding model.", json_schema_extra={"examples": OPENAI_EMBEDDING_MODELS}, ) @@ -1122,10 +1228,14 @@ class OpenRouterEmbeddingsConfiguration(BaseEmbeddingsConfiguration): provider: Literal[ServiceProviders.OPENROUTER] = ServiceProviders.OPENROUTER model: str = Field( default="openai/text-embedding-3-small", + description="OpenRouter-hosted embedding model slug.", json_schema_extra={"examples": OPENROUTER_EMBEDDING_MODELS}, ) - base_url: str = Field(default="https://openrouter.ai/api/v1") + base_url: str = Field( + default="https://openrouter.ai/api/v1", + description="Override only if proxying OpenRouter through your own gateway.", + ) EmbeddingsConfig = Annotated[ diff --git a/docs/configurations/inference-providers.mdx b/docs/configurations/inference-providers.mdx index 3fb62ba..d31dbc8 100644 --- a/docs/configurations/inference-providers.mdx +++ b/docs/configurations/inference-providers.mdx @@ -120,4 +120,68 @@ To use Gemini 3.1 Live with Dograh, you need a Google Gemini API key. Follow the When using a Realtime provider like Gemini Live, you do not need to configure separate TTS and STT services — the realtime model handles speech in and out. However, you **must** still configure an **LLM** under the LLM tab: it powers variable extraction and QA analysis, which the realtime service does not perform. + + +## Gemini Live on Vertex AI + +If you want to run Gemini Live through your own Google Cloud project — for billing consolidation, VPC controls, regional residency, or enterprise IAM — Dograh also supports Gemini Live via **Vertex AI** as a separate provider (`google_vertex_realtime`). The default model is `google/gemini-live-2.5-flash-native-audio`. + +Unlike Google AI Studio (which uses a single Gemini API key), Vertex AI authenticates with a **service account** belonging to your Google Cloud project. + +### Prerequisites + +1. A Google Cloud project with billing enabled. +2. The Vertex AI API enabled on that project: + + ```bash + gcloud services enable aiplatform.googleapis.com --project=YOUR_PROJECT_ID + ``` + +3. A service account with the **Vertex AI User** role (`roles/aiplatform.user`) on the project: + + ```bash + gcloud projects add-iam-policy-binding YOUR_PROJECT_ID \ + --member="serviceAccount:YOUR_SA@YOUR_PROJECT_ID.iam.gserviceaccount.com" \ + --role="roles/aiplatform.user" + ``` + +4. A **JSON** key for that service account (P12 keys are not supported). + +### Creating the service account key + +1. In the GCP Console, go to **IAM & Admin → Service Accounts**. +2. Pick an existing service account (or create a new one). +3. Open the **Keys** tab → **Add Key → Create new key**. +4. Choose **JSON** as the key type and click **Create**. +5. The key file will download to your computer — store it securely and treat it as a secret. + + + Always pick **JSON**, not P12. The Vertex AI client libraries used by Dograh only accept service-account JSON keys; P12 is a legacy format retained for older Google Workspace integrations. + + +### Configuring Vertex AI Realtime in Dograh + +1. Go to **Model Configurations** in your Dograh dashboard. +2. Enable the **Realtime** toggle. +3. Under the **Realtime** section, select `google_vertex_realtime` as the provider. +4. Fill in the fields: + + | Field | What to put in | + |---|---| + | **Model** | Vertex publisher/model id, e.g. `google/gemini-live-2.5-flash-native-audio` | + | **Voice** | One of the built-in voices (Puck, Charon, Kore, Fenrir, Aoede) | + | **Language** | BCP-47 code (e.g. `en-US`) | + | **Project Id** | The `project_id` value from your service-account JSON | + | **Location** | GCP region where the model is available (e.g. `us-east4`) | + | **Credentials** | Paste the **entire contents** of the service-account JSON file | + | **API Key** | Leave blank — Vertex AI does not use API keys | + +5. Save the configuration. + + + Paste the whole JSON file into the **Credentials** field — including `private_key`, `client_email`, and all other entries. Don't try to extract individual fields. If `Credentials` is left blank, Dograh falls back to **Application Default Credentials (ADC)** from the host environment, which is useful when running Dograh on a GCP VM or GKE pod with an attached service account. + + + + IAM changes can take up to ~60 seconds to propagate. If you see `Permission 'aiplatform.endpoints.predict' denied`, wait a minute and retry — or double-check that the role was granted to the same service account whose JSON you pasted. \ No newline at end of file diff --git a/pipecat b/pipecat index f780c6d..8590e53 160000 --- a/pipecat +++ b/pipecat @@ -1 +1 @@ -Subproject commit f780c6de083d607adc7779109cad37f8b5a7030d +Subproject commit 8590e5333d63eb69b78a193f9eeb2ff0584f9e9a diff --git a/ui/src/components/ServiceConfigurationForm.tsx b/ui/src/components/ServiceConfigurationForm.tsx index ddcf4c4..2425e06 100644 --- a/ui/src/components/ServiceConfigurationForm.tsx +++ b/ui/src/components/ServiceConfigurationForm.tsx @@ -13,6 +13,7 @@ import { Label } from "@/components/ui/label"; import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select"; import { Switch } from "@/components/ui/switch"; import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs"; +import { Textarea } from "@/components/ui/textarea"; import { VoiceSelector } from "@/components/VoiceSelector"; import { LANGUAGE_DISPLAY_NAMES } from "@/constants/languages"; import { useUserConfig } from "@/context/UserConfigContext"; @@ -30,6 +31,7 @@ interface SchemaProperty { $ref?: string; description?: string; format?: string; + multiline?: boolean; } interface ProviderSchema { @@ -501,18 +503,26 @@ export function ServiceConfigurationForm({ {currentProvider && providerSchema && configFields.length > 1 && (
- {configFields.slice(1).map((field) => ( -
- - {renderField(service, field, providerSchema)} -
- ))} + {configFields.slice(1).map((field) => { + const fieldSchema = providerSchema.properties[field]; + const actualFieldSchema = fieldSchema?.$ref && providerSchema.$defs + ? providerSchema.$defs[fieldSchema.$ref.split('/').pop() || ''] + : fieldSchema; + const fullWidth = actualFieldSchema?.multiline; + return ( +
+ + {renderField(service, field, providerSchema)} +
+ ); + })}
)} {currentProvider && providerSchema && providerSchema.properties.api_key && (
+ {renderFieldDescription("api_key", providerSchema)} {apiKeys[service].map((key, index) => (
{ + const schema = providerSchema.properties[field]; + if (!schema) return null; + const actualSchema = schema.$ref && providerSchema.$defs + ? providerSchema.$defs[schema.$ref.split('/').pop() || ''] + : schema; + if (!actualSchema?.description) return null; + return ( +

{actualSchema.description}

+ ); + }; + const renderField = (service: ServiceSegment, field: string, providerSchema: ProviderSchema) => { + return ( + <> + {renderFieldInput(service, field, providerSchema)} + {renderFieldDescription(field, providerSchema)} + + ); + }; + + const renderFieldInput = (service: ServiceSegment, field: string, providerSchema: ProviderSchema) => { const schema = providerSchema.properties[field]; const actualSchema = schema.$ref && providerSchema.$defs ? providerSchema.$defs[schema.$ref.split('/').pop() || ''] @@ -699,6 +730,19 @@ export function ServiceConfigurationForm({ ); } + if (actualSchema?.multiline) { + return ( +