fix: sampling rate fix for openai realtime

2026-07-25 12:01:04 +02:00 · 2026-05-16 17:44:49 +05:30 · 2026-05-16 17:44:49 +05:30 · 0b005dad58
commit 0b005dad58
parent d37d6d05c1
5 changed files with 296 additions and 54 deletions
--- a/api/services/configuration/check_validity.py
+++ b/api/services/configuration/check_validity.py
@ -49,6 +49,7 @@ class UserConfigurationValidator:
            ServiceProviders.SPEACHES.value: self._check_speaches_api_key,
            ServiceProviders.OPENAI_REALTIME.value: self._check_openai_api_key,
            ServiceProviders.GOOGLE_REALTIME.value: self._check_google_api_key,
+            ServiceProviders.GOOGLE_VERTEX_REALTIME.value: self._check_google_vertex_realtime_api_key,
            ServiceProviders.ASSEMBLYAI.value: self._check_assemblyai_api_key,
            ServiceProviders.GLADIA.value: self._check_gladia_api_key,
            ServiceProviders.RIME.value: self._check_rime_api_key,
@ -116,6 +117,22 @@ class UserConfigurationValidator:
                return [{"model": service_name, "message": str(e)}]
            return []

+        # Vertex Realtime uses service-account credentials (or ADC) instead of api_key
+        if provider == ServiceProviders.GOOGLE_VERTEX_REALTIME.value:
+            try:
+                if not self._check_google_vertex_realtime_api_key(
+                    provider, service_config
+                ):
+                    return [
+                        {
+                            "model": service_name,
+                            "message": f"Invalid {provider} configuration",
+                        }
+                    ]
+            except ValueError as e:
+                return [{"model": service_name, "message": str(e)}]
+            return []
+
        # AWS Bedrock uses AWS credentials instead of api_key
        if provider == ServiceProviders.AWS_BEDROCK.value:
            try:
@ -216,6 +233,13 @@ class UserConfigurationValidator:
            raise ValueError("base_url is required for Speaches services")
        return True

+    def _check_google_vertex_realtime_api_key(self, model: str, service_config) -> bool:
+        if not getattr(service_config, "project_id", None):
+            raise ValueError("project_id is required for Google Vertex Realtime")
+        if not getattr(service_config, "location", None):
+            raise ValueError("location is required for Google Vertex Realtime")
+        return True
+
    def _check_aws_bedrock_api_key(self, model: str, service_config) -> bool:
        if not service_config.aws_access_key or not service_config.aws_secret_key:
            raise ValueError("AWS access key and secret key are required for Bedrock")
--- a/api/services/configuration/registry.py
+++ b/api/services/configuration/registry.py
@ -207,6 +207,7 @@ class OpenAILLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
    model: str = Field(
        default="gpt-4.1",
+        description="OpenAI chat model to use.",
        json_schema_extra={"examples": OPENAI_MODELS, "allow_custom_input": True},
    )

@ -216,6 +217,7 @@ class GoogleLLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE
    model: str = Field(
        default="gemini-2.0-flash",
+        description="Gemini model on Google AI Studio (not Vertex).",
        json_schema_extra={"examples": GOOGLE_MODELS, "allow_custom_input": True},
    )

@ -225,6 +227,7 @@ class GroqLLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.GROQ] = ServiceProviders.GROQ
    model: str = Field(
        default="llama-3.3-70b-versatile",
+        description="Groq-hosted model identifier.",
        json_schema_extra={"examples": GROQ_MODELS, "allow_custom_input": True},
    )

@ -234,10 +237,14 @@ class OpenRouterLLMConfiguration(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.OPENROUTER] = ServiceProviders.OPENROUTER
    model: str = Field(
        default="openai/gpt-4.1",
+        description="OpenRouter model slug in 'vendor/model' form.",
        json_schema_extra={"examples": OPENROUTER_MODELS, "allow_custom_input": True},
    )

-    base_url: str = Field(default="https://openrouter.ai/api/v1")
+    base_url: str = Field(
+        default="https://openrouter.ai/api/v1",
+        description="Override only if proxying OpenRouter through your own gateway.",
+    )


@register_llm
@ -245,10 +252,13 @@ class AzureLLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.AZURE] = ServiceProviders.AZURE
    model: str = Field(
        default="gpt-4.1-mini",
+        description="Azure deployment name (not the upstream OpenAI model id).",
        json_schema_extra={"examples": AZURE_MODELS, "allow_custom_input": True},
    )

-    endpoint: str
+    endpoint: str = Field(
+        description="Azure OpenAI resource endpoint (e.g. https://<resource>.openai.azure.com).",
+    )


@register_llm
@ -256,6 +266,7 @@ class DograhLLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
    model: str = Field(
        default="default",
+        description="Dograh-hosted model tier.",
        json_schema_extra={"examples": DOGRAH_LLM_MODELS, "allow_custom_input": True},
    )

@ -265,12 +276,25 @@ class AWSBedrockLLMConfiguration(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.AWS_BEDROCK] = ServiceProviders.AWS_BEDROCK
    model: str = Field(
        default="us.amazon.nova-pro-v1:0",
+        description="Bedrock model ID — include the region inference-profile prefix (e.g. 'us.').",
        json_schema_extra={"examples": AWS_BEDROCK_MODELS, "allow_custom_input": True},
    )
-    aws_access_key: str = Field(default="")
-    aws_secret_key: str = Field(default="")
-    aws_region: str = Field(default="us-east-1")
-    api_key: str | list[str] | None = Field(default=None)
+    aws_access_key: str = Field(
+        default="",
+        description="AWS access key ID with bedrock:InvokeModel permission.",
+    )
+    aws_secret_key: str = Field(
+        default="",
+        description="AWS secret access key paired with the access key ID.",
+    )
+    aws_region: str = Field(
+        default="us-east-1",
+        description="AWS region where the Bedrock model is available.",
+    )
+    api_key: str | list[str] | None = Field(
+        default=None,
+        description="Not used for Bedrock — authentication is via the AWS credentials above. Leave blank.",
+    )


 SPEACHES_LLM_MODELS = ["llama3", "mistral", "phi3", "qwen2", "gemma2", "deepseek-r1"]
@ -281,6 +305,7 @@ class SpeachesLLMConfiguration(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
    model: str = Field(
        default="llama3",
+        description="Model name as exposed by your OpenAI-compatible server.",
        json_schema_extra={
            "examples": SPEACHES_LLM_MODELS,
            "allow_custom_input": True,
@ -288,9 +313,12 @@ class SpeachesLLMConfiguration(BaseLLMConfiguration):
    )
    base_url: str = Field(
        default="http://localhost:11434/v1",
-        description="OpenAI-compatible endpoint (Ollama, vLLM, etc.)",
+        description="OpenAI-compatible endpoint (Ollama, vLLM, etc.).",
+    )
+    api_key: str | list[str] | None = Field(
+        default=None,
+        description="Usually not required for self-hosted endpoints. Leave blank unless your server enforces one.",
    )
-    api_key: str | list[str] | None = Field(default=None)


 OPENAI_REALTIME_MODELS = ["gpt-realtime-2"]
@ -313,6 +341,7 @@ class OpenAIRealtimeLLMConfiguration(BaseLLMConfiguration):
    )
    model: str = Field(
        default="gpt-realtime-2",
+        description="OpenAI realtime (speech-to-speech) model.",
        json_schema_extra={
            "examples": OPENAI_REALTIME_MODELS,
            "allow_custom_input": True,
@ -320,6 +349,7 @@ class OpenAIRealtimeLLMConfiguration(BaseLLMConfiguration):
    )
    voice: str = Field(
        default="alloy",
+        description="Voice the model speaks in.",
        json_schema_extra={
            "examples": OPENAI_REALTIME_VOICES,
            "allow_custom_input": True,
@ -365,6 +395,7 @@ class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration):
    )
    model: str = Field(
        default="gemini-3.1-flash-live-preview",
+        description="Gemini Live model on Google AI Studio (not Vertex).",
        json_schema_extra={
            "examples": GOOGLE_REALTIME_MODELS,
            "allow_custom_input": True,
@ -372,6 +403,7 @@ class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration):
    )
    voice: str = Field(
        default="Puck",
+        description="Voice the model speaks in.",
        json_schema_extra={
            "examples": GOOGLE_REALTIME_VOICES,
            "allow_custom_input": True,
@ -379,6 +411,7 @@ class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration):
    )
    language: str = Field(
        default="en",
+        description="ISO 639-1 language code.",
        json_schema_extra={
            "examples": GOOGLE_REALTIME_LANGUAGES,
            "allow_custom_input": True,
@ -400,6 +433,7 @@ class GoogleVertexRealtimeLLMConfiguration(BaseLLMConfiguration):
    )
    model: str = Field(
        default="google/gemini-live-2.5-flash-native-audio",
+        description="Vertex AI publisher/model identifier.",
        json_schema_extra={
            "examples": GOOGLE_VERTEX_REALTIME_MODELS,
            "allow_custom_input": True,
@ -407,13 +441,15 @@ class GoogleVertexRealtimeLLMConfiguration(BaseLLMConfiguration):
    )
    voice: str = Field(
        default="Charon",
+        description="Voice the model speaks in.",
        json_schema_extra={
            "examples": GOOGLE_VERTEX_REALTIME_VOICES,
            "allow_custom_input": True,
        },
    )
    language: str = Field(
-        default="en-US",
+        default="en",
+        description="BCP-47 language code (e.g. 'en-US').",
        json_schema_extra={
            "examples": GOOGLE_VERTEX_REALTIME_LANGUAGES,
            "allow_custom_input": True,
@ -427,11 +463,18 @@ class GoogleVertexRealtimeLLMConfiguration(BaseLLMConfiguration):
    credentials: str | None = Field(
        default=None,
        description=(
-            "Service account JSON credentials string. If omitted, falls back to "
-            "Application Default Credentials (ADC)."
+            "Paste the entire service-account JSON file contents. If omitted, "
+            "falls back to Application Default Credentials (ADC)."
+        ),
+        json_schema_extra={"multiline": True},
+    )
+    api_key: str | list[str] | None = Field(
+        default=None,
+        description=(
+            "Not used for Vertex AI — authentication is via the service account "
+            "in `credentials` (or ADC). Leave blank."
        ),
    )
-    api_key: str | list[str] | None = Field(default=None)


 REALTIME_PROVIDERS = {
@ -470,7 +513,10 @@ RealtimeConfig = Annotated[
@register_tts
 class DeepgramTTSConfiguration(BaseServiceConfiguration):
    provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
-    voice: str = "aura-2-helena-en"
+    voice: str = Field(
+        default="aura-2-helena-en",
+        description="Deepgram voice ID (model is inferred from the 'aura-N' prefix).",
+    )

    @computed_field
    @property
@ -492,10 +538,14 @@ ELEVENLABS_TTS_MODELS = ["eleven_flash_v2_5"]
@register_tts
 class ElevenlabsTTSConfiguration(BaseServiceConfiguration):
    provider: Literal[ServiceProviders.ELEVENLABS] = ServiceProviders.ELEVENLABS
-    voice: str = "21m00Tcm4TlvDq8ikWAM"  # Rachel voice ID
-    speed: float = Field(default=1.0, ge=0.1, le=2.0, description="Speed of the voice")
+    voice: str = Field(
+        default="21m00Tcm4TlvDq8ikWAM",
+        description="ElevenLabs voice ID from your Voice Library.",
+    )
+    speed: float = Field(default=1.0, ge=0.1, le=2.0, description="Speed of the voice.")
    model: str = Field(
        default="eleven_flash_v2_5",
+        description="ElevenLabs TTS model.",
        json_schema_extra={"examples": ELEVENLABS_TTS_MODELS},
    )
    base_url: str = Field(
@ -515,9 +565,14 @@ OPENAI_TTS_MODELS = ["gpt-4o-mini-tts"]
 class OpenAITTSService(BaseTTSConfiguration):
    provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
    model: str = Field(
-        default="gpt-4o-mini-tts", json_schema_extra={"examples": OPENAI_TTS_MODELS}
+        default="gpt-4o-mini-tts",
+        description="OpenAI TTS model.",
+        json_schema_extra={"examples": OPENAI_TTS_MODELS},
+    )
+    voice: str = Field(
+        default="alloy",
+        description="OpenAI TTS voice name.",
    )
-    voice: str = "alloy"


 DOGRAH_TTS_MODELS = ["default"]
@ -527,10 +582,15 @@ DOGRAH_TTS_MODELS = ["default"]
 class DograhTTSService(BaseTTSConfiguration):
    provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
    model: str = Field(
-        default="default", json_schema_extra={"examples": DOGRAH_TTS_MODELS}
+        default="default",
+        description="Dograh TTS tier.",
+        json_schema_extra={"examples": DOGRAH_TTS_MODELS},
    )
-    voice: str = "default"
-    speed: float = Field(default=1.0, ge=0.5, le=2.0, description="Speed of the voice")
+    voice: str = Field(
+        default="default",
+        description="Voice preset.",
+    )
+    speed: float = Field(default=1.0, ge=0.5, le=2.0, description="Speed of the voice.")


 CARTESIA_TTS_MODELS = ["sonic-3"]
@ -540,15 +600,20 @@ CARTESIA_TTS_MODELS = ["sonic-3"]
 class CartesiaTTSConfiguration(BaseTTSConfiguration):
    provider: Literal[ServiceProviders.CARTESIA] = ServiceProviders.CARTESIA
    model: str = Field(
-        default="sonic-3", json_schema_extra={"examples": CARTESIA_TTS_MODELS}
+        default="sonic-3",
+        description="Cartesia TTS model.",
+        json_schema_extra={"examples": CARTESIA_TTS_MODELS},
    )
-    voice: str = Field(default="3faa81ae-d3d8-4ab1-9e44-e50e46d33c30")
-    speed: float = Field(default=1.0, ge=0.6, le=1.5, description="Speed of the voice")
+    voice: str = Field(
+        default="3faa81ae-d3d8-4ab1-9e44-e50e46d33c30",
+        description="Cartesia voice UUID from your Cartesia dashboard.",
+    )
+    speed: float = Field(default=1.0, ge=0.6, le=1.5, description="Speed of the voice.")
    volume: float = Field(
        default=1.0,
        ge=0.5,
        le=2.0,
-        description="Volume multiplier for generated speech",
+        description="Volume multiplier for generated speech.",
    )


@ -623,10 +688,13 @@ SARVAM_LANGUAGES = [
 class SarvamTTSConfiguration(BaseTTSConfiguration):
    provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
    model: str = Field(
-        default="bulbul:v2", json_schema_extra={"examples": SARVAM_TTS_MODELS}
+        default="bulbul:v2",
+        description="Sarvam TTS model (voice list depends on this).",
+        json_schema_extra={"examples": SARVAM_TTS_MODELS},
    )
    voice: str = Field(
        default="anushka",
+        description="Sarvam voice name; must match the selected model's voice list.",
        json_schema_extra={
            "examples": SARVAM_V2_VOICES,
            "model_options": {
@ -636,7 +704,9 @@ class SarvamTTSConfiguration(BaseTTSConfiguration):
        },
    )
    language: str = Field(
-        default="hi-IN", json_schema_extra={"examples": SARVAM_LANGUAGES}
+        default="hi-IN",
+        description="BCP-47 Indian-language code (e.g. hi-IN, en-IN).",
+        json_schema_extra={"examples": SARVAM_LANGUAGES},
    )


@ -647,10 +717,12 @@ CAMB_TTS_MODELS = ["mars-flash", "mars-pro", "mars-instruct"]
 class CambTTSConfiguration(BaseTTSConfiguration):
    provider: Literal[ServiceProviders.CAMB] = ServiceProviders.CAMB
    model: str = Field(
-        default="mars-flash", json_schema_extra={"examples": CAMB_TTS_MODELS}
+        default="mars-flash",
+        description="Camb.ai TTS model.",
+        json_schema_extra={"examples": CAMB_TTS_MODELS},
    )
-    voice: str = Field(default="147320", description="Camb.ai voice ID")
-    language: str = Field(default="en-us", description="BCP-47 language code")
+    voice: str = Field(default="147320", description="Camb.ai voice ID.")
+    language: str = Field(default="en-us", description="BCP-47 language code.")


 RIME_TTS_MODELS = ["arcana", "mistv3", "mistv2", "mist"]
@ -662,17 +734,19 @@ class RimeTTSConfiguration(BaseTTSConfiguration):
    provider: Literal[ServiceProviders.RIME] = ServiceProviders.RIME
    model: str = Field(
        default="arcana",
+        description="Rime TTS model.",
        json_schema_extra={"examples": RIME_TTS_MODELS, "allow_custom_input": True},
    )
    voice: str = Field(
        default="celeste",
-        description="Rime voice ID",
+        description="Rime voice ID.",
    )
    speed: float = Field(
-        default=1.0, ge=0.5, le=2.0, description="Speech speed multiplier"
+        default=1.0, ge=0.5, le=2.0, description="Speech speed multiplier."
    )
    language: str = Field(
        default="en",
+        description="ISO 639-1 language code.",
        json_schema_extra={"examples": RIME_TTS_LANGUAGES, "allow_custom_input": True},
    )

@ -685,6 +759,7 @@ class SpeachesTTSConfiguration(BaseTTSConfiguration):
    provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
    model: str = Field(
        default="kokoro",
+        description="Model name as served by your TTS endpoint (e.g. Kokoro-FastAPI).",
        json_schema_extra={
            "examples": SPEACHES_TTS_MODELS,
            "allow_custom_input": True,
@ -693,16 +768,19 @@ class SpeachesTTSConfiguration(BaseTTSConfiguration):
    voice: str = Field(
        default="af_heart",
        json_schema_extra={"allow_custom_input": True},
-        description="Voice ID for the TTS engine",
+        description="Voice ID for the TTS engine.",
    )
    base_url: str = Field(
        default="http://localhost:8000/v1",
-        description="OpenAI-compatible TTS endpoint (Kokoro-FastAPI, etc.)",
+        description="OpenAI-compatible TTS endpoint (Kokoro-FastAPI, etc.).",
    )
    speed: float = Field(
-        default=1.0, ge=0.25, le=4.0, description="Speech speed (0.25 to 4.0)"
+        default=1.0, ge=0.25, le=4.0, description="Speech speed (0.25 to 4.0)."
+    )
+    api_key: str | list[str] | None = Field(
+        default=None,
+        description="Usually not required for self-hosted TTS. Leave blank unless enforced.",
    )
-    api_key: str | list[str] | None = Field(default=None)


 TTSConfig = Annotated[
@ -813,10 +891,13 @@ DEEPGRAM_LANGUAGES = [
 class DeepgramSTTConfiguration(BaseSTTConfiguration):
    provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
    model: str = Field(
-        default="nova-3-general", json_schema_extra={"examples": DEEPGRAM_STT_MODELS}
+        default="nova-3-general",
+        description="Deepgram STT model.",
+        json_schema_extra={"examples": DEEPGRAM_STT_MODELS},
    )
    language: str = Field(
        default="multi",
+        description="Language code; 'multi' enables auto-detect (Nova-3 only).",
        json_schema_extra={
            "examples": DEEPGRAM_LANGUAGES,
            "model_options": {
@ -834,7 +915,9 @@ CARTESIA_STT_MODELS = ["ink-whisper"]
 class CartesiaSTTConfiguration(BaseSTTConfiguration):
    provider: Literal[ServiceProviders.CARTESIA] = ServiceProviders.CARTESIA
    model: str = Field(
-        default="ink-whisper", json_schema_extra={"examples": CARTESIA_STT_MODELS}
+        default="ink-whisper",
+        description="Cartesia STT model.",
+        json_schema_extra={"examples": CARTESIA_STT_MODELS},
    )


@ -845,7 +928,9 @@ OPENAI_STT_MODELS = ["gpt-4o-transcribe"]
 class OpenAISTTConfiguration(BaseSTTConfiguration):
    provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
    model: str = Field(
-        default="gpt-4o-transcribe", json_schema_extra={"examples": OPENAI_STT_MODELS}
+        default="gpt-4o-transcribe",
+        description="OpenAI transcription model.",
+        json_schema_extra={"examples": OPENAI_STT_MODELS},
    )


@ -858,10 +943,14 @@ DOGRAH_STT_LANGUAGES = DEEPGRAM_LANGUAGES
 class DograhSTTService(BaseSTTConfiguration):
    provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
    model: str = Field(
-        default="default", json_schema_extra={"examples": DOGRAH_STT_MODELS}
+        default="default",
+        description="Dograh STT tier.",
+        json_schema_extra={"examples": DOGRAH_STT_MODELS},
    )
    language: str = Field(
-        default="multi", json_schema_extra={"examples": DOGRAH_STT_LANGUAGES}
+        default="multi",
+        description="Language code; use 'multi' for auto-detect.",
+        json_schema_extra={"examples": DOGRAH_STT_LANGUAGES},
    )


@ -873,10 +962,14 @@ SARVAM_STT_MODELS = ["saarika:v2.5", "saaras:v2"]
 class SarvamSTTConfiguration(BaseSTTConfiguration):
    provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
    model: str = Field(
-        default="saarika:v2.5", json_schema_extra={"examples": SARVAM_STT_MODELS}
+        default="saarika:v2.5",
+        description="Sarvam STT model.",
+        json_schema_extra={"examples": SARVAM_STT_MODELS},
    )
    language: str = Field(
-        default="hi-IN", json_schema_extra={"examples": SARVAM_LANGUAGES}
+        default="hi-IN",
+        description="BCP-47 Indian-language code.",
+        json_schema_extra={"examples": SARVAM_LANGUAGES},
    )


@ -912,10 +1005,13 @@ SPEECHMATICS_STT_LANGUAGES = [
 class SpeechmaticsSTTConfiguration(BaseSTTConfiguration):
    provider: Literal[ServiceProviders.SPEECHMATICS] = ServiceProviders.SPEECHMATICS
    model: str = Field(
-        default="enhanced", description="Operating point: standard or enhanced"
+        default="enhanced",
+        description="Speechmatics operating point: 'standard' or 'enhanced'.",
    )
    language: str = Field(
-        default="en", json_schema_extra={"examples": SPEECHMATICS_STT_LANGUAGES}
+        default="en",
+        description="ISO 639-1 language code.",
+        json_schema_extra={"examples": SPEECHMATICS_STT_LANGUAGES},
    )


@ -931,6 +1027,7 @@ class SpeachesSTTConfiguration(BaseSTTConfiguration):
    provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
    model: str = Field(
        default="Systran/faster-distil-whisper-small.en",
+        description="Whisper model identifier as served by your STT endpoint.",
        json_schema_extra={
            "examples": SPEACHES_STT_MODELS,
            "allow_custom_input": True,
@ -938,6 +1035,7 @@ class SpeachesSTTConfiguration(BaseSTTConfiguration):
    )
    language: str = Field(
        default="en",
+        description="ISO 639-1 language code.",
        json_schema_extra={
            "examples": SPEACHES_STT_LANGUAGES,
            "allow_custom_input": True,
@ -945,9 +1043,12 @@ class SpeachesSTTConfiguration(BaseSTTConfiguration):
    )
    base_url: str = Field(
        default="http://localhost:8000/v1",
-        description="OpenAI-compatible STT endpoint (Speaches, etc.)",
+        description="OpenAI-compatible STT endpoint (Speaches, etc.).",
+    )
+    api_key: str | list[str] | None = Field(
+        default=None,
+        description="Usually not required for self-hosted STT. Leave blank unless enforced.",
    )
-    api_key: str | list[str] | None = Field(default=None)


 ASSEMBLYAI_STT_MODELS = ["u3-rt-pro"]
@ -959,10 +1060,12 @@ class AssemblyAISTTConfiguration(BaseSTTConfiguration):
    provider: Literal[ServiceProviders.ASSEMBLYAI] = ServiceProviders.ASSEMBLYAI
    model: str = Field(
        default="u3-rt-pro",
+        description="AssemblyAI realtime STT model.",
        json_schema_extra={"examples": ASSEMBLYAI_STT_MODELS},
    )
    language: str = Field(
        default="en",
+        description="ISO 639-1 language code.",
        json_schema_extra={"examples": ASSEMBLYAI_STT_LANGUAGES},
    )

@ -1077,10 +1180,12 @@ class GladiaSTTConfiguration(BaseSTTConfiguration):
    provider: Literal[ServiceProviders.GLADIA] = ServiceProviders.GLADIA
    model: str = Field(
        default="solaria-1",
+        description="Gladia STT model.",
        json_schema_extra={"examples": GLADIA_STT_MODELS},
    )
    language: str = Field(
        default="en",
+        description="ISO 639-1 language code.",
        json_schema_extra={"examples": GLADIA_STT_LANGUAGES},
    )

@ -1110,6 +1215,7 @@ class OpenAIEmbeddingsConfiguration(BaseEmbeddingsConfiguration):
    provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
    model: str = Field(
        default="text-embedding-3-small",
+        description="OpenAI embedding model.",
        json_schema_extra={"examples": OPENAI_EMBEDDING_MODELS},
    )

@ -1122,10 +1228,14 @@ class OpenRouterEmbeddingsConfiguration(BaseEmbeddingsConfiguration):
    provider: Literal[ServiceProviders.OPENROUTER] = ServiceProviders.OPENROUTER
    model: str = Field(
        default="openai/text-embedding-3-small",
+        description="OpenRouter-hosted embedding model slug.",
        json_schema_extra={"examples": OPENROUTER_EMBEDDING_MODELS},
    )

-    base_url: str = Field(default="https://openrouter.ai/api/v1")
+    base_url: str = Field(
+        default="https://openrouter.ai/api/v1",
+        description="Override only if proxying OpenRouter through your own gateway.",
+    )


 EmbeddingsConfig = Annotated[
--- a/docs/configurations/inference-providers.mdx
+++ b/docs/configurations/inference-providers.mdx
@ -120,4 +120,68 @@ To use Gemini 3.1 Live with Dograh, you need a Google Gemini API key. Follow the

 <Note>
  When using a Realtime provider like Gemini Live, you do not need to configure separate TTS and STT services — the realtime model handles speech in and out. However, you **must** still configure an **LLM** under the LLM tab: it powers variable extraction and QA analysis, which the realtime service does not perform.
+</Note>
+
+## Gemini Live on Vertex AI
+
+If you want to run Gemini Live through your own Google Cloud project — for billing consolidation, VPC controls, regional residency, or enterprise IAM — Dograh also supports Gemini Live via **Vertex AI** as a separate provider (`google_vertex_realtime`). The default model is `google/gemini-live-2.5-flash-native-audio`.
+
+Unlike Google AI Studio (which uses a single Gemini API key), Vertex AI authenticates with a **service account** belonging to your Google Cloud project.
+
+### Prerequisites
+
+1. A Google Cloud project with billing enabled.
+2. The Vertex AI API enabled on that project:
+
+   ```bash
+   gcloud services enable aiplatform.googleapis.com --project=YOUR_PROJECT_ID
+   ```
+
+3. A service account with the **Vertex AI User** role (`roles/aiplatform.user`) on the project:
+
+   ```bash
+   gcloud projects add-iam-policy-binding YOUR_PROJECT_ID \
+     --member="serviceAccount:YOUR_SA@YOUR_PROJECT_ID.iam.gserviceaccount.com" \
+     --role="roles/aiplatform.user"
+   ```
+
+4. A **JSON** key for that service account (P12 keys are not supported).
+
+### Creating the service account key
+
+1. In the GCP Console, go to **IAM & Admin → Service Accounts**.
+2. Pick an existing service account (or create a new one).
+3. Open the **Keys** tab → **Add Key → Create new key**.
+4. Choose **JSON** as the key type and click **Create**.
+5. The key file will download to your computer — store it securely and treat it as a secret.
+
+<Note>
+  Always pick **JSON**, not P12. The Vertex AI client libraries used by Dograh only accept service-account JSON keys; P12 is a legacy format retained for older Google Workspace integrations.
+</Note>
+
+### Configuring Vertex AI Realtime in Dograh
+
+1. Go to **Model Configurations** in your Dograh dashboard.
+2. Enable the **Realtime** toggle.
+3. Under the **Realtime** section, select `google_vertex_realtime` as the provider.
+4. Fill in the fields:
+
+   | Field | What to put in |
+   |---|---|
+   | **Model** | Vertex publisher/model id, e.g. `google/gemini-live-2.5-flash-native-audio` |
+   | **Voice** | One of the built-in voices (Puck, Charon, Kore, Fenrir, Aoede) |
+   | **Language** | BCP-47 code (e.g. `en-US`) |
+   | **Project Id** | The `project_id` value from your service-account JSON |
+   | **Location** | GCP region where the model is available (e.g. `us-east4`) |
+   | **Credentials** | Paste the **entire contents** of the service-account JSON file |
+   | **API Key** | Leave blank — this provider authenticates with the service account in **Credentials** (or ADC), not an API key |
+
+5. Save the configuration.
+
+<Note>
+  Paste the whole JSON file into the **Credentials** field — including `private_key`, `client_email`, and all other entries. Don't try to extract individual fields. If `Credentials` is left blank, Dograh falls back to **Application Default Credentials (ADC)** from the host environment, which is useful when running Dograh on a GCP VM or GKE pod with an attached service account.
+</Note>
+
+<Note>
+  IAM changes are eventually consistent: they usually propagate within a couple of minutes, but can occasionally take longer. If you see `Permission 'aiplatform.endpoints.predict' denied`, wait a few minutes and retry — or double-check that the role was granted to the same service account whose JSON you pasted.
 </Note>
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit f780c6de083d607adc7779109cad37f8b5a7030d
+Subproject commit 8590e5333d63eb69b78a193f9eeb2ff0584f9e9a
--- a/ui/src/components/ServiceConfigurationForm.tsx
+++ b/ui/src/components/ServiceConfigurationForm.tsx
@ -13,6 +13,7 @@ import { Label } from "@/components/ui/label";
 import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select";
 import { Switch } from "@/components/ui/switch";
 import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
+import { Textarea } from "@/components/ui/textarea";
 import { VoiceSelector } from "@/components/VoiceSelector";
 import { LANGUAGE_DISPLAY_NAMES } from "@/constants/languages";
 import { useUserConfig } from "@/context/UserConfigContext";
@ -30,6 +31,7 @@ interface SchemaProperty {
    $ref?: string;
    description?: string;
    format?: string;
+    multiline?: boolean;
 }

 interface ProviderSchema {
@ -501,18 +503,26 @@ export function ServiceConfigurationForm({

                {currentProvider && providerSchema && configFields.length > 1 && (
                    <div className="grid grid-cols-2 gap-4">
-                        {configFields.slice(1).map((field) => (
-                            <div key={field} className="space-y-2">
-                                <Label className="capitalize">{field.replace(/_/g, ' ')}</Label>
-                                {renderField(service, field, providerSchema)}
-                            </div>
-                        ))}
+                        {configFields.slice(1).map((field) => {
+                            const fieldSchema = providerSchema.properties[field];
+                            const actualFieldSchema = fieldSchema?.$ref && providerSchema.$defs
+                                ? providerSchema.$defs[fieldSchema.$ref.split('/').pop() || '']
+                                : fieldSchema;
+                            const fullWidth = actualFieldSchema?.multiline;
+                            return (
+                                <div key={field} className={`space-y-2 ${fullWidth ? "col-span-2" : ""}`}>
+                                    <Label className="capitalize">{field.replace(/_/g, ' ')}</Label>
+                                    {renderField(service, field, providerSchema)}
+                                </div>
+                            );
+                        })}
                    </div>
                )}

                {currentProvider && providerSchema && providerSchema.properties.api_key && (
                    <div className="space-y-2">
                        <Label>{mode === 'override' ? 'API Key (leave empty to use global)' : 'API Key(s)'}</Label>
+                        {renderFieldDescription("api_key", providerSchema)}
                        {apiKeys[service].map((key, index) => (
                            <div key={index} className="flex gap-2">
                                <Input
@ -564,7 +574,28 @@ export function ServiceConfigurationForm({
        );
    };

+    const renderFieldDescription = (field: string, providerSchema: ProviderSchema) => {
+        const schema = providerSchema.properties[field];
+        if (!schema) return null;
+        const actualSchema = schema.$ref && providerSchema.$defs
+            ? providerSchema.$defs[schema.$ref.split('/').pop() || '']
+            : schema;
+        if (!actualSchema?.description) return null;
+        return (
+            <p className="text-xs text-muted-foreground">{actualSchema.description}</p>
+        );
+    };
+
    const renderField = (service: ServiceSegment, field: string, providerSchema: ProviderSchema) => {
+        return (
+            <>
+                {renderFieldInput(service, field, providerSchema)}
+                {renderFieldDescription(field, providerSchema)}
+            </>
+        );
+    };
+
+    const renderFieldInput = (service: ServiceSegment, field: string, providerSchema: ProviderSchema) => {
        const schema = providerSchema.properties[field];
        const actualSchema = schema.$ref && providerSchema.$defs
            ? providerSchema.$defs[schema.$ref.split('/').pop() || '']
@ -699,6 +730,19 @@ export function ServiceConfigurationForm({
            );
        }

+        if (actualSchema?.multiline) {
+            return (
+                <Textarea
+                    rows={6}
+                    className="font-mono text-xs"
+                    placeholder={`Enter ${field}`}
+                    {...register(`${service}_${field}`, {
+                        required: service !== "embeddings" && providerSchema.required?.includes(field),
+                    })}
+                />
+            );
+        }
+
        return (
            <Input
                type={actualSchema?.type === "number" ? "number" : "text"}