mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-07 07:55:16 +02:00
fix: sampling rate fix for openai realtime
This commit is contained in:
parent
d37d6d05c1
commit
0b005dad58
5 changed files with 296 additions and 54 deletions
|
|
@ -49,6 +49,7 @@ class UserConfigurationValidator:
|
|||
ServiceProviders.SPEACHES.value: self._check_speaches_api_key,
|
||||
ServiceProviders.OPENAI_REALTIME.value: self._check_openai_api_key,
|
||||
ServiceProviders.GOOGLE_REALTIME.value: self._check_google_api_key,
|
||||
ServiceProviders.GOOGLE_VERTEX_REALTIME.value: self._check_google_vertex_realtime_api_key,
|
||||
ServiceProviders.ASSEMBLYAI.value: self._check_assemblyai_api_key,
|
||||
ServiceProviders.GLADIA.value: self._check_gladia_api_key,
|
||||
ServiceProviders.RIME.value: self._check_rime_api_key,
|
||||
|
|
@ -116,6 +117,22 @@ class UserConfigurationValidator:
|
|||
return [{"model": service_name, "message": str(e)}]
|
||||
return []
|
||||
|
||||
# Vertex Realtime uses service-account credentials (or ADC) instead of api_key
|
||||
if provider == ServiceProviders.GOOGLE_VERTEX_REALTIME.value:
|
||||
try:
|
||||
if not self._check_google_vertex_realtime_api_key(
|
||||
provider, service_config
|
||||
):
|
||||
return [
|
||||
{
|
||||
"model": service_name,
|
||||
"message": f"Invalid {provider} configuration",
|
||||
}
|
||||
]
|
||||
except ValueError as e:
|
||||
return [{"model": service_name, "message": str(e)}]
|
||||
return []
|
||||
|
||||
# AWS Bedrock uses AWS credentials instead of api_key
|
||||
if provider == ServiceProviders.AWS_BEDROCK.value:
|
||||
try:
|
||||
|
|
@ -216,6 +233,13 @@ class UserConfigurationValidator:
|
|||
raise ValueError("base_url is required for Speaches services")
|
||||
return True
|
||||
|
||||
def _check_google_vertex_realtime_api_key(self, model: str, service_config) -> bool:
|
||||
if not getattr(service_config, "project_id", None):
|
||||
raise ValueError("project_id is required for Google Vertex Realtime")
|
||||
if not getattr(service_config, "location", None):
|
||||
raise ValueError("location is required for Google Vertex Realtime")
|
||||
return True
|
||||
|
||||
def _check_aws_bedrock_api_key(self, model: str, service_config) -> bool:
|
||||
if not service_config.aws_access_key or not service_config.aws_secret_key:
|
||||
raise ValueError("AWS access key and secret key are required for Bedrock")
|
||||
|
|
|
|||
|
|
@ -207,6 +207,7 @@ class OpenAILLMService(BaseLLMConfiguration):
|
|||
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
|
||||
model: str = Field(
|
||||
default="gpt-4.1",
|
||||
description="OpenAI chat model to use.",
|
||||
json_schema_extra={"examples": OPENAI_MODELS, "allow_custom_input": True},
|
||||
)
|
||||
|
||||
|
|
@ -216,6 +217,7 @@ class GoogleLLMService(BaseLLMConfiguration):
|
|||
provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE
|
||||
model: str = Field(
|
||||
default="gemini-2.0-flash",
|
||||
description="Gemini model on Google AI Studio (not Vertex).",
|
||||
json_schema_extra={"examples": GOOGLE_MODELS, "allow_custom_input": True},
|
||||
)
|
||||
|
||||
|
|
@ -225,6 +227,7 @@ class GroqLLMService(BaseLLMConfiguration):
|
|||
provider: Literal[ServiceProviders.GROQ] = ServiceProviders.GROQ
|
||||
model: str = Field(
|
||||
default="llama-3.3-70b-versatile",
|
||||
description="Groq-hosted model identifier.",
|
||||
json_schema_extra={"examples": GROQ_MODELS, "allow_custom_input": True},
|
||||
)
|
||||
|
||||
|
|
@ -234,10 +237,14 @@ class OpenRouterLLMConfiguration(BaseLLMConfiguration):
|
|||
provider: Literal[ServiceProviders.OPENROUTER] = ServiceProviders.OPENROUTER
|
||||
model: str = Field(
|
||||
default="openai/gpt-4.1",
|
||||
description="OpenRouter model slug in 'vendor/model' form.",
|
||||
json_schema_extra={"examples": OPENROUTER_MODELS, "allow_custom_input": True},
|
||||
)
|
||||
|
||||
base_url: str = Field(default="https://openrouter.ai/api/v1")
|
||||
base_url: str = Field(
|
||||
default="https://openrouter.ai/api/v1",
|
||||
description="Override only if proxying OpenRouter through your own gateway.",
|
||||
)
|
||||
|
||||
|
||||
@register_llm
|
||||
|
|
@ -245,10 +252,13 @@ class AzureLLMService(BaseLLMConfiguration):
|
|||
provider: Literal[ServiceProviders.AZURE] = ServiceProviders.AZURE
|
||||
model: str = Field(
|
||||
default="gpt-4.1-mini",
|
||||
description="Azure deployment name (not the upstream OpenAI model id).",
|
||||
json_schema_extra={"examples": AZURE_MODELS, "allow_custom_input": True},
|
||||
)
|
||||
|
||||
endpoint: str
|
||||
endpoint: str = Field(
|
||||
description="Azure OpenAI resource endpoint (e.g. https://<resource>.openai.azure.com).",
|
||||
)
|
||||
|
||||
|
||||
@register_llm
|
||||
|
|
@ -256,6 +266,7 @@ class DograhLLMService(BaseLLMConfiguration):
|
|||
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
|
||||
model: str = Field(
|
||||
default="default",
|
||||
description="Dograh-hosted model tier.",
|
||||
json_schema_extra={"examples": DOGRAH_LLM_MODELS, "allow_custom_input": True},
|
||||
)
|
||||
|
||||
|
|
@ -265,12 +276,25 @@ class AWSBedrockLLMConfiguration(BaseLLMConfiguration):
|
|||
provider: Literal[ServiceProviders.AWS_BEDROCK] = ServiceProviders.AWS_BEDROCK
|
||||
model: str = Field(
|
||||
default="us.amazon.nova-pro-v1:0",
|
||||
description="Bedrock model ID — include the region inference-profile prefix (e.g. 'us.').",
|
||||
json_schema_extra={"examples": AWS_BEDROCK_MODELS, "allow_custom_input": True},
|
||||
)
|
||||
aws_access_key: str = Field(default="")
|
||||
aws_secret_key: str = Field(default="")
|
||||
aws_region: str = Field(default="us-east-1")
|
||||
api_key: str | list[str] | None = Field(default=None)
|
||||
aws_access_key: str = Field(
|
||||
default="",
|
||||
description="AWS access key ID with bedrock:InvokeModel permission.",
|
||||
)
|
||||
aws_secret_key: str = Field(
|
||||
default="",
|
||||
description="AWS secret access key paired with the access key ID.",
|
||||
)
|
||||
aws_region: str = Field(
|
||||
default="us-east-1",
|
||||
description="AWS region where the Bedrock model is available.",
|
||||
)
|
||||
api_key: str | list[str] | None = Field(
|
||||
default=None,
|
||||
description="Not used for Bedrock — authentication is via the AWS credentials above. Leave blank.",
|
||||
)
|
||||
|
||||
|
||||
SPEACHES_LLM_MODELS = ["llama3", "mistral", "phi3", "qwen2", "gemma2", "deepseek-r1"]
|
||||
|
|
@ -281,6 +305,7 @@ class SpeachesLLMConfiguration(BaseLLMConfiguration):
|
|||
provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
|
||||
model: str = Field(
|
||||
default="llama3",
|
||||
description="Model name as exposed by your OpenAI-compatible server.",
|
||||
json_schema_extra={
|
||||
"examples": SPEACHES_LLM_MODELS,
|
||||
"allow_custom_input": True,
|
||||
|
|
@ -288,9 +313,12 @@ class SpeachesLLMConfiguration(BaseLLMConfiguration):
|
|||
)
|
||||
base_url: str = Field(
|
||||
default="http://localhost:11434/v1",
|
||||
description="OpenAI-compatible endpoint (Ollama, vLLM, etc.)",
|
||||
description="OpenAI-compatible endpoint (Ollama, vLLM, etc.).",
|
||||
)
|
||||
api_key: str | list[str] | None = Field(
|
||||
default=None,
|
||||
description="Usually not required for self-hosted endpoints. Leave blank unless your server enforces one.",
|
||||
)
|
||||
api_key: str | list[str] | None = Field(default=None)
|
||||
|
||||
|
||||
OPENAI_REALTIME_MODELS = ["gpt-realtime-2"]
|
||||
|
|
@ -313,6 +341,7 @@ class OpenAIRealtimeLLMConfiguration(BaseLLMConfiguration):
|
|||
)
|
||||
model: str = Field(
|
||||
default="gpt-realtime-2",
|
||||
description="OpenAI realtime (speech-to-speech) model.",
|
||||
json_schema_extra={
|
||||
"examples": OPENAI_REALTIME_MODELS,
|
||||
"allow_custom_input": True,
|
||||
|
|
@ -320,6 +349,7 @@ class OpenAIRealtimeLLMConfiguration(BaseLLMConfiguration):
|
|||
)
|
||||
voice: str = Field(
|
||||
default="alloy",
|
||||
description="Voice the model speaks in.",
|
||||
json_schema_extra={
|
||||
"examples": OPENAI_REALTIME_VOICES,
|
||||
"allow_custom_input": True,
|
||||
|
|
@ -365,6 +395,7 @@ class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration):
|
|||
)
|
||||
model: str = Field(
|
||||
default="gemini-3.1-flash-live-preview",
|
||||
description="Gemini Live model on Google AI Studio (not Vertex).",
|
||||
json_schema_extra={
|
||||
"examples": GOOGLE_REALTIME_MODELS,
|
||||
"allow_custom_input": True,
|
||||
|
|
@ -372,6 +403,7 @@ class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration):
|
|||
)
|
||||
voice: str = Field(
|
||||
default="Puck",
|
||||
description="Voice the model speaks in.",
|
||||
json_schema_extra={
|
||||
"examples": GOOGLE_REALTIME_VOICES,
|
||||
"allow_custom_input": True,
|
||||
|
|
@ -379,6 +411,7 @@ class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration):
|
|||
)
|
||||
language: str = Field(
|
||||
default="en",
|
||||
description="ISO 639-1 language code.",
|
||||
json_schema_extra={
|
||||
"examples": GOOGLE_REALTIME_LANGUAGES,
|
||||
"allow_custom_input": True,
|
||||
|
|
@ -400,6 +433,7 @@ class GoogleVertexRealtimeLLMConfiguration(BaseLLMConfiguration):
|
|||
)
|
||||
model: str = Field(
|
||||
default="google/gemini-live-2.5-flash-native-audio",
|
||||
description="Vertex AI publisher/model identifier.",
|
||||
json_schema_extra={
|
||||
"examples": GOOGLE_VERTEX_REALTIME_MODELS,
|
||||
"allow_custom_input": True,
|
||||
|
|
@ -407,13 +441,15 @@ class GoogleVertexRealtimeLLMConfiguration(BaseLLMConfiguration):
|
|||
)
|
||||
voice: str = Field(
|
||||
default="Charon",
|
||||
description="Voice the model speaks in.",
|
||||
json_schema_extra={
|
||||
"examples": GOOGLE_VERTEX_REALTIME_VOICES,
|
||||
"allow_custom_input": True,
|
||||
},
|
||||
)
|
||||
language: str = Field(
|
||||
default="en-US",
|
||||
default="en",
|
||||
description="BCP-47 language code (e.g. 'en-US').",
|
||||
json_schema_extra={
|
||||
"examples": GOOGLE_VERTEX_REALTIME_LANGUAGES,
|
||||
"allow_custom_input": True,
|
||||
|
|
@ -427,11 +463,18 @@ class GoogleVertexRealtimeLLMConfiguration(BaseLLMConfiguration):
|
|||
credentials: str | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"Service account JSON credentials string. If omitted, falls back to "
|
||||
"Application Default Credentials (ADC)."
|
||||
"Paste the entire service-account JSON file contents. If omitted, "
|
||||
"falls back to Application Default Credentials (ADC)."
|
||||
),
|
||||
json_schema_extra={"multiline": True},
|
||||
)
|
||||
api_key: str | list[str] | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"Not used for Vertex AI — authentication is via the service account "
|
||||
"in `credentials` (or ADC). Leave blank."
|
||||
),
|
||||
)
|
||||
api_key: str | list[str] | None = Field(default=None)
|
||||
|
||||
|
||||
REALTIME_PROVIDERS = {
|
||||
|
|
@ -470,7 +513,10 @@ RealtimeConfig = Annotated[
|
|||
@register_tts
|
||||
class DeepgramTTSConfiguration(BaseServiceConfiguration):
|
||||
provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
|
||||
voice: str = "aura-2-helena-en"
|
||||
voice: str = Field(
|
||||
default="aura-2-helena-en",
|
||||
description="Deepgram voice ID (model is inferred from the 'aura-N' prefix).",
|
||||
)
|
||||
|
||||
@computed_field
|
||||
@property
|
||||
|
|
@ -492,10 +538,14 @@ ELEVENLABS_TTS_MODELS = ["eleven_flash_v2_5"]
|
|||
@register_tts
|
||||
class ElevenlabsTTSConfiguration(BaseServiceConfiguration):
|
||||
provider: Literal[ServiceProviders.ELEVENLABS] = ServiceProviders.ELEVENLABS
|
||||
voice: str = "21m00Tcm4TlvDq8ikWAM" # Rachel voice ID
|
||||
speed: float = Field(default=1.0, ge=0.1, le=2.0, description="Speed of the voice")
|
||||
voice: str = Field(
|
||||
default="21m00Tcm4TlvDq8ikWAM",
|
||||
description="ElevenLabs voice ID from your Voice Library.",
|
||||
)
|
||||
speed: float = Field(default=1.0, ge=0.1, le=2.0, description="Speed of the voice.")
|
||||
model: str = Field(
|
||||
default="eleven_flash_v2_5",
|
||||
description="ElevenLabs TTS model.",
|
||||
json_schema_extra={"examples": ELEVENLABS_TTS_MODELS},
|
||||
)
|
||||
base_url: str = Field(
|
||||
|
|
@ -515,9 +565,14 @@ OPENAI_TTS_MODELS = ["gpt-4o-mini-tts"]
|
|||
class OpenAITTSService(BaseTTSConfiguration):
|
||||
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
|
||||
model: str = Field(
|
||||
default="gpt-4o-mini-tts", json_schema_extra={"examples": OPENAI_TTS_MODELS}
|
||||
default="gpt-4o-mini-tts",
|
||||
description="OpenAI TTS model.",
|
||||
json_schema_extra={"examples": OPENAI_TTS_MODELS},
|
||||
)
|
||||
voice: str = Field(
|
||||
default="alloy",
|
||||
description="OpenAI TTS voice name.",
|
||||
)
|
||||
voice: str = "alloy"
|
||||
|
||||
|
||||
DOGRAH_TTS_MODELS = ["default"]
|
||||
|
|
@ -527,10 +582,15 @@ DOGRAH_TTS_MODELS = ["default"]
|
|||
class DograhTTSService(BaseTTSConfiguration):
|
||||
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
|
||||
model: str = Field(
|
||||
default="default", json_schema_extra={"examples": DOGRAH_TTS_MODELS}
|
||||
default="default",
|
||||
description="Dograh TTS tier.",
|
||||
json_schema_extra={"examples": DOGRAH_TTS_MODELS},
|
||||
)
|
||||
voice: str = "default"
|
||||
speed: float = Field(default=1.0, ge=0.5, le=2.0, description="Speed of the voice")
|
||||
voice: str = Field(
|
||||
default="default",
|
||||
description="Voice preset.",
|
||||
)
|
||||
speed: float = Field(default=1.0, ge=0.5, le=2.0, description="Speed of the voice.")
|
||||
|
||||
|
||||
CARTESIA_TTS_MODELS = ["sonic-3"]
|
||||
|
|
@ -540,15 +600,20 @@ CARTESIA_TTS_MODELS = ["sonic-3"]
|
|||
class CartesiaTTSConfiguration(BaseTTSConfiguration):
|
||||
provider: Literal[ServiceProviders.CARTESIA] = ServiceProviders.CARTESIA
|
||||
model: str = Field(
|
||||
default="sonic-3", json_schema_extra={"examples": CARTESIA_TTS_MODELS}
|
||||
default="sonic-3",
|
||||
description="Cartesia TTS model.",
|
||||
json_schema_extra={"examples": CARTESIA_TTS_MODELS},
|
||||
)
|
||||
voice: str = Field(default="3faa81ae-d3d8-4ab1-9e44-e50e46d33c30")
|
||||
speed: float = Field(default=1.0, ge=0.6, le=1.5, description="Speed of the voice")
|
||||
voice: str = Field(
|
||||
default="3faa81ae-d3d8-4ab1-9e44-e50e46d33c30",
|
||||
description="Cartesia voice UUID from your Cartesia dashboard.",
|
||||
)
|
||||
speed: float = Field(default=1.0, ge=0.6, le=1.5, description="Speed of the voice.")
|
||||
volume: float = Field(
|
||||
default=1.0,
|
||||
ge=0.5,
|
||||
le=2.0,
|
||||
description="Volume multiplier for generated speech",
|
||||
description="Volume multiplier for generated speech.",
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -623,10 +688,13 @@ SARVAM_LANGUAGES = [
|
|||
class SarvamTTSConfiguration(BaseTTSConfiguration):
|
||||
provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
|
||||
model: str = Field(
|
||||
default="bulbul:v2", json_schema_extra={"examples": SARVAM_TTS_MODELS}
|
||||
default="bulbul:v2",
|
||||
description="Sarvam TTS model (voice list depends on this).",
|
||||
json_schema_extra={"examples": SARVAM_TTS_MODELS},
|
||||
)
|
||||
voice: str = Field(
|
||||
default="anushka",
|
||||
description="Sarvam voice name; must match the selected model's voice list.",
|
||||
json_schema_extra={
|
||||
"examples": SARVAM_V2_VOICES,
|
||||
"model_options": {
|
||||
|
|
@ -636,7 +704,9 @@ class SarvamTTSConfiguration(BaseTTSConfiguration):
|
|||
},
|
||||
)
|
||||
language: str = Field(
|
||||
default="hi-IN", json_schema_extra={"examples": SARVAM_LANGUAGES}
|
||||
default="hi-IN",
|
||||
description="BCP-47 Indian-language code (e.g. hi-IN, en-IN).",
|
||||
json_schema_extra={"examples": SARVAM_LANGUAGES},
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -647,10 +717,12 @@ CAMB_TTS_MODELS = ["mars-flash", "mars-pro", "mars-instruct"]
|
|||
class CambTTSConfiguration(BaseTTSConfiguration):
|
||||
provider: Literal[ServiceProviders.CAMB] = ServiceProviders.CAMB
|
||||
model: str = Field(
|
||||
default="mars-flash", json_schema_extra={"examples": CAMB_TTS_MODELS}
|
||||
default="mars-flash",
|
||||
description="Camb.ai TTS model.",
|
||||
json_schema_extra={"examples": CAMB_TTS_MODELS},
|
||||
)
|
||||
voice: str = Field(default="147320", description="Camb.ai voice ID")
|
||||
language: str = Field(default="en-us", description="BCP-47 language code")
|
||||
voice: str = Field(default="147320", description="Camb.ai voice ID.")
|
||||
language: str = Field(default="en-us", description="BCP-47 language code.")
|
||||
|
||||
|
||||
RIME_TTS_MODELS = ["arcana", "mistv3", "mistv2", "mist"]
|
||||
|
|
@ -662,17 +734,19 @@ class RimeTTSConfiguration(BaseTTSConfiguration):
|
|||
provider: Literal[ServiceProviders.RIME] = ServiceProviders.RIME
|
||||
model: str = Field(
|
||||
default="arcana",
|
||||
description="Rime TTS model.",
|
||||
json_schema_extra={"examples": RIME_TTS_MODELS, "allow_custom_input": True},
|
||||
)
|
||||
voice: str = Field(
|
||||
default="celeste",
|
||||
description="Rime voice ID",
|
||||
description="Rime voice ID.",
|
||||
)
|
||||
speed: float = Field(
|
||||
default=1.0, ge=0.5, le=2.0, description="Speech speed multiplier"
|
||||
default=1.0, ge=0.5, le=2.0, description="Speech speed multiplier."
|
||||
)
|
||||
language: str = Field(
|
||||
default="en",
|
||||
description="ISO 639-1 language code.",
|
||||
json_schema_extra={"examples": RIME_TTS_LANGUAGES, "allow_custom_input": True},
|
||||
)
|
||||
|
||||
|
|
@ -685,6 +759,7 @@ class SpeachesTTSConfiguration(BaseTTSConfiguration):
|
|||
provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
|
||||
model: str = Field(
|
||||
default="kokoro",
|
||||
description="Model name as served by your TTS endpoint (e.g. Kokoro-FastAPI).",
|
||||
json_schema_extra={
|
||||
"examples": SPEACHES_TTS_MODELS,
|
||||
"allow_custom_input": True,
|
||||
|
|
@ -693,16 +768,19 @@ class SpeachesTTSConfiguration(BaseTTSConfiguration):
|
|||
voice: str = Field(
|
||||
default="af_heart",
|
||||
json_schema_extra={"allow_custom_input": True},
|
||||
description="Voice ID for the TTS engine",
|
||||
description="Voice ID for the TTS engine.",
|
||||
)
|
||||
base_url: str = Field(
|
||||
default="http://localhost:8000/v1",
|
||||
description="OpenAI-compatible TTS endpoint (Kokoro-FastAPI, etc.)",
|
||||
description="OpenAI-compatible TTS endpoint (Kokoro-FastAPI, etc.).",
|
||||
)
|
||||
speed: float = Field(
|
||||
default=1.0, ge=0.25, le=4.0, description="Speech speed (0.25 to 4.0)"
|
||||
default=1.0, ge=0.25, le=4.0, description="Speech speed (0.25 to 4.0)."
|
||||
)
|
||||
api_key: str | list[str] | None = Field(
|
||||
default=None,
|
||||
description="Usually not required for self-hosted TTS. Leave blank unless enforced.",
|
||||
)
|
||||
api_key: str | list[str] | None = Field(default=None)
|
||||
|
||||
|
||||
TTSConfig = Annotated[
|
||||
|
|
@ -813,10 +891,13 @@ DEEPGRAM_LANGUAGES = [
|
|||
class DeepgramSTTConfiguration(BaseSTTConfiguration):
|
||||
provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
|
||||
model: str = Field(
|
||||
default="nova-3-general", json_schema_extra={"examples": DEEPGRAM_STT_MODELS}
|
||||
default="nova-3-general",
|
||||
description="Deepgram STT model.",
|
||||
json_schema_extra={"examples": DEEPGRAM_STT_MODELS},
|
||||
)
|
||||
language: str = Field(
|
||||
default="multi",
|
||||
description="Language code; 'multi' enables auto-detect (Nova-3 only).",
|
||||
json_schema_extra={
|
||||
"examples": DEEPGRAM_LANGUAGES,
|
||||
"model_options": {
|
||||
|
|
@ -834,7 +915,9 @@ CARTESIA_STT_MODELS = ["ink-whisper"]
|
|||
class CartesiaSTTConfiguration(BaseSTTConfiguration):
|
||||
provider: Literal[ServiceProviders.CARTESIA] = ServiceProviders.CARTESIA
|
||||
model: str = Field(
|
||||
default="ink-whisper", json_schema_extra={"examples": CARTESIA_STT_MODELS}
|
||||
default="ink-whisper",
|
||||
description="Cartesia STT model.",
|
||||
json_schema_extra={"examples": CARTESIA_STT_MODELS},
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -845,7 +928,9 @@ OPENAI_STT_MODELS = ["gpt-4o-transcribe"]
|
|||
class OpenAISTTConfiguration(BaseSTTConfiguration):
|
||||
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
|
||||
model: str = Field(
|
||||
default="gpt-4o-transcribe", json_schema_extra={"examples": OPENAI_STT_MODELS}
|
||||
default="gpt-4o-transcribe",
|
||||
description="OpenAI transcription model.",
|
||||
json_schema_extra={"examples": OPENAI_STT_MODELS},
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -858,10 +943,14 @@ DOGRAH_STT_LANGUAGES = DEEPGRAM_LANGUAGES
|
|||
class DograhSTTService(BaseSTTConfiguration):
|
||||
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
|
||||
model: str = Field(
|
||||
default="default", json_schema_extra={"examples": DOGRAH_STT_MODELS}
|
||||
default="default",
|
||||
description="Dograh STT tier.",
|
||||
json_schema_extra={"examples": DOGRAH_STT_MODELS},
|
||||
)
|
||||
language: str = Field(
|
||||
default="multi", json_schema_extra={"examples": DOGRAH_STT_LANGUAGES}
|
||||
default="multi",
|
||||
description="Language code; use 'multi' for auto-detect.",
|
||||
json_schema_extra={"examples": DOGRAH_STT_LANGUAGES},
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -873,10 +962,14 @@ SARVAM_STT_MODELS = ["saarika:v2.5", "saaras:v2"]
|
|||
class SarvamSTTConfiguration(BaseSTTConfiguration):
|
||||
provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
|
||||
model: str = Field(
|
||||
default="saarika:v2.5", json_schema_extra={"examples": SARVAM_STT_MODELS}
|
||||
default="saarika:v2.5",
|
||||
description="Sarvam STT model.",
|
||||
json_schema_extra={"examples": SARVAM_STT_MODELS},
|
||||
)
|
||||
language: str = Field(
|
||||
default="hi-IN", json_schema_extra={"examples": SARVAM_LANGUAGES}
|
||||
default="hi-IN",
|
||||
description="BCP-47 Indian-language code.",
|
||||
json_schema_extra={"examples": SARVAM_LANGUAGES},
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -912,10 +1005,13 @@ SPEECHMATICS_STT_LANGUAGES = [
|
|||
class SpeechmaticsSTTConfiguration(BaseSTTConfiguration):
|
||||
provider: Literal[ServiceProviders.SPEECHMATICS] = ServiceProviders.SPEECHMATICS
|
||||
model: str = Field(
|
||||
default="enhanced", description="Operating point: standard or enhanced"
|
||||
default="enhanced",
|
||||
description="Speechmatics operating point: 'standard' or 'enhanced'.",
|
||||
)
|
||||
language: str = Field(
|
||||
default="en", json_schema_extra={"examples": SPEECHMATICS_STT_LANGUAGES}
|
||||
default="en",
|
||||
description="ISO 639-1 language code.",
|
||||
json_schema_extra={"examples": SPEECHMATICS_STT_LANGUAGES},
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -931,6 +1027,7 @@ class SpeachesSTTConfiguration(BaseSTTConfiguration):
|
|||
provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
|
||||
model: str = Field(
|
||||
default="Systran/faster-distil-whisper-small.en",
|
||||
description="Whisper model identifier as served by your STT endpoint.",
|
||||
json_schema_extra={
|
||||
"examples": SPEACHES_STT_MODELS,
|
||||
"allow_custom_input": True,
|
||||
|
|
@ -938,6 +1035,7 @@ class SpeachesSTTConfiguration(BaseSTTConfiguration):
|
|||
)
|
||||
language: str = Field(
|
||||
default="en",
|
||||
description="ISO 639-1 language code.",
|
||||
json_schema_extra={
|
||||
"examples": SPEACHES_STT_LANGUAGES,
|
||||
"allow_custom_input": True,
|
||||
|
|
@ -945,9 +1043,12 @@ class SpeachesSTTConfiguration(BaseSTTConfiguration):
|
|||
)
|
||||
base_url: str = Field(
|
||||
default="http://localhost:8000/v1",
|
||||
description="OpenAI-compatible STT endpoint (Speaches, etc.)",
|
||||
description="OpenAI-compatible STT endpoint (Speaches, etc.).",
|
||||
)
|
||||
api_key: str | list[str] | None = Field(
|
||||
default=None,
|
||||
description="Usually not required for self-hosted STT. Leave blank unless enforced.",
|
||||
)
|
||||
api_key: str | list[str] | None = Field(default=None)
|
||||
|
||||
|
||||
ASSEMBLYAI_STT_MODELS = ["u3-rt-pro"]
|
||||
|
|
@ -959,10 +1060,12 @@ class AssemblyAISTTConfiguration(BaseSTTConfiguration):
|
|||
provider: Literal[ServiceProviders.ASSEMBLYAI] = ServiceProviders.ASSEMBLYAI
|
||||
model: str = Field(
|
||||
default="u3-rt-pro",
|
||||
description="AssemblyAI realtime STT model.",
|
||||
json_schema_extra={"examples": ASSEMBLYAI_STT_MODELS},
|
||||
)
|
||||
language: str = Field(
|
||||
default="en",
|
||||
description="ISO 639-1 language code.",
|
||||
json_schema_extra={"examples": ASSEMBLYAI_STT_LANGUAGES},
|
||||
)
|
||||
|
||||
|
|
@ -1077,10 +1180,12 @@ class GladiaSTTConfiguration(BaseSTTConfiguration):
|
|||
provider: Literal[ServiceProviders.GLADIA] = ServiceProviders.GLADIA
|
||||
model: str = Field(
|
||||
default="solaria-1",
|
||||
description="Gladia STT model.",
|
||||
json_schema_extra={"examples": GLADIA_STT_MODELS},
|
||||
)
|
||||
language: str = Field(
|
||||
default="en",
|
||||
description="ISO 639-1 language code.",
|
||||
json_schema_extra={"examples": GLADIA_STT_LANGUAGES},
|
||||
)
|
||||
|
||||
|
|
@ -1110,6 +1215,7 @@ class OpenAIEmbeddingsConfiguration(BaseEmbeddingsConfiguration):
|
|||
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
|
||||
model: str = Field(
|
||||
default="text-embedding-3-small",
|
||||
description="OpenAI embedding model.",
|
||||
json_schema_extra={"examples": OPENAI_EMBEDDING_MODELS},
|
||||
)
|
||||
|
||||
|
|
@ -1122,10 +1228,14 @@ class OpenRouterEmbeddingsConfiguration(BaseEmbeddingsConfiguration):
|
|||
provider: Literal[ServiceProviders.OPENROUTER] = ServiceProviders.OPENROUTER
|
||||
model: str = Field(
|
||||
default="openai/text-embedding-3-small",
|
||||
description="OpenRouter-hosted embedding model slug.",
|
||||
json_schema_extra={"examples": OPENROUTER_EMBEDDING_MODELS},
|
||||
)
|
||||
|
||||
base_url: str = Field(default="https://openrouter.ai/api/v1")
|
||||
base_url: str = Field(
|
||||
default="https://openrouter.ai/api/v1",
|
||||
description="Override only if proxying OpenRouter through your own gateway.",
|
||||
)
|
||||
|
||||
|
||||
EmbeddingsConfig = Annotated[
|
||||
|
|
|
|||
|
|
@ -120,4 +120,68 @@ To use Gemini 3.1 Live with Dograh, you need a Google Gemini API key. Follow the
|
|||
|
||||
<Note>
|
||||
When using a Realtime provider like Gemini Live, you do not need to configure separate TTS and STT services — the realtime model handles speech in and out. However, you **must** still configure an **LLM** under the LLM tab: it powers variable extraction and QA analysis, which the realtime service does not perform.
|
||||
</Note>
|
||||
|
||||
## Gemini Live on Vertex AI
|
||||
|
||||
If you want to run Gemini Live through your own Google Cloud project — for billing consolidation, VPC controls, regional residency, or enterprise IAM — Dograh also supports Gemini Live via **Vertex AI** as a separate provider (`google_vertex_realtime`). The default model is `google/gemini-live-2.5-flash-native-audio`.
|
||||
|
||||
Unlike Google AI Studio (which uses a single Gemini API key), Vertex AI authenticates with a **service account** belonging to your Google Cloud project.
|
||||
|
||||
### Prerequisites
|
||||
|
||||
1. A Google Cloud project with billing enabled.
|
||||
2. The Vertex AI API enabled on that project:
|
||||
|
||||
```bash
|
||||
gcloud services enable aiplatform.googleapis.com --project=YOUR_PROJECT_ID
|
||||
```
|
||||
|
||||
3. A service account with the **Vertex AI User** role (`roles/aiplatform.user`) on the project:
|
||||
|
||||
```bash
|
||||
gcloud projects add-iam-policy-binding YOUR_PROJECT_ID \
|
||||
--member="serviceAccount:YOUR_SA@YOUR_PROJECT_ID.iam.gserviceaccount.com" \
|
||||
--role="roles/aiplatform.user"
|
||||
```
|
||||
|
||||
4. A **JSON** key for that service account (P12 keys are not supported).
|
||||
|
||||
### Creating the service account key
|
||||
|
||||
1. In the GCP Console, go to **IAM & Admin → Service Accounts**.
|
||||
2. Pick an existing service account (or create a new one).
|
||||
3. Open the **Keys** tab → **Add Key → Create new key**.
|
||||
4. Choose **JSON** as the key type and click **Create**.
|
||||
5. The key file will download to your computer — store it securely and treat it as a secret.
|
||||
|
||||
<Note>
|
||||
Always pick **JSON**, not P12. The Vertex AI client libraries used by Dograh only accept service-account JSON keys; P12 is a legacy format that Google keeps only for backward compatibility with code that still depends on it.
|
||||
</Note>
|
||||
|
||||
### Configuring Vertex AI Realtime in Dograh
|
||||
|
||||
1. Go to **Model Configurations** in your Dograh dashboard.
|
||||
2. Enable the **Realtime** toggle.
|
||||
3. Under the **Realtime** section, select `google_vertex_realtime` as the provider.
|
||||
4. Fill in the fields:
|
||||
|
||||
| Field | What to put in |
|
||||
|---|---|
|
||||
| **Model** | Vertex publisher/model id, e.g. `google/gemini-live-2.5-flash-native-audio` |
|
||||
| **Voice** | One of the built-in voices (Puck, Charon, Kore, Fenrir, Aoede) |
|
||||
| **Language** | BCP-47 code (e.g. `en-US`) |
|
||||
| **Project Id** | The `project_id` value from your service-account JSON |
|
||||
| **Location** | GCP region where the model is available (e.g. `us-east4`) |
|
||||
| **Credentials** | Paste the **entire contents** of the service-account JSON file |
|
||||
| **API Key** | Leave blank — Vertex AI does not use API keys |
|
||||
|
||||
5. Save the configuration.
|
||||
|
||||
<Note>
|
||||
Paste the whole JSON file into the **Credentials** field — including `private_key`, `client_email`, and all other entries. Don't try to extract individual fields. If `Credentials` is left blank, Dograh falls back to **Application Default Credentials (ADC)** from the host environment, which is useful when running Dograh on a GCP VM or GKE pod with an attached service account.
|
||||
</Note>
|
||||
|
||||
<Note>
|
||||
IAM changes are not instantaneous: they usually take effect within a couple of minutes, but can occasionally take longer to propagate. If you see `Permission 'aiplatform.endpoints.predict' denied`, wait a few minutes and retry — or double-check that the role was granted to the same service account whose JSON you pasted.
|
||||
</Note>
|
||||
2
pipecat
2
pipecat
|
|
@ -1 +1 @@
|
|||
Subproject commit f780c6de083d607adc7779109cad37f8b5a7030d
|
||||
Subproject commit 8590e5333d63eb69b78a193f9eeb2ff0584f9e9a
|
||||
|
|
@ -13,6 +13,7 @@ import { Label } from "@/components/ui/label";
|
|||
import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select";
|
||||
import { Switch } from "@/components/ui/switch";
|
||||
import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
|
||||
import { Textarea } from "@/components/ui/textarea";
|
||||
import { VoiceSelector } from "@/components/VoiceSelector";
|
||||
import { LANGUAGE_DISPLAY_NAMES } from "@/constants/languages";
|
||||
import { useUserConfig } from "@/context/UserConfigContext";
|
||||
|
|
@ -30,6 +31,7 @@ interface SchemaProperty {
|
|||
$ref?: string;
|
||||
description?: string;
|
||||
format?: string;
|
||||
multiline?: boolean;
|
||||
}
|
||||
|
||||
interface ProviderSchema {
|
||||
|
|
@ -501,18 +503,26 @@ export function ServiceConfigurationForm({
|
|||
|
||||
{currentProvider && providerSchema && configFields.length > 1 && (
|
||||
<div className="grid grid-cols-2 gap-4">
|
||||
{configFields.slice(1).map((field) => (
|
||||
<div key={field} className="space-y-2">
|
||||
<Label className="capitalize">{field.replace(/_/g, ' ')}</Label>
|
||||
{renderField(service, field, providerSchema)}
|
||||
</div>
|
||||
))}
|
||||
{configFields.slice(1).map((field) => {
|
||||
const fieldSchema = providerSchema.properties[field];
|
||||
const actualFieldSchema = fieldSchema?.$ref && providerSchema.$defs
|
||||
? providerSchema.$defs[fieldSchema.$ref.split('/').pop() || '']
|
||||
: fieldSchema;
|
||||
const fullWidth = actualFieldSchema?.multiline;
|
||||
return (
|
||||
<div key={field} className={`space-y-2 ${fullWidth ? "col-span-2" : ""}`}>
|
||||
<Label className="capitalize">{field.replace(/_/g, ' ')}</Label>
|
||||
{renderField(service, field, providerSchema)}
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{currentProvider && providerSchema && providerSchema.properties.api_key && (
|
||||
<div className="space-y-2">
|
||||
<Label>{mode === 'override' ? 'API Key (leave empty to use global)' : 'API Key(s)'}</Label>
|
||||
{renderFieldDescription("api_key", providerSchema)}
|
||||
{apiKeys[service].map((key, index) => (
|
||||
<div key={index} className="flex gap-2">
|
||||
<Input
|
||||
|
|
@ -564,7 +574,28 @@ export function ServiceConfigurationForm({
|
|||
);
|
||||
};
|
||||
|
||||
const renderFieldDescription = (field: string, providerSchema: ProviderSchema) => {
|
||||
const schema = providerSchema.properties[field];
|
||||
if (!schema) return null;
|
||||
const actualSchema = schema.$ref && providerSchema.$defs
|
||||
? providerSchema.$defs[schema.$ref.split('/').pop() || '']
|
||||
: schema;
|
||||
if (!actualSchema?.description) return null;
|
||||
return (
|
||||
<p className="text-xs text-muted-foreground">{actualSchema.description}</p>
|
||||
);
|
||||
};
|
||||
|
||||
const renderField = (service: ServiceSegment, field: string, providerSchema: ProviderSchema) => {
|
||||
return (
|
||||
<>
|
||||
{renderFieldInput(service, field, providerSchema)}
|
||||
{renderFieldDescription(field, providerSchema)}
|
||||
</>
|
||||
);
|
||||
};
|
||||
|
||||
const renderFieldInput = (service: ServiceSegment, field: string, providerSchema: ProviderSchema) => {
|
||||
const schema = providerSchema.properties[field];
|
||||
const actualSchema = schema.$ref && providerSchema.$defs
|
||||
? providerSchema.$defs[schema.$ref.split('/').pop() || '']
|
||||
|
|
@ -699,6 +730,19 @@ export function ServiceConfigurationForm({
|
|||
);
|
||||
}
|
||||
|
||||
if (actualSchema?.multiline) {
|
||||
return (
|
||||
<Textarea
|
||||
rows={6}
|
||||
className="font-mono text-xs"
|
||||
placeholder={`Enter ${field}`}
|
||||
{...register(`${service}_${field}`, {
|
||||
required: service !== "embeddings" && providerSchema.required?.includes(field),
|
||||
})}
|
||||
/>
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<Input
|
||||
type={actualSchema?.type === "number" ? "number" : "text"}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue