mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-07-04 10:52:17 +02:00
feat: add Azure AI multi-provider support (TTS, STT, Embeddings, Realtime) (#381)
* feat: add Azure AI multi-provider support (TTS, STT, Embeddings, Realtime)

Enables Azure AI services across all model layers so users with Azure credits can consolidate billing on a single provider.

- Voice (TTS): AzureSpeechTTSConfiguration via azure_speech provider
- Transcriber (STT): AzureSpeechSTTConfiguration via azure_speech provider
- Embedding: AzureOpenAIEmbeddingsConfiguration via azure provider
- Realtime: AzureRealtimeLLMConfiguration via azure_realtime provider

New files:
- api/services/pipecat/realtime/azure_realtime.py
- api/services/gen_ai/embedding/azure_openai_service.py
- api/tests/test_azure_speech_service_factory.py

The UI picks up all four providers automatically from the schema — no frontend changes required.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix: add validation for URL and params

---------

Co-authored-by: Vishal Dhateria <vishal@finela.ai>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
Co-authored-by: Abhishek Kumar <abhishek@a6k.me>
This commit is contained in:
parent
8a4a2e25db
commit
7ba95c0fbe
15 changed files with 1082 additions and 29 deletions
|
|
@ -41,6 +41,7 @@ class UserConfigurationValidator:
|
|||
ServiceProviders.ELEVENLABS.value: self._validate_elevenlabs_api_key,
|
||||
ServiceProviders.GOOGLE.value: self._check_google_api_key,
|
||||
ServiceProviders.AZURE.value: self._check_azure_api_key,
|
||||
ServiceProviders.AZURE_SPEECH.value: self._check_azure_speech_api_key,
|
||||
ServiceProviders.CARTESIA.value: self._check_cartesia_api_key,
|
||||
ServiceProviders.DOGRAH.value: self._check_dograh_api_key,
|
||||
ServiceProviders.SARVAM.value: self._check_sarvam_api_key,
|
||||
|
|
@ -54,6 +55,7 @@ class UserConfigurationValidator:
|
|||
ServiceProviders.ULTRAVOX_REALTIME.value: self._check_ultravox_realtime_api_key,
|
||||
ServiceProviders.GOOGLE_REALTIME.value: self._check_google_api_key,
|
||||
ServiceProviders.GOOGLE_VERTEX_REALTIME.value: self._check_google_vertex_realtime_api_key,
|
||||
ServiceProviders.AZURE_REALTIME.value: self._check_azure_realtime_api_key,
|
||||
ServiceProviders.ASSEMBLYAI.value: self._check_assemblyai_api_key,
|
||||
ServiceProviders.GLADIA.value: self._check_gladia_api_key,
|
||||
ServiceProviders.RIME.value: self._check_rime_api_key,
|
||||
|
|
@ -313,6 +315,12 @@ class UserConfigurationValidator:
|
|||
def _check_azure_api_key(self, model: str, api_key: str) -> bool:
|
||||
return True
|
||||
|
||||
def _check_azure_speech_api_key(self, model: str, api_key: str) -> bool:
|
||||
return True
|
||||
|
||||
def _check_azure_realtime_api_key(self, model: str, api_key: str) -> bool:
|
||||
return True
|
||||
|
||||
def _check_cartesia_api_key(self, model: str, api_key: str) -> bool:
|
||||
return True
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,14 @@
|
|||
from .azure import (
|
||||
AZURE_EMBEDDING_MODELS,
|
||||
AZURE_MODELS,
|
||||
AZURE_REALTIME_API_VERSIONS,
|
||||
AZURE_REALTIME_MODELS,
|
||||
AZURE_REALTIME_VOICES,
|
||||
AZURE_SPEECH_REGIONS,
|
||||
AZURE_SPEECH_STT_LANGUAGES,
|
||||
AZURE_SPEECH_TTS_LANGUAGES,
|
||||
AZURE_SPEECH_TTS_VOICES,
|
||||
)
|
||||
from .deepgram import DEEPGRAM_LANGUAGES, DEEPGRAM_STT_MODELS
|
||||
from .gladia import GLADIA_STT_LANGUAGES, GLADIA_STT_MODELS
|
||||
from .google import (
|
||||
|
|
@ -27,6 +38,15 @@ from .sarvam import (
|
|||
from .speechmatics import SPEECHMATICS_STT_LANGUAGES
|
||||
|
||||
__all__ = [
|
||||
"AZURE_EMBEDDING_MODELS",
|
||||
"AZURE_MODELS",
|
||||
"AZURE_REALTIME_API_VERSIONS",
|
||||
"AZURE_REALTIME_MODELS",
|
||||
"AZURE_REALTIME_VOICES",
|
||||
"AZURE_SPEECH_REGIONS",
|
||||
"AZURE_SPEECH_STT_LANGUAGES",
|
||||
"AZURE_SPEECH_TTS_LANGUAGES",
|
||||
"AZURE_SPEECH_TTS_VOICES",
|
||||
"DEEPGRAM_LANGUAGES",
|
||||
"DEEPGRAM_STT_MODELS",
|
||||
"GLADIA_STT_LANGUAGES",
|
||||
|
|
|
|||
125
api/services/configuration/options/azure.py
Normal file
125
api/services/configuration/options/azure.py
Normal file
|
|
@ -0,0 +1,125 @@
|
|||
"""Selectable option lists for the Azure-backed service configurations.

Every constant is a plain list of strings. The configuration models use
them as suggested values (``examples``) for their fields, so the order of
each list is the order in which the suggestions are presented.
"""

# Suggested Azure OpenAI chat deployment names.
AZURE_MODELS = ["gpt-4.1-mini"]

# Suggested Azure OpenAI realtime deployment names.
AZURE_REALTIME_MODELS = ["gpt-4o-realtime-preview"]

# Voices offered for the realtime (speech-to-speech) model.
AZURE_REALTIME_VOICES = [
    "alloy",
    "ash",
    "ballad",
    "coral",
    "echo",
    "sage",
    "shimmer",
    "verse",
]

# Values offered for the Azure OpenAI ``api-version`` of the realtime API.
# "2024-12-17" used to be listed here, but that is a *model* version of
# gpt-4o-realtime-preview, not an API version, so it is no longer suggested.
AZURE_REALTIME_API_VERSIONS = [
    "2025-04-01-preview",
    "2024-10-01-preview",
]

# Region identifiers suggested for the Azure Speech ``region`` field.
AZURE_SPEECH_REGIONS = [
    "eastus",
    "eastus2",
    "westus",
    "westus2",
    "westus3",
    "centralus",
    "northcentralus",
    "southcentralus",
    "westcentralus",
    "westeurope",
    "northeurope",
    "uksouth",
    "ukwest",
    "francecentral",
    "switzerlandnorth",
    "germanywestcentral",
    "norwayeast",
    "australiaeast",
    "eastasia",
    "southeastasia",
    "japaneast",
    "japanwest",
    "koreacentral",
    "centralindia",
    "southindia",
    "brazilsouth",
]

# BCP-47 language codes suggested for Azure Speech synthesis (TTS).
AZURE_SPEECH_TTS_LANGUAGES = [
    "en-US",
    "en-GB",
    "en-AU",
    "en-CA",
    "en-IN",
    "es-ES",
    "es-MX",
    "fr-FR",
    "fr-CA",
    "de-DE",
    "it-IT",
    "ja-JP",
    "ko-KR",
    "zh-CN",
    "zh-HK",
    "zh-TW",
    "pt-BR",
    "pt-PT",
    "ru-RU",
    "ar-SA",
    "nl-NL",
    "pl-PL",
    "sv-SE",
    "hi-IN",
]

# Azure neural voice names suggested for synthesis (all en-US).
AZURE_SPEECH_TTS_VOICES = [
    "en-US-AriaNeural",
    "en-US-GuyNeural",
    "en-US-JennyNeural",
    "en-US-DavisNeural",
    "en-US-AmberNeural",
    "en-US-AnaNeural",
    "en-US-AshleyNeural",
    "en-US-BrandonNeural",
    "en-US-ChristopherNeural",
    "en-US-ElizabethNeural",
    "en-US-EricNeural",
    "en-US-JacobNeural",
    "en-US-MichelleNeural",
    "en-US-MonicaNeural",
    "en-US-NancyNeural",
    "en-US-RogerNeural",
    "en-US-SaraNeural",
    "en-US-SteffanNeural",
    "en-US-TonyNeural",
]

# BCP-47 language codes suggested for Azure Speech recognition (STT).
AZURE_SPEECH_STT_LANGUAGES = [
    "en-US",
    "en-GB",
    "en-AU",
    "en-CA",
    "en-IN",
    "es-ES",
    "es-MX",
    "fr-FR",
    "fr-CA",
    "de-DE",
    "it-IT",
    "ja-JP",
    "ko-KR",
    "zh-CN",
    "pt-BR",
    "pt-PT",
    "ru-RU",
    "ar-SA",
    "nl-NL",
    "pl-PL",
    "hi-IN",
]

# Suggested Azure OpenAI embedding deployment names.
AZURE_EMBEDDING_MODELS = [
    "text-embedding-3-small",
    "text-embedding-ada-002",
]
|
||||
|
|
@ -5,6 +5,15 @@ from typing import Annotated, Dict, Literal, Type, TypeVar, Union
|
|||
from pydantic import BaseModel, ConfigDict, Field, computed_field, field_validator
|
||||
|
||||
from api.services.configuration.options import (
|
||||
AZURE_EMBEDDING_MODELS,
|
||||
AZURE_MODELS,
|
||||
AZURE_REALTIME_API_VERSIONS,
|
||||
AZURE_REALTIME_MODELS,
|
||||
AZURE_REALTIME_VOICES,
|
||||
AZURE_SPEECH_REGIONS,
|
||||
AZURE_SPEECH_STT_LANGUAGES,
|
||||
AZURE_SPEECH_TTS_LANGUAGES,
|
||||
AZURE_SPEECH_TTS_VOICES,
|
||||
DEEPGRAM_LANGUAGES,
|
||||
DEEPGRAM_STT_MODELS,
|
||||
GLADIA_STT_LANGUAGES,
|
||||
|
|
@ -52,6 +61,7 @@ class ServiceProviders(str, Enum):
|
|||
ELEVENLABS = "elevenlabs"
|
||||
GOOGLE = "google"
|
||||
AZURE = "azure"
|
||||
AZURE_SPEECH = "azure_speech"
|
||||
DOGRAH = "dograh"
|
||||
SARVAM = "sarvam"
|
||||
SPEECHMATICS = "speechmatics"
|
||||
|
|
@ -68,6 +78,7 @@ class ServiceProviders(str, Enum):
|
|||
ULTRAVOX_REALTIME = "ultravox_realtime"
|
||||
GOOGLE_REALTIME = "google_realtime"
|
||||
GOOGLE_VERTEX_REALTIME = "google_vertex_realtime"
|
||||
AZURE_REALTIME = "azure_realtime"
|
||||
|
||||
|
||||
class BaseServiceConfiguration(BaseModel):
|
||||
|
|
@ -79,6 +90,7 @@ class BaseServiceConfiguration(BaseModel):
|
|||
ServiceProviders.ELEVENLABS,
|
||||
ServiceProviders.GOOGLE,
|
||||
ServiceProviders.AZURE,
|
||||
ServiceProviders.AZURE_SPEECH,
|
||||
ServiceProviders.DOGRAH,
|
||||
ServiceProviders.AWS_BEDROCK,
|
||||
ServiceProviders.SPEACHES,
|
||||
|
|
@ -92,6 +104,7 @@ class BaseServiceConfiguration(BaseModel):
|
|||
ServiceProviders.ULTRAVOX_REALTIME,
|
||||
ServiceProviders.GOOGLE_REALTIME,
|
||||
ServiceProviders.GOOGLE_VERTEX_REALTIME,
|
||||
ServiceProviders.AZURE_REALTIME,
|
||||
ServiceProviders.SARVAM,
|
||||
]
|
||||
api_key: str | list[str]
|
||||
|
|
@ -242,6 +255,16 @@ SPEACHES_PROVIDER_MODEL_CONFIG = provider_model_config(
|
|||
),
|
||||
provider_docs_url="https://github.com/speaches-ai/speaches",
|
||||
)
|
||||
# Provider metadata used by both the Azure Speech TTS and STT configurations.
AZURE_SPEECH_PROVIDER_MODEL_CONFIG = provider_model_config(
    "Azure Speech Services",
    provider_docs_url="https://learn.microsoft.com/en-us/azure/ai-services/speech-service/",
    description="Azure Cognitive Services Speech — TTS and STT via the Azure Speech SDK.",
)

# Provider metadata used by the Azure OpenAI realtime configuration.
AZURE_REALTIME_PROVIDER_MODEL_CONFIG = provider_model_config(
    "Azure OpenAI Realtime",
    provider_docs_url="https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/realtime-audio-quickstart",
    description="Azure OpenAI Realtime API — low-latency speech-to-speech conversations.",
)
|
||||
|
||||
OPENAI_MODELS = [
|
||||
"gpt-4.1",
|
||||
|
|
@ -272,7 +295,6 @@ OPENROUTER_MODELS = [
|
|||
"meta-llama/llama-3.3-70b-instruct",
|
||||
"deepseek/deepseek-chat-v3-0324",
|
||||
]
|
||||
AZURE_MODELS = ["gpt-4.1-mini"]
|
||||
DOGRAH_LLM_MODELS = ["default", "accurate", "fast", "lite", "zen"]
|
||||
AWS_BEDROCK_MODELS = [
|
||||
"us.amazon.nova-pro-v1:0",
|
||||
|
|
@ -666,12 +688,45 @@ class GoogleVertexRealtimeLLMConfiguration(BaseLLMConfiguration):
|
|||
)
|
||||
|
||||
|
||||
# Realtime configuration for an Azure OpenAI deployment.
# Documented with comments rather than a class docstring so that nothing
# extra can leak into the schema generated from this model.
@register_service(ServiceType.REALTIME)
class AzureRealtimeLLMConfiguration(BaseLLMConfiguration):
    model_config = AZURE_REALTIME_PROVIDER_MODEL_CONFIG

    # Discriminator value that selects this configuration.
    provider: Literal[ServiceProviders.AZURE_REALTIME] = ServiceProviders.AZURE_REALTIME

    # Deployment name; free-form input is allowed beyond the suggestions.
    model: str = Field(
        description="Azure OpenAI realtime deployment name.",
        default="gpt-4o-realtime-preview",
        json_schema_extra={"examples": AZURE_REALTIME_MODELS, "allow_custom_input": True},
    )

    # Required: there is no default endpoint.
    endpoint: str = Field(
        description="Azure OpenAI resource endpoint (e.g. https://<resource>.openai.azure.com).",
    )

    voice: str = Field(
        description="Voice the model speaks in.",
        default="alloy",
        json_schema_extra={"examples": AZURE_REALTIME_VOICES, "allow_custom_input": True},
    )

    api_version: str = Field(
        description="Azure OpenAI API version.",
        default="2025-04-01-preview",
        json_schema_extra={"examples": AZURE_REALTIME_API_VERSIONS},
    )
|
||||
|
||||
|
||||
# String values of every provider that is a realtime (speech-to-speech) backend.
REALTIME_PROVIDERS = {
    realtime_provider.value
    for realtime_provider in (
        ServiceProviders.OPENAI_REALTIME,
        ServiceProviders.GROK_REALTIME,
        ServiceProviders.ULTRAVOX_REALTIME,
        ServiceProviders.GOOGLE_REALTIME,
        ServiceProviders.GOOGLE_VERTEX_REALTIME,
        ServiceProviders.AZURE_REALTIME,
    )
}
|
||||
|
||||
|
||||
|
|
@ -699,6 +754,7 @@ RealtimeConfig = Annotated[
|
|||
UltravoxRealtimeLLMConfiguration,
|
||||
GoogleRealtimeLLMConfiguration,
|
||||
GoogleVertexRealtimeLLMConfiguration,
|
||||
AzureRealtimeLLMConfiguration,
|
||||
],
|
||||
Field(discriminator="provider"),
|
||||
]
|
||||
|
|
@ -1024,6 +1080,46 @@ class MiniMaxTTSConfiguration(BaseTTSConfiguration):
|
|||
)
|
||||
|
||||
|
||||
# Text-to-speech configuration for Azure Speech.
# Documented with comments rather than a class docstring so that nothing
# extra can leak into the schema generated from this model.
@register_tts
class AzureSpeechTTSConfiguration(BaseTTSConfiguration):
    model_config = AZURE_SPEECH_PROVIDER_MODEL_CONFIG

    # Discriminator value that selects this configuration.
    provider: Literal[ServiceProviders.AZURE_SPEECH] = ServiceProviders.AZURE_SPEECH

    model: str = Field(
        description="Azure Speech synthesis engine (neural voices only).",
        default="neural",
        json_schema_extra={"examples": ["neural"]},
    )

    region: str = Field(
        description="Azure region for Speech Services (e.g. 'eastus', 'westeurope').",
        default="eastus",
        json_schema_extra={"examples": AZURE_SPEECH_REGIONS},
    )

    # Free-form input is allowed beyond the suggested voices.
    voice: str = Field(
        description="Azure Neural voice name (e.g. 'en-US-AriaNeural').",
        default="en-US-AriaNeural",
        json_schema_extra={"examples": AZURE_SPEECH_TTS_VOICES, "allow_custom_input": True},
    )

    language: str = Field(
        description="BCP-47 language code for synthesis.",
        default="en-US",
        json_schema_extra={"examples": AZURE_SPEECH_TTS_LANGUAGES, "allow_custom_input": True},
    )

    # Bounded on both sides: values outside [0.5, 2.0] fail validation.
    speed: float = Field(
        description="Speech speed multiplier (0.5 to 2.0).",
        default=1.0,
        ge=0.5,
        le=2.0,
    )
|
||||
|
||||
|
||||
TTSConfig = Annotated[
|
||||
Union[
|
||||
DeepgramTTSConfiguration,
|
||||
|
|
@ -1037,6 +1133,7 @@ TTSConfig = Annotated[
|
|||
RimeTTSConfiguration,
|
||||
SpeachesTTSConfiguration,
|
||||
MiniMaxTTSConfiguration,
|
||||
AzureSpeechTTSConfiguration,
|
||||
],
|
||||
Field(discriminator="provider"),
|
||||
]
|
||||
|
|
@ -1273,6 +1370,32 @@ class GladiaSTTConfiguration(BaseSTTConfiguration):
|
|||
)
|
||||
|
||||
|
||||
# Speech-to-text configuration for Azure Speech.
# Documented with comments rather than a class docstring so that nothing
# extra can leak into the schema generated from this model.
@register_stt
class AzureSpeechSTTConfiguration(BaseSTTConfiguration):
    model_config = AZURE_SPEECH_PROVIDER_MODEL_CONFIG

    # Discriminator value that selects this configuration.
    provider: Literal[ServiceProviders.AZURE_SPEECH] = ServiceProviders.AZURE_SPEECH

    model: str = Field(
        description="Azure Speech recognition model (use 'latest_long' for continuous recognition).",
        default="latest_long",
        json_schema_extra={"examples": ["latest_long", "latest_short"]},
    )

    region: str = Field(
        description="Azure region for Speech Services (e.g. 'eastus', 'westeurope').",
        default="eastus",
        json_schema_extra={"examples": AZURE_SPEECH_REGIONS},
    )

    # Free-form input is allowed beyond the suggested languages.
    language: str = Field(
        description="BCP-47 language code for recognition.",
        default="en-US",
        json_schema_extra={"examples": AZURE_SPEECH_STT_LANGUAGES, "allow_custom_input": True},
    )
|
||||
|
||||
|
||||
STTConfig = Annotated[
|
||||
Union[
|
||||
DeepgramSTTConfiguration,
|
||||
|
|
@ -1285,6 +1408,7 @@ STTConfig = Annotated[
|
|||
SpeachesSTTConfiguration,
|
||||
AssemblyAISTTConfiguration,
|
||||
GladiaSTTConfiguration,
|
||||
AzureSpeechSTTConfiguration,
|
||||
],
|
||||
Field(discriminator="provider"),
|
||||
]
|
||||
|
|
@ -1324,8 +1448,36 @@ class OpenRouterEmbeddingsConfiguration(BaseEmbeddingsConfiguration):
|
|||
)
|
||||
|
||||
|
||||
# Embeddings configuration for an Azure OpenAI deployment.
# Documented with comments rather than a class docstring so that nothing
# extra can leak into the schema generated from this model.
@register_embeddings
class AzureOpenAIEmbeddingsConfiguration(BaseEmbeddingsConfiguration):
    model_config = AZURE_OPENAI_PROVIDER_MODEL_CONFIG

    # Discriminator value that selects this configuration.
    provider: Literal[ServiceProviders.AZURE] = ServiceProviders.AZURE

    # Deployment name; free-form input is allowed beyond the suggestions.
    model: str = Field(
        description=(
            "Azure OpenAI embedding deployment name. The deployment must return "
            "1536-dimensional embeddings."
        ),
        default="text-embedding-3-small",
        json_schema_extra={"examples": AZURE_EMBEDDING_MODELS, "allow_custom_input": True},
    )

    # Required: there is no default endpoint.
    endpoint: str = Field(
        description="Azure OpenAI resource endpoint (e.g. https://<resource>.openai.azure.com).",
    )

    api_version: str = Field(
        description="Azure OpenAI API version for embeddings.",
        default="2024-02-15-preview",
    )
|
||||
|
||||
|
||||
# Discriminated union of every embeddings configuration, selected by the
# ``provider`` field. Only the three-member union is kept: with the older
# two-member ``Union[...]`` left in front of it, ``Annotated`` would take the
# old union as the type and treat the new one as metadata, so the Azure
# configuration would not be part of the type.
EmbeddingsConfig = Annotated[
    Union[
        OpenAIEmbeddingsConfiguration,
        OpenRouterEmbeddingsConfiguration,
        AzureOpenAIEmbeddingsConfiguration,
    ],
    Field(discriminator="provider"),
]
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue