feat: add Azure AI multi-provider support (TTS, STT, Embeddings, Realtime) (#381)

* feat: add Azure AI multi-provider support (TTS, STT, Embeddings, Realtime) Enables Azure AI services across all model layers so users with Azure credits can consolidate billing on a single provider. - Voice (TTS): AzureSpeechTTSConfiguration via azure_speech provider - Transcriber (STT): AzureSpeechSTTConfiguration via azure_speech provider - Embedding: AzureOpenAIEmbeddingsConfiguration via azure provider - Realtime: AzureRealtimeLLMConfiguration via azure_realtime provider New files: - api/services/pipecat/realtime/azure_realtime.py - api/services/gen_ai/embedding/azure_openai_service.py - api/tests/test_azure_speech_service_factory.py The UI picks up all four providers automatically from the schema — no frontend changes required. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * fix: add validation for URL and params --------- Co-authored-by: Vishal Dhateria <vishal@finela.ai> Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com> Co-authored-by: Abhishek Kumar <abhishek@a6k.me>
2026-07-04 10:52:17 +02:00 · 2026-06-02 12:50:00 +05:30 · 2026-06-02 12:50:00 +05:30 · 7ba95c0fbe
commit 7ba95c0fbe
parent 8a4a2e25db
15 changed files with 1082 additions and 29 deletions
--- a/api/services/configuration/check_validity.py
+++ b/api/services/configuration/check_validity.py
@ -41,6 +41,7 @@ class UserConfigurationValidator:
            ServiceProviders.ELEVENLABS.value: self._validate_elevenlabs_api_key,
            ServiceProviders.GOOGLE.value: self._check_google_api_key,
            ServiceProviders.AZURE.value: self._check_azure_api_key,
+            ServiceProviders.AZURE_SPEECH.value: self._check_azure_speech_api_key,
            ServiceProviders.CARTESIA.value: self._check_cartesia_api_key,
            ServiceProviders.DOGRAH.value: self._check_dograh_api_key,
            ServiceProviders.SARVAM.value: self._check_sarvam_api_key,
@ -54,6 +55,7 @@ class UserConfigurationValidator:
            ServiceProviders.ULTRAVOX_REALTIME.value: self._check_ultravox_realtime_api_key,
            ServiceProviders.GOOGLE_REALTIME.value: self._check_google_api_key,
            ServiceProviders.GOOGLE_VERTEX_REALTIME.value: self._check_google_vertex_realtime_api_key,
+            ServiceProviders.AZURE_REALTIME.value: self._check_azure_realtime_api_key,
            ServiceProviders.ASSEMBLYAI.value: self._check_assemblyai_api_key,
            ServiceProviders.GLADIA.value: self._check_gladia_api_key,
            ServiceProviders.RIME.value: self._check_rime_api_key,
@ -313,6 +315,12 @@ class UserConfigurationValidator:
    def _check_azure_api_key(self, model: str, api_key: str) -> bool:
        return True

+    def _check_azure_speech_api_key(self, model: str, api_key: str) -> bool:
+        return True
+
+    def _check_azure_realtime_api_key(self, model: str, api_key: str) -> bool:
+        return True
+
    def _check_cartesia_api_key(self, model: str, api_key: str) -> bool:
        return True

--- a/api/services/configuration/options/init.py
+++ b/api/services/configuration/options/init.py
@ -1,3 +1,14 @@
+from .azure import (
+    AZURE_EMBEDDING_MODELS,
+    AZURE_MODELS,
+    AZURE_REALTIME_API_VERSIONS,
+    AZURE_REALTIME_MODELS,
+    AZURE_REALTIME_VOICES,
+    AZURE_SPEECH_REGIONS,
+    AZURE_SPEECH_STT_LANGUAGES,
+    AZURE_SPEECH_TTS_LANGUAGES,
+    AZURE_SPEECH_TTS_VOICES,
+)
 from .deepgram import DEEPGRAM_LANGUAGES, DEEPGRAM_STT_MODELS
 from .gladia import GLADIA_STT_LANGUAGES, GLADIA_STT_MODELS
 from .google import (
@ -27,6 +38,15 @@ from .sarvam import (
 from .speechmatics import SPEECHMATICS_STT_LANGUAGES

 __all__ = [
+    "AZURE_EMBEDDING_MODELS",
+    "AZURE_MODELS",
+    "AZURE_REALTIME_API_VERSIONS",
+    "AZURE_REALTIME_MODELS",
+    "AZURE_REALTIME_VOICES",
+    "AZURE_SPEECH_REGIONS",
+    "AZURE_SPEECH_STT_LANGUAGES",
+    "AZURE_SPEECH_TTS_LANGUAGES",
+    "AZURE_SPEECH_TTS_VOICES",
    "DEEPGRAM_LANGUAGES",
    "DEEPGRAM_STT_MODELS",
    "GLADIA_STT_LANGUAGES",
--- a/api/services/configuration/options/azure.py
+++ b/api/services/configuration/options/azure.py
@ -0,0 +1,125 @@
+AZURE_MODELS = ["gpt-4.1-mini"]
+
+AZURE_REALTIME_MODELS = ["gpt-4o-realtime-preview"]
+AZURE_REALTIME_VOICES = [
+    "alloy",
+    "ash",
+    "ballad",
+    "coral",
+    "echo",
+    "sage",
+    "shimmer",
+    "verse",
+]
+AZURE_REALTIME_API_VERSIONS = [
+    "2025-04-01-preview",
+    "2024-10-01-preview",
+    "2024-12-17",
+]
+
+AZURE_SPEECH_REGIONS = [
+    "eastus",
+    "eastus2",
+    "westus",
+    "westus2",
+    "westus3",
+    "centralus",
+    "northcentralus",
+    "southcentralus",
+    "westcentralus",
+    "westeurope",
+    "northeurope",
+    "uksouth",
+    "ukwest",
+    "francecentral",
+    "switzerlandnorth",
+    "germanywestcentral",
+    "norwayeast",
+    "australiaeast",
+    "eastasia",
+    "southeastasia",
+    "japaneast",
+    "japanwest",
+    "koreacentral",
+    "centralindia",
+    "southindia",
+    "brazilsouth",
+]
+
+AZURE_SPEECH_TTS_LANGUAGES = [
+    "en-US",
+    "en-GB",
+    "en-AU",
+    "en-CA",
+    "en-IN",
+    "es-ES",
+    "es-MX",
+    "fr-FR",
+    "fr-CA",
+    "de-DE",
+    "it-IT",
+    "ja-JP",
+    "ko-KR",
+    "zh-CN",
+    "zh-HK",
+    "zh-TW",
+    "pt-BR",
+    "pt-PT",
+    "ru-RU",
+    "ar-SA",
+    "nl-NL",
+    "pl-PL",
+    "sv-SE",
+    "hi-IN",
+]
+
+AZURE_SPEECH_TTS_VOICES = [
+    "en-US-AriaNeural",
+    "en-US-GuyNeural",
+    "en-US-JennyNeural",
+    "en-US-DavisNeural",
+    "en-US-AmberNeural",
+    "en-US-AnaNeural",
+    "en-US-AshleyNeural",
+    "en-US-BrandonNeural",
+    "en-US-ChristopherNeural",
+    "en-US-ElizabethNeural",
+    "en-US-EricNeural",
+    "en-US-JacobNeural",
+    "en-US-MichelleNeural",
+    "en-US-MonicaNeural",
+    "en-US-NancyNeural",
+    "en-US-RogerNeural",
+    "en-US-SaraNeural",
+    "en-US-SteffanNeural",
+    "en-US-TonyNeural",
+]
+
+AZURE_SPEECH_STT_LANGUAGES = [
+    "en-US",
+    "en-GB",
+    "en-AU",
+    "en-CA",
+    "en-IN",
+    "es-ES",
+    "es-MX",
+    "fr-FR",
+    "fr-CA",
+    "de-DE",
+    "it-IT",
+    "ja-JP",
+    "ko-KR",
+    "zh-CN",
+    "pt-BR",
+    "pt-PT",
+    "ru-RU",
+    "ar-SA",
+    "nl-NL",
+    "pl-PL",
+    "hi-IN",
+]
+
+AZURE_EMBEDDING_MODELS = [
+    "text-embedding-3-small",
+    "text-embedding-ada-002",
+]
--- a/api/services/configuration/registry.py
+++ b/api/services/configuration/registry.py
@ -5,6 +5,15 @@ from typing import Annotated, Dict, Literal, Type, TypeVar, Union
 from pydantic import BaseModel, ConfigDict, Field, computed_field, field_validator

 from api.services.configuration.options import (
+    AZURE_EMBEDDING_MODELS,
+    AZURE_MODELS,
+    AZURE_REALTIME_API_VERSIONS,
+    AZURE_REALTIME_MODELS,
+    AZURE_REALTIME_VOICES,
+    AZURE_SPEECH_REGIONS,
+    AZURE_SPEECH_STT_LANGUAGES,
+    AZURE_SPEECH_TTS_LANGUAGES,
+    AZURE_SPEECH_TTS_VOICES,
    DEEPGRAM_LANGUAGES,
    DEEPGRAM_STT_MODELS,
    GLADIA_STT_LANGUAGES,
@ -52,6 +61,7 @@ class ServiceProviders(str, Enum):
    ELEVENLABS = "elevenlabs"
    GOOGLE = "google"
    AZURE = "azure"
+    AZURE_SPEECH = "azure_speech"
    DOGRAH = "dograh"
    SARVAM = "sarvam"
    SPEECHMATICS = "speechmatics"
@ -68,6 +78,7 @@ class ServiceProviders(str, Enum):
    ULTRAVOX_REALTIME = "ultravox_realtime"
    GOOGLE_REALTIME = "google_realtime"
    GOOGLE_VERTEX_REALTIME = "google_vertex_realtime"
+    AZURE_REALTIME = "azure_realtime"


 class BaseServiceConfiguration(BaseModel):
@ -79,6 +90,7 @@ class BaseServiceConfiguration(BaseModel):
        ServiceProviders.ELEVENLABS,
        ServiceProviders.GOOGLE,
        ServiceProviders.AZURE,
+        ServiceProviders.AZURE_SPEECH,
        ServiceProviders.DOGRAH,
        ServiceProviders.AWS_BEDROCK,
        ServiceProviders.SPEACHES,
@ -92,6 +104,7 @@ class BaseServiceConfiguration(BaseModel):
        ServiceProviders.ULTRAVOX_REALTIME,
        ServiceProviders.GOOGLE_REALTIME,
        ServiceProviders.GOOGLE_VERTEX_REALTIME,
+        ServiceProviders.AZURE_REALTIME,
        ServiceProviders.SARVAM,
    ]
    api_key: str | list[str]
@ -242,6 +255,16 @@ SPEACHES_PROVIDER_MODEL_CONFIG = provider_model_config(
    ),
    provider_docs_url="https://github.com/speaches-ai/speaches",
 )
+AZURE_SPEECH_PROVIDER_MODEL_CONFIG = provider_model_config(
+    "Azure Speech Services",
+    description="Azure Cognitive Services Speech — TTS and STT via the Azure Speech SDK.",
+    provider_docs_url="https://learn.microsoft.com/en-us/azure/ai-services/speech-service/",
+)
+AZURE_REALTIME_PROVIDER_MODEL_CONFIG = provider_model_config(
+    "Azure OpenAI Realtime",
+    description="Azure OpenAI Realtime API — low-latency speech-to-speech conversations.",
+    provider_docs_url="https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/realtime-audio-quickstart",
+)

 OPENAI_MODELS = [
    "gpt-4.1",
@ -272,7 +295,6 @@ OPENROUTER_MODELS = [
    "meta-llama/llama-3.3-70b-instruct",
    "deepseek/deepseek-chat-v3-0324",
 ]
-AZURE_MODELS = ["gpt-4.1-mini"]
 DOGRAH_LLM_MODELS = ["default", "accurate", "fast", "lite", "zen"]
 AWS_BEDROCK_MODELS = [
    "us.amazon.nova-pro-v1:0",
@ -666,12 +688,45 @@ class GoogleVertexRealtimeLLMConfiguration(BaseLLMConfiguration):
    )


+@register_service(ServiceType.REALTIME)
+class AzureRealtimeLLMConfiguration(BaseLLMConfiguration):
+    model_config = AZURE_REALTIME_PROVIDER_MODEL_CONFIG
+    provider: Literal[ServiceProviders.AZURE_REALTIME] = ServiceProviders.AZURE_REALTIME
+    model: str = Field(
+        default="gpt-4o-realtime-preview",
+        description="Azure OpenAI realtime deployment name.",
+        json_schema_extra={
+            "examples": AZURE_REALTIME_MODELS,
+            "allow_custom_input": True,
+        },
+    )
+    endpoint: str = Field(
+        description="Azure OpenAI resource endpoint (e.g. https://<resource>.openai.azure.com).",
+    )
+    voice: str = Field(
+        default="alloy",
+        description="Voice the model speaks in.",
+        json_schema_extra={
+            "examples": AZURE_REALTIME_VOICES,
+            "allow_custom_input": True,
+        },
+    )
+    api_version: str = Field(
+        default="2025-04-01-preview",
+        description="Azure OpenAI API version.",
+        json_schema_extra={
+            "examples": AZURE_REALTIME_API_VERSIONS,
+        },
+    )
+
+
 REALTIME_PROVIDERS = {
    ServiceProviders.OPENAI_REALTIME.value,
    ServiceProviders.GROK_REALTIME.value,
    ServiceProviders.ULTRAVOX_REALTIME.value,
    ServiceProviders.GOOGLE_REALTIME.value,
    ServiceProviders.GOOGLE_VERTEX_REALTIME.value,
+    ServiceProviders.AZURE_REALTIME.value,
 }


@ -699,6 +754,7 @@ RealtimeConfig = Annotated[
        UltravoxRealtimeLLMConfiguration,
        GoogleRealtimeLLMConfiguration,
        GoogleVertexRealtimeLLMConfiguration,
+        AzureRealtimeLLMConfiguration,
    ],
    Field(discriminator="provider"),
 ]
@ -1024,6 +1080,46 @@ class MiniMaxTTSConfiguration(BaseTTSConfiguration):
    )


+@register_tts
+class AzureSpeechTTSConfiguration(BaseTTSConfiguration):
+    model_config = AZURE_SPEECH_PROVIDER_MODEL_CONFIG
+    provider: Literal[ServiceProviders.AZURE_SPEECH] = ServiceProviders.AZURE_SPEECH
+    model: str = Field(
+        default="neural",
+        description="Azure Speech synthesis engine (neural voices only).",
+        json_schema_extra={"examples": ["neural"]},
+    )
+    region: str = Field(
+        default="eastus",
+        description="Azure region for Speech Services (e.g. 'eastus', 'westeurope').",
+        json_schema_extra={
+            "examples": AZURE_SPEECH_REGIONS,
+        },
+    )
+    voice: str = Field(
+        default="en-US-AriaNeural",
+        description="Azure Neural voice name (e.g. 'en-US-AriaNeural').",
+        json_schema_extra={
+            "examples": AZURE_SPEECH_TTS_VOICES,
+            "allow_custom_input": True,
+        },
+    )
+    language: str = Field(
+        default="en-US",
+        description="BCP-47 language code for synthesis.",
+        json_schema_extra={
+            "examples": AZURE_SPEECH_TTS_LANGUAGES,
+            "allow_custom_input": True,
+        },
+    )
+    speed: float = Field(
+        default=1.0,
+        ge=0.5,
+        le=2.0,
+        description="Speech speed multiplier (0.5 to 2.0).",
+    )
+
+
 TTSConfig = Annotated[
    Union[
        DeepgramTTSConfiguration,
@ -1037,6 +1133,7 @@ TTSConfig = Annotated[
        RimeTTSConfiguration,
        SpeachesTTSConfiguration,
        MiniMaxTTSConfiguration,
+        AzureSpeechTTSConfiguration,
    ],
    Field(discriminator="provider"),
 ]
@ -1273,6 +1370,32 @@ class GladiaSTTConfiguration(BaseSTTConfiguration):
    )


+@register_stt
+class AzureSpeechSTTConfiguration(BaseSTTConfiguration):
+    model_config = AZURE_SPEECH_PROVIDER_MODEL_CONFIG
+    provider: Literal[ServiceProviders.AZURE_SPEECH] = ServiceProviders.AZURE_SPEECH
+    model: str = Field(
+        default="latest_long",
+        description="Azure Speech recognition model (use 'latest_long' for continuous recognition).",
+        json_schema_extra={"examples": ["latest_long", "latest_short"]},
+    )
+    region: str = Field(
+        default="eastus",
+        description="Azure region for Speech Services (e.g. 'eastus', 'westeurope').",
+        json_schema_extra={
+            "examples": AZURE_SPEECH_REGIONS,
+        },
+    )
+    language: str = Field(
+        default="en-US",
+        description="BCP-47 language code for recognition.",
+        json_schema_extra={
+            "examples": AZURE_SPEECH_STT_LANGUAGES,
+            "allow_custom_input": True,
+        },
+    )
+
+
 STTConfig = Annotated[
    Union[
        DeepgramSTTConfiguration,
@ -1285,6 +1408,7 @@ STTConfig = Annotated[
        SpeachesSTTConfiguration,
        AssemblyAISTTConfiguration,
        GladiaSTTConfiguration,
+        AzureSpeechSTTConfiguration,
    ],
    Field(discriminator="provider"),
 ]
@ -1324,8 +1448,36 @@ class OpenRouterEmbeddingsConfiguration(BaseEmbeddingsConfiguration):
    )


+@register_embeddings
+class AzureOpenAIEmbeddingsConfiguration(BaseEmbeddingsConfiguration):
+    model_config = AZURE_OPENAI_PROVIDER_MODEL_CONFIG
+    provider: Literal[ServiceProviders.AZURE] = ServiceProviders.AZURE
+    model: str = Field(
+        default="text-embedding-3-small",
+        description=(
+            "Azure OpenAI embedding deployment name. The deployment must return "
+            "1536-dimensional embeddings."
+        ),
+        json_schema_extra={
+            "examples": AZURE_EMBEDDING_MODELS,
+            "allow_custom_input": True,
+        },
+    )
+    endpoint: str = Field(
+        description="Azure OpenAI resource endpoint (e.g. https://<resource>.openai.azure.com).",
+    )
+    api_version: str = Field(
+        default="2024-02-15-preview",
+        description="Azure OpenAI API version for embeddings.",
+    )
+
+
 EmbeddingsConfig = Annotated[
-    Union[OpenAIEmbeddingsConfiguration, OpenRouterEmbeddingsConfiguration],
+    Union[
+        OpenAIEmbeddingsConfiguration,
+        OpenRouterEmbeddingsConfiguration,
+        AzureOpenAIEmbeddingsConfiguration,
+    ],
    Field(discriminator="provider"),
 ]