diff --git a/api/routes/knowledge_base.py b/api/routes/knowledge_base.py index bd47c4c..5bf4b0a 100644 --- a/api/routes/knowledge_base.py +++ b/api/routes/knowledge_base.py @@ -370,7 +370,10 @@ async def search_chunks( try: # Import here to avoid circular dependency from api.services.configuration.registry import ServiceProviders - from api.services.gen_ai import AzureOpenAIEmbeddingService, OpenAIEmbeddingService + from api.services.gen_ai import ( + AzureOpenAIEmbeddingService, + OpenAIEmbeddingService, + ) # Try to get user's embeddings configuration user_config = await db_client.get_user_configurations(user.id) @@ -385,7 +388,9 @@ async def search_chunks( embeddings_model = user_config.embeddings.model embeddings_provider = getattr(user_config.embeddings, "provider", None) embeddings_endpoint = getattr(user_config.embeddings, "endpoint", None) - embeddings_api_version = getattr(user_config.embeddings, "api_version", None) + embeddings_api_version = getattr( + user_config.embeddings, "api_version", None + ) # Initialize embedding service based on provider if embeddings_provider == ServiceProviders.AZURE.value and embeddings_endpoint: diff --git a/api/services/configuration/check_validity.py b/api/services/configuration/check_validity.py index 3a76147..0e4da86 100644 --- a/api/services/configuration/check_validity.py +++ b/api/services/configuration/check_validity.py @@ -41,6 +41,7 @@ class UserConfigurationValidator: ServiceProviders.ELEVENLABS.value: self._validate_elevenlabs_api_key, ServiceProviders.GOOGLE.value: self._check_google_api_key, ServiceProviders.AZURE.value: self._check_azure_api_key, + ServiceProviders.AZURE_SPEECH.value: self._check_azure_speech_api_key, ServiceProviders.CARTESIA.value: self._check_cartesia_api_key, ServiceProviders.DOGRAH.value: self._check_dograh_api_key, ServiceProviders.SARVAM.value: self._check_sarvam_api_key, @@ -54,6 +55,7 @@ class UserConfigurationValidator: ServiceProviders.ULTRAVOX_REALTIME.value: self._check_ultravox_realtime_api_key, ServiceProviders.GOOGLE_REALTIME.value: self._check_google_api_key, ServiceProviders.GOOGLE_VERTEX_REALTIME.value: self._check_google_vertex_realtime_api_key, + ServiceProviders.AZURE_REALTIME.value: self._check_azure_realtime_api_key, ServiceProviders.ASSEMBLYAI.value: self._check_assemblyai_api_key, ServiceProviders.GLADIA.value: self._check_gladia_api_key, ServiceProviders.RIME.value: self._check_rime_api_key, @@ -313,6 +315,12 @@ class UserConfigurationValidator: def _check_azure_api_key(self, model: str, api_key: str) -> bool: return True + def _check_azure_speech_api_key(self, model: str, api_key: str) -> bool: + return True + + def _check_azure_realtime_api_key(self, model: str, api_key: str) -> bool: + return True + def _check_cartesia_api_key(self, model: str, api_key: str) -> bool: return True diff --git a/api/services/configuration/options/__init__.py b/api/services/configuration/options/__init__.py index acc088c..1e3294a 100644 --- a/api/services/configuration/options/__init__.py +++ b/api/services/configuration/options/__init__.py @@ -1,3 +1,14 @@ +from .azure import ( + AZURE_EMBEDDING_MODELS, + AZURE_MODELS, + AZURE_REALTIME_API_VERSIONS, + AZURE_REALTIME_MODELS, + AZURE_REALTIME_VOICES, + AZURE_SPEECH_REGIONS, + AZURE_SPEECH_STT_LANGUAGES, + AZURE_SPEECH_TTS_LANGUAGES, + AZURE_SPEECH_TTS_VOICES, +) from .deepgram import DEEPGRAM_LANGUAGES, DEEPGRAM_STT_MODELS from .gladia import GLADIA_STT_LANGUAGES, GLADIA_STT_MODELS from .google import ( @@ -27,6 +38,15 @@ from .sarvam import ( from .speechmatics import SPEECHMATICS_STT_LANGUAGES __all__ = [ + "AZURE_EMBEDDING_MODELS", + "AZURE_MODELS", + "AZURE_REALTIME_API_VERSIONS", + "AZURE_REALTIME_MODELS", + "AZURE_REALTIME_VOICES", + "AZURE_SPEECH_REGIONS", + "AZURE_SPEECH_STT_LANGUAGES", + "AZURE_SPEECH_TTS_LANGUAGES", + "AZURE_SPEECH_TTS_VOICES", "DEEPGRAM_LANGUAGES", "DEEPGRAM_STT_MODELS", "GLADIA_STT_LANGUAGES", diff --git a/api/services/configuration/options/azure.py b/api/services/configuration/options/azure.py new file mode 100644 index 0000000..d80282b --- /dev/null +++ b/api/services/configuration/options/azure.py @@ -0,0 +1,125 @@ +AZURE_MODELS = ["gpt-4.1-mini"] + +AZURE_REALTIME_MODELS = ["gpt-4o-realtime-preview"] +AZURE_REALTIME_VOICES = [ + "alloy", + "ash", + "ballad", + "coral", + "echo", + "sage", + "shimmer", + "verse", +] +AZURE_REALTIME_API_VERSIONS = [ + "2025-04-01-preview", + "2024-10-01-preview", + "2024-12-17", +] + +AZURE_SPEECH_REGIONS = [ + "eastus", + "eastus2", + "westus", + "westus2", + "westus3", + "centralus", + "northcentralus", + "southcentralus", + "westcentralus", + "westeurope", + "northeurope", + "uksouth", + "ukwest", + "francecentral", + "switzerlandnorth", + "germanywestcentral", + "norwayeast", + "australiaeast", + "eastasia", + "southeastasia", + "japaneast", + "japanwest", + "koreacentral", + "centralindia", + "southindia", + "brazilsouth", +] + +AZURE_SPEECH_TTS_LANGUAGES = [ + "en-US", + "en-GB", + "en-AU", + "en-CA", + "en-IN", + "es-ES", + "es-MX", + "fr-FR", + "fr-CA", + "de-DE", + "it-IT", + "ja-JP", + "ko-KR", + "zh-CN", + "zh-HK", + "zh-TW", + "pt-BR", + "pt-PT", + "ru-RU", + "ar-SA", + "nl-NL", + "pl-PL", + "sv-SE", + "hi-IN", +] + +AZURE_SPEECH_TTS_VOICES = [ + "en-US-AriaNeural", + "en-US-GuyNeural", + "en-US-JennyNeural", + "en-US-DavisNeural", + "en-US-AmberNeural", + "en-US-AnaNeural", + "en-US-AshleyNeural", + "en-US-BrandonNeural", + "en-US-ChristopherNeural", + "en-US-ElizabethNeural", + "en-US-EricNeural", + "en-US-JacobNeural", + "en-US-MichelleNeural", + "en-US-MonicaNeural", + "en-US-NancyNeural", + "en-US-RogerNeural", + "en-US-SaraNeural", + "en-US-SteffanNeural", + "en-US-TonyNeural", +] + +AZURE_SPEECH_STT_LANGUAGES = [ + "en-US", + "en-GB", + "en-AU", + "en-CA", + "en-IN", + "es-ES", + "es-MX", + "fr-FR", + "fr-CA", + "de-DE", + "it-IT", + "ja-JP", + "ko-KR", + "zh-CN", + "pt-BR", + "pt-PT", + "ru-RU", + "ar-SA", + "nl-NL", + "pl-PL", + "hi-IN", +] + +AZURE_EMBEDDING_MODELS = [ + "text-embedding-3-small", + "text-embedding-ada-002", +] diff --git a/api/services/configuration/registry.py b/api/services/configuration/registry.py index c3cebcf..f05c5f7 100644 --- a/api/services/configuration/registry.py +++ b/api/services/configuration/registry.py @@ -5,6 +5,15 @@ from typing import Annotated, Dict, Literal, Type, TypeVar, Union from pydantic import BaseModel, ConfigDict, Field, computed_field, field_validator from api.services.configuration.options import ( + AZURE_EMBEDDING_MODELS, + AZURE_MODELS, + AZURE_REALTIME_API_VERSIONS, + AZURE_REALTIME_MODELS, + AZURE_REALTIME_VOICES, + AZURE_SPEECH_REGIONS, + AZURE_SPEECH_STT_LANGUAGES, + AZURE_SPEECH_TTS_LANGUAGES, + AZURE_SPEECH_TTS_VOICES, DEEPGRAM_LANGUAGES, DEEPGRAM_STT_MODELS, GLADIA_STT_LANGUAGES, @@ -286,7 +295,6 @@ OPENROUTER_MODELS = [ "meta-llama/llama-3.3-70b-instruct", "deepseek/deepseek-chat-v3-0324", ] -AZURE_MODELS = ["gpt-4.1-mini"] DOGRAH_LLM_MODELS = ["default", "accurate", "fast", "lite", "zen"] AWS_BEDROCK_MODELS = [ "us.amazon.nova-pro-v1:0", @@ -680,24 +688,6 @@ class GoogleVertexRealtimeLLMConfiguration(BaseLLMConfiguration): ) -AZURE_REALTIME_MODELS = ["gpt-4o-realtime-preview"] -AZURE_REALTIME_VOICES = [ - "alloy", - "ash", - "ballad", - "coral", - "echo", - "sage", - "shimmer", - "verse", -] -AZURE_REALTIME_API_VERSIONS = [ - "2025-04-01-preview", - "2024-10-01-preview", - "2024-12-17", -] - - @register_service(ServiceType.REALTIME) class AzureRealtimeLLMConfiguration(BaseLLMConfiguration): model_config = AZURE_REALTIME_PROVIDER_MODEL_CONFIG @@ -1090,76 +1080,6 @@ class MiniMaxTTSConfiguration(BaseTTSConfiguration): ) -AZURE_SPEECH_REGIONS = [ - "eastus", - "eastus2", - "westus", - "westus2", - "westus3", - "centralus", - "northcentralus", - "southcentralus", - "westcentralus", - "westeurope", - "northeurope", - "uksouth", - "ukwest", - "francecentral", - "switzerlandnorth", - "germanywestcentral", - "norwayeast", - "australiaeast", - "eastasia", - "southeastasia", - "japaneast", - "japanwest", - "koreacentral", - "centralindia", - "southindia", - "brazilsouth", -] - -AZURE_SPEECH_TTS_LANGUAGES = [ - "en-US", "en-GB", "en-AU", "en-CA", "en-IN", - "es-ES", "es-MX", - "fr-FR", "fr-CA", - "de-DE", - "it-IT", - "ja-JP", - "ko-KR", - "zh-CN", "zh-HK", "zh-TW", - "pt-BR", "pt-PT", - "ru-RU", - "ar-SA", - "nl-NL", - "pl-PL", - "sv-SE", - "hi-IN", -] - -AZURE_SPEECH_TTS_VOICES = [ - "en-US-AriaNeural", - "en-US-GuyNeural", - "en-US-JennyNeural", - "en-US-DavisNeural", - "en-US-AmberNeural", - "en-US-AnaNeural", - "en-US-AshleyNeural", - "en-US-BrandonNeural", - "en-US-ChristopherNeural", - "en-US-ElizabethNeural", - "en-US-EricNeural", - "en-US-JacobNeural", - "en-US-MichelleNeural", - "en-US-MonicaNeural", - "en-US-NancyNeural", - "en-US-RogerNeural", - "en-US-SaraNeural", - "en-US-SteffanNeural", - "en-US-TonyNeural", -] - - @register_tts class AzureSpeechTTSConfiguration(BaseTTSConfiguration): model_config = AZURE_SPEECH_PROVIDER_MODEL_CONFIG @@ -1450,24 +1370,6 @@ class GladiaSTTConfiguration(BaseSTTConfiguration): ) -AZURE_SPEECH_STT_LANGUAGES = [ - "en-US", "en-GB", "en-AU", "en-CA", "en-IN", - "es-ES", "es-MX", - "fr-FR", "fr-CA", - "de-DE", - "it-IT", - "ja-JP", - "ko-KR", - "zh-CN", - "pt-BR", "pt-PT", - "ru-RU", - "ar-SA", - "nl-NL", - "pl-PL", - "hi-IN", -] - - @register_stt class AzureSpeechSTTConfiguration(BaseSTTConfiguration): model_config = AZURE_SPEECH_PROVIDER_MODEL_CONFIG @@ -1546,17 +1448,20 @@ class OpenRouterEmbeddingsConfiguration(BaseEmbeddingsConfiguration): ) -AZURE_EMBEDDING_MODELS = ["text-embedding-3-small", "text-embedding-3-large", "text-embedding-ada-002"] - - @register_embeddings class AzureOpenAIEmbeddingsConfiguration(BaseEmbeddingsConfiguration): model_config = AZURE_OPENAI_PROVIDER_MODEL_CONFIG provider: Literal[ServiceProviders.AZURE] = ServiceProviders.AZURE model: str = Field( default="text-embedding-3-small", - description="Azure OpenAI embedding deployment name (must match the deployed model).", - json_schema_extra={"examples": AZURE_EMBEDDING_MODELS, "allow_custom_input": True}, + description=( + "Azure OpenAI embedding deployment name. The deployment must return " + "1536-dimensional embeddings." + ), + json_schema_extra={ + "examples": AZURE_EMBEDDING_MODELS, + "allow_custom_input": True, + }, ) endpoint: str = Field( description="Azure OpenAI resource endpoint (e.g. https://.openai.azure.com).", diff --git a/api/services/gen_ai/embedding/__init__.py b/api/services/gen_ai/embedding/__init__.py index 0632ad1..40a04bd 100644 --- a/api/services/gen_ai/embedding/__init__.py +++ b/api/services/gen_ai/embedding/__init__.py @@ -1,6 +1,9 @@ """Embedding services for document processing and retrieval.""" -from .azure_openai_service import AzureEmbeddingAPIKeyNotConfiguredError, AzureOpenAIEmbeddingService +from .azure_openai_service import ( + AzureEmbeddingAPIKeyNotConfiguredError, + AzureOpenAIEmbeddingService, +) from .base import BaseEmbeddingService from .openai_service import EmbeddingAPIKeyNotConfiguredError, OpenAIEmbeddingService diff --git a/api/services/gen_ai/embedding/azure_openai_service.py b/api/services/gen_ai/embedding/azure_openai_service.py index ca2c759..dddb785 100644 --- a/api/services/gen_ai/embedding/azure_openai_service.py +++ b/api/services/gen_ai/embedding/azure_openai_service.py @@ -1,8 +1,8 @@ """Azure OpenAI embedding service. Uses the Azure OpenAI REST API for text embeddings, compatible with -text-embedding-3-small, text-embedding-3-large, and text-embedding-ada-002 -deployments. +1536-dimensional embedding deployments such as text-embedding-3-small and +text-embedding-ada-002. """ from typing import Any, Dict, List, Optional @@ -89,11 +89,23 @@ class AzureOpenAIEmbeddingService(BaseEmbeddingService): input=texts, model=self.model_id, ) - return [item.embedding for item in response.data] + embeddings = [item.embedding for item in response.data] + self._validate_embedding_dimensions(embeddings) + return embeddings except Exception as e: logger.error(f"Error generating Azure OpenAI embeddings: {e}") raise + def _validate_embedding_dimensions(self, embeddings: List[List[float]]) -> None: + for embedding in embeddings: + if len(embedding) != EMBEDDING_DIMENSION: + raise ValueError( + "Azure OpenAI embedding deployment " + f"{self.model_id!r} returned {len(embedding)} dimensions; " + "Dograh knowledge base storage currently supports " + f"{EMBEDDING_DIMENSION}-dimensional embeddings." + ) + async def embed_query(self, query: str) -> List[float]: """Embed a single query text using Azure OpenAI API.""" self._ensure_configured() diff --git a/api/services/pipecat/service_factory.py b/api/services/pipecat/service_factory.py index 313a8ef..8ed96e4 100644 --- a/api/services/pipecat/service_factory.py +++ b/api/services/pipecat/service_factory.py @@ -1,4 +1,5 @@ from typing import TYPE_CHECKING +from urllib.parse import urlencode, urlparse, urlunparse import aiohttp from fastapi import HTTPException @@ -269,11 +270,10 @@ def create_stt_service( language_code = getattr(user_config.stt, "language", None) or "en-US" region = getattr(user_config.stt, "region", None) or "eastus" - # Try to map BCP-47 string to pipecat Language enum; fall back to string try: pipecat_language = PipecatLanguage(language_code) except ValueError: - pipecat_language = PipecatLanguage.EN_US + pipecat_language = language_code return AzureSTTService( api_key=user_config.stt.api_key, region=region, @@ -806,13 +806,27 @@ def create_realtime_llm_service(user_config, audio_config: "AudioConfig"): ) endpoint = getattr(realtime_config, "endpoint", None) or "" - api_version = getattr(realtime_config, "api_version", None) or "2025-04-01-preview" + if not endpoint: + raise HTTPException( + status_code=400, + detail="Azure Realtime requires an endpoint.", + ) + _validate_runtime_service_url(endpoint, "endpoint") + api_version = ( + getattr(realtime_config, "api_version", None) or "2025-04-01-preview" + ) # Construct the Azure Realtime WebSocket URL # https://.openai.azure.com/openai/realtime?api-version=&deployment= - base_host = endpoint.rstrip("/").replace("https://", "").replace("http://", "") - wss_url = ( - f"wss://{base_host}/openai/realtime" - f"?api-version={api_version}&deployment={model}" + parsed_endpoint = urlparse(endpoint) + wss_url = urlunparse( + ( + "wss", + parsed_endpoint.netloc, + "/openai/realtime", + "", + urlencode({"api-version": api_version, "deployment": model}), + "", + ) ) return DograhAzureRealtimeLLMService( api_key=api_key, diff --git a/api/services/workflow/tools/knowledge_base.py b/api/services/workflow/tools/knowledge_base.py index b3fbda9..6ce8f8c 100644 --- a/api/services/workflow/tools/knowledge_base.py +++ b/api/services/workflow/tools/knowledge_base.py @@ -260,7 +260,10 @@ async def _perform_retrieval( "Model Configurations > Embedding." ) - if embeddings_provider == ServiceProviders.AZURE.value and embeddings_endpoint: + if ( + embeddings_provider == ServiceProviders.AZURE.value + and embeddings_endpoint + ): embedding_service = AzureOpenAIEmbeddingService( db_client=db_client, api_key=embeddings_api_key, diff --git a/api/tasks/knowledge_base_processing.py b/api/tasks/knowledge_base_processing.py index 2066f1d..4e94329 100644 --- a/api/tasks/knowledge_base_processing.py +++ b/api/tasks/knowledge_base_processing.py @@ -164,7 +164,9 @@ async def process_knowledge_base_document( embeddings_model = user_config.embeddings.model embeddings_base_url = getattr(user_config.embeddings, "base_url", None) embeddings_endpoint = getattr(user_config.embeddings, "endpoint", None) - embeddings_api_version = getattr(user_config.embeddings, "api_version", None) + embeddings_api_version = getattr( + user_config.embeddings, "api_version", None + ) logger.info( f"Using user embeddings config: provider={embeddings_provider}, " f"model={embeddings_model}" diff --git a/api/tests/test_azure_speech_service_factory.py b/api/tests/test_azure_speech_service_factory.py index fe958e5..26739a7 100644 --- a/api/tests/test_azure_speech_service_factory.py +++ b/api/tests/test_azure_speech_service_factory.py @@ -3,8 +3,24 @@ from types import SimpleNamespace from unittest.mock import patch -from api.services.configuration.registry import ServiceProviders -from api.services.pipecat.service_factory import create_stt_service, create_tts_service +import pytest +from fastapi import HTTPException + +from api.services.configuration.check_validity import UserConfigurationValidator +from api.services.configuration.registry import ( + AzureRealtimeLLMConfiguration, + AzureSpeechSTTConfiguration, + AzureSpeechTTSConfiguration, + ServiceProviders, +) +from api.services.gen_ai.embedding.azure_openai_service import ( + AzureOpenAIEmbeddingService, +) +from api.services.pipecat.service_factory import ( + create_realtime_llm_service, + create_stt_service, + create_tts_service, +) def _audio_config(): @@ -79,3 +95,88 @@ def test_create_azure_speech_stt_service(): assert kwargs["api_key"] == "test-subscription-key" assert kwargs["region"] == "eastus" assert kwargs["sample_rate"] == 16000 + + +def test_create_azure_speech_stt_service_preserves_custom_language(): + user_config = SimpleNamespace( + stt=SimpleNamespace( + provider=ServiceProviders.AZURE_SPEECH.value, + api_key="test-subscription-key", + region="eastus", + language="custom-locale", + model="latest_long", + ) + ) + + with patch("api.services.pipecat.service_factory.AzureSTTService") as mock_service: + create_stt_service(user_config, _audio_config()) + + kwargs = mock_service.call_args.kwargs + assert kwargs["settings"].language == "custom-locale" + + +def test_validator_accepts_azure_speech_services(): + validator = UserConfigurationValidator() + + assert ( + validator._validate_service( + AzureSpeechTTSConfiguration(api_key="test-key"), + "tts", + ) + == [] + ) + assert ( + validator._validate_service( + AzureSpeechSTTConfiguration(api_key="test-key"), + "stt", + ) + == [] + ) + + +def test_validator_accepts_azure_realtime_service(monkeypatch): + monkeypatch.setattr("api.utils.url_security.DEPLOYMENT_MODE", "oss") + validator = UserConfigurationValidator() + + assert ( + validator._validate_service( + AzureRealtimeLLMConfiguration( + api_key="test-key", + endpoint="https://example.openai.azure.com", + ), + "realtime", + ) + == [] + ) + + +def test_create_azure_realtime_blocks_private_endpoint_in_saas(monkeypatch): + monkeypatch.setattr("api.utils.url_security.DEPLOYMENT_MODE", "saas") + user_config = SimpleNamespace( + realtime=SimpleNamespace( + provider=ServiceProviders.AZURE_REALTIME.value, + api_key="test-key", + endpoint="http://10.0.0.10", + api_version="2025-04-01-preview", + model="gpt-4o-realtime-preview", + voice="alloy", + ) + ) + + with pytest.raises(HTTPException) as exc_info: + create_realtime_llm_service(user_config, _audio_config()) + + assert exc_info.value.status_code == 400 + assert "public IP" in exc_info.value.detail + + +def test_azure_embedding_service_rejects_wrong_dimension(): + service = AzureOpenAIEmbeddingService( + db_client=SimpleNamespace(), + api_key=None, + endpoint=None, + model_id="text-embedding-3-large", + ) + + with pytest.raises(ValueError, match="1536-dimensional"): + service._validate_embedding_dimensions([[0.0] * 3072])