feat: add gemini realtime and speaches integration

- Add Gemini realtime support
- Add Speaches support for locally hosted LLMs
2026-06-22 08:38:13 +02:00 · 2026-03-31 17:39:47 +05:30 · 2026-03-31 17:39:47 +05:30 · ee2028eb2d
commit ee2028eb2d
parent 2eaaabd936
19 changed files with 531 additions and 185 deletions
--- a/api/services/configuration/check_validity.py
+++ b/api/services/configuration/check_validity.py
@ -47,6 +47,8 @@ class UserConfigurationValidator:
            ServiceProviders.CAMB.value: self._check_camb_api_key,
            ServiceProviders.AWS_BEDROCK.value: self._check_aws_bedrock_api_key,
            ServiceProviders.SPEACHES.value: self._check_speaches_api_key,
+            ServiceProviders.OPENAI_REALTIME.value: self._check_openai_api_key,
+            ServiceProviders.GOOGLE_REALTIME.value: self._check_google_api_key,
        }

    async def validate(
@ -70,6 +72,10 @@ class UserConfigurationValidator:
                configuration.embeddings, "embeddings", required=False
            )
        )
+        # Realtime is optional - only validate if configured
+        status_list.extend(
+            self._validate_service(configuration.realtime, "realtime", required=False)
+        )

        if status_list:
            raise ValueError(status_list)
--- a/api/services/configuration/masking.py
+++ b/api/services/configuration/masking.py
@ -29,7 +29,7 @@ def contains_masked_key(api_key: str | list[str] | None) -> bool:

 def check_for_masked_keys(config: "UserConfiguration") -> None:
    """Raise ValueError if any service in *config* still has a masked API key."""
-    for field in ("llm", "tts", "stt", "embeddings"):
+    for field in ("llm", "tts", "stt", "embeddings", "realtime"):
        service = getattr(config, field, None)
        if service is None:
            continue
@ -121,6 +121,8 @@ def mask_user_config(config: UserConfiguration) -> Dict[str, Any]:
        "tts": _mask_service(config.tts),
        "stt": _mask_service(config.stt),
        "embeddings": _mask_service(config.embeddings),
+        "realtime": _mask_service(config.realtime),
+        "is_realtime": config.is_realtime,
        "test_phone_number": config.test_phone_number,
        "timezone": config.timezone,
    }
--- a/api/services/configuration/merge.py
+++ b/api/services/configuration/merge.py
@ -9,7 +9,7 @@ from typing import Dict
 from api.schemas.user_configuration import UserConfiguration
 from api.services.configuration.masking import resolve_masked_api_keys

-SERVICE_FIELDS = ("llm", "tts", "stt", "embeddings")
+SERVICE_FIELDS = ("llm", "tts", "stt", "embeddings", "realtime")


 def merge_user_configurations(
@ -64,6 +64,9 @@ def merge_user_configurations(
        _merge_service_block(service)

    # other simple fields
+    if "is_realtime" in incoming_partial:
+        merged["is_realtime"] = incoming_partial["is_realtime"]
+
    if "test_phone_number" in incoming_partial:
        merged["test_phone_number"] = incoming_partial["test_phone_number"]

--- a/api/services/configuration/registry.py
+++ b/api/services/configuration/registry.py
@ -10,6 +10,7 @@ class ServiceType(Enum):
    TTS = auto()
    STT = auto()
    EMBEDDINGS = auto()
+    REALTIME = auto()


 class ServiceProviders(str, Enum):
@ -28,6 +29,8 @@ class ServiceProviders(str, Enum):
    CAMB = "camb"
    AWS_BEDROCK = "aws_bedrock"
    SPEACHES = "speaches"
+    OPENAI_REALTIME = "openai_realtime"
+    GOOGLE_REALTIME = "google_realtime"


 class BaseServiceConfiguration(BaseModel):
@ -42,6 +45,8 @@ class BaseServiceConfiguration(BaseModel):
        ServiceProviders.DOGRAH,
        ServiceProviders.AWS_BEDROCK,
        ServiceProviders.SPEACHES,
+        ServiceProviders.OPENAI_REALTIME,
+        ServiceProviders.GOOGLE_REALTIME,
        # ServiceProviders.SARVAM,
    ]
    api_key: str | list[str]
@ -97,6 +102,7 @@ REGISTRY: Dict[ServiceType, Dict[str, Type[BaseServiceConfiguration]]] = {
    ServiceType.TTS: {},
    ServiceType.STT: {},
    ServiceType.EMBEDDINGS: {},
+    ServiceType.REALTIME: {},
 }

 T = TypeVar("T", bound=BaseServiceConfiguration)
@ -279,6 +285,68 @@ class SpeachesLLMConfiguration(BaseLLMConfiguration):
    api_key: str | list[str] | None = Field(default=None)


+OPENAI_REALTIME_MODELS = ["gpt-4o-realtime-preview", "gpt-4o-mini-realtime-preview"]
+OPENAI_REALTIME_VOICES = [
+    "alloy",
+    "ash",
+    "ballad",
+    "coral",
+    "echo",
+    "sage",
+    "shimmer",
+    "verse",
+]
+
+
+# @register_service(ServiceType.REALTIME)
+# class OpenAIRealtimeLLMConfiguration(BaseLLMConfiguration):
+#     provider: Literal[ServiceProviders.OPENAI_REALTIME] = (
+#         ServiceProviders.OPENAI_REALTIME
+#     )
+#     model: str = Field(
+#         default="gpt-4o-realtime-preview",
+#         json_schema_extra={
+#             "examples": OPENAI_REALTIME_MODELS,
+#             "allow_custom_input": True,
+#         },
+#     )
+#     voice: str = Field(
+#         default="alloy",
+#         json_schema_extra={"examples": OPENAI_REALTIME_VOICES},
+#     )
+
+
+GOOGLE_REALTIME_MODELS = ["gemini-3.1-flash-live-preview"]
+GOOGLE_REALTIME_VOICES = ["Puck", "Charon", "Kore", "Fenrir", "Aoede"]
+
+
+@register_service(ServiceType.REALTIME)
+class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration):
+    provider: Literal[ServiceProviders.GOOGLE_REALTIME] = (
+        ServiceProviders.GOOGLE_REALTIME
+    )
+    model: str = Field(
+        default="gemini-2.0-flash-live-001",
+        json_schema_extra={
+            "examples": GOOGLE_REALTIME_MODELS,
+            "allow_custom_input": True,
+        },
+    )
+    voice: str = Field(
+        default="Puck",
+        json_schema_extra={
+            "examples": GOOGLE_REALTIME_VOICES,
+            "allow_custom_input": True,
+        },
+    )
+
+
+REALTIME_PROVIDERS = {
+    ServiceProviders.OPENAI_REALTIME.value,
+    ServiceProviders.GOOGLE_REALTIME.value,
+}
+
+
 LLMConfig = Annotated[
    Union[
        OpenAILLMService,
@ -293,6 +361,14 @@ LLMConfig = Annotated[
    Field(discriminator="provider"),
 ]

+RealtimeConfig = Annotated[
+    Union[
+        # OpenAIRealtimeLLMConfiguration,
+        GoogleRealtimeLLMConfiguration,
+    ],
+    Field(discriminator="provider"),
+]
+
 ###################################################### TTS ########################################################################


@ -719,6 +795,7 @@ SPEACHES_STT_MODELS = [
    "Systran/faster-distil-whisper-small.en",
    "Systran/faster-whisper-large-v3",
 ]
+SPEACHES_STT_LANGUAGES = ["en", "ar", "nl", "fr", "de", "hi", "it", "pt", "es"]


@register_stt
@ -731,6 +808,13 @@ class SpeachesSTTConfiguration(BaseSTTConfiguration):
            "allow_custom_input": True,
        },
    )
+    language: str = Field(
+        default="en",
+        json_schema_extra={
+            "examples": SPEACHES_STT_LANGUAGES,
+            "allow_custom_input": True,
+        },
+    )
    base_url: str = Field(
        default="http://localhost:8000/v1",
        description="OpenAI-compatible STT endpoint (Speaches, etc.)",
@ -785,6 +869,6 @@ EmbeddingsConfig = Annotated[
 ]

 ServiceConfig = Annotated[
-    Union[LLMConfig, TTSConfig, STTConfig, EmbeddingsConfig],
+    Union[LLMConfig, RealtimeConfig, TTSConfig, STTConfig, EmbeddingsConfig],
    Field(discriminator="provider"),
 ]