feat: add Gemini realtime and Speaches integration

- Add Gemini realtime support
- Add Speaches support for locally hosted LLMs
This commit is contained in:
Abhishek Kumar 2026-03-31 17:39:47 +05:30
parent 2eaaabd936
commit ee2028eb2d
19 changed files with 531 additions and 185 deletions

View file

@ -47,6 +47,8 @@ class UserConfigurationValidator:
ServiceProviders.CAMB.value: self._check_camb_api_key,
ServiceProviders.AWS_BEDROCK.value: self._check_aws_bedrock_api_key,
ServiceProviders.SPEACHES.value: self._check_speaches_api_key,
ServiceProviders.OPENAI_REALTIME.value: self._check_openai_api_key,
ServiceProviders.GOOGLE_REALTIME.value: self._check_google_api_key,
}
async def validate(
@ -70,6 +72,10 @@ class UserConfigurationValidator:
configuration.embeddings, "embeddings", required=False
)
)
# Realtime is optional - only validate if configured
status_list.extend(
self._validate_service(configuration.realtime, "realtime", required=False)
)
if status_list:
raise ValueError(status_list)

View file

@ -29,7 +29,7 @@ def contains_masked_key(api_key: str | list[str] | None) -> bool:
def check_for_masked_keys(config: "UserConfiguration") -> None:
"""Raise ValueError if any service in *config* still has a masked API key."""
for field in ("llm", "tts", "stt", "embeddings"):
for field in ("llm", "tts", "stt", "embeddings", "realtime"):
service = getattr(config, field, None)
if service is None:
continue
@ -121,6 +121,8 @@ def mask_user_config(config: UserConfiguration) -> Dict[str, Any]:
"tts": _mask_service(config.tts),
"stt": _mask_service(config.stt),
"embeddings": _mask_service(config.embeddings),
"realtime": _mask_service(config.realtime),
"is_realtime": config.is_realtime,
"test_phone_number": config.test_phone_number,
"timezone": config.timezone,
}

View file

@ -9,7 +9,7 @@ from typing import Dict
from api.schemas.user_configuration import UserConfiguration
from api.services.configuration.masking import resolve_masked_api_keys
SERVICE_FIELDS = ("llm", "tts", "stt", "embeddings")
SERVICE_FIELDS = ("llm", "tts", "stt", "embeddings", "realtime")
def merge_user_configurations(
@ -64,6 +64,9 @@ def merge_user_configurations(
_merge_service_block(service)
# other simple fields
if "is_realtime" in incoming_partial:
merged["is_realtime"] = incoming_partial["is_realtime"]
if "test_phone_number" in incoming_partial:
merged["test_phone_number"] = incoming_partial["test_phone_number"]

View file

@ -10,6 +10,7 @@ class ServiceType(Enum):
TTS = auto()
STT = auto()
EMBEDDINGS = auto()
REALTIME = auto()
class ServiceProviders(str, Enum):
@ -28,6 +29,8 @@ class ServiceProviders(str, Enum):
CAMB = "camb"
AWS_BEDROCK = "aws_bedrock"
SPEACHES = "speaches"
OPENAI_REALTIME = "openai_realtime"
GOOGLE_REALTIME = "google_realtime"
class BaseServiceConfiguration(BaseModel):
@ -42,6 +45,8 @@ class BaseServiceConfiguration(BaseModel):
ServiceProviders.DOGRAH,
ServiceProviders.AWS_BEDROCK,
ServiceProviders.SPEACHES,
ServiceProviders.OPENAI_REALTIME,
ServiceProviders.GOOGLE_REALTIME,
# ServiceProviders.SARVAM,
]
api_key: str | list[str]
@ -97,6 +102,7 @@ REGISTRY: Dict[ServiceType, Dict[str, Type[BaseServiceConfiguration]]] = {
ServiceType.TTS: {},
ServiceType.STT: {},
ServiceType.EMBEDDINGS: {},
ServiceType.REALTIME: {},
}
T = TypeVar("T", bound=BaseServiceConfiguration)
@ -279,6 +285,68 @@ class SpeachesLLMConfiguration(BaseLLMConfiguration):
api_key: str | list[str] | None = Field(default=None)
# Example model ids for the OpenAI realtime provider. In the visible portion of
# this file they are referenced only by the commented-out OpenAI realtime
# configuration.
OPENAI_REALTIME_MODELS = [
    "gpt-4o-realtime-preview",
    "gpt-4o-mini-realtime-preview",
]

# Example voice names for the OpenAI realtime provider.
OPENAI_REALTIME_VOICES = [
    "alloy", "ash", "ballad", "coral",
    "echo", "sage", "shimmer", "verse",
]
# @register_service(ServiceType.REALTIME)
# class OpenAIRealtimeLLMConfiguration(BaseLLMConfiguration):
# provider: Literal[ServiceProviders.OPENAI_REALTIME] = (
# ServiceProviders.OPENAI_REALTIME
# )
# model: str = Field(
# default="gpt-4o-realtime-preview",
# json_schema_extra={
# "examples": OPENAI_REALTIME_MODELS,
# "allow_custom_input": True,
# },
# )
# voice: str = Field(
# default="alloy",
# json_schema_extra={"examples": OPENAI_REALTIME_VOICES},
# )
# Example model ids offered for the Google realtime provider.
GOOGLE_REALTIME_MODELS = [
    "gemini-3.1-flash-live-preview",
]

# Example voice names offered for the Google realtime provider.
GOOGLE_REALTIME_VOICES = [
    "Puck",
    "Charon",
    "Kore",
    "Fenrir",
    "Aoede",
]
# Configuration model for the Google realtime provider, registered under
# ServiceType.REALTIME.
# NOTE(review): the default model id below does not appear in
# GOOGLE_REALTIME_MODELS. The field is a plain `str`, so the default still
# validates, but confirm which model id is meant to be the default.
@register_service(ServiceType.REALTIME)
class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration):
    # Fixed provider tag; `provider` is the discriminator field for the
    # realtime configuration union.
    provider: Literal[ServiceProviders.GOOGLE_REALTIME] = (
        ServiceProviders.GOOGLE_REALTIME
    )

    # Model id; the schema advertises GOOGLE_REALTIME_MODELS as examples.
    model: str = Field(
        default="gemini-2.0-flash-live-001",
        json_schema_extra=dict(
            examples=GOOGLE_REALTIME_MODELS,
            allow_custom_input=True,
        ),
    )

    # Voice name; the schema advertises GOOGLE_REALTIME_VOICES as examples.
    voice: str = Field(
        default="Puck",
        json_schema_extra=dict(
            examples=GOOGLE_REALTIME_VOICES,
            allow_custom_input=True,
        ),
    )
# Set of the string values of the two realtime provider enum members.
REALTIME_PROVIDERS = {
    provider.value
    for provider in (
        ServiceProviders.OPENAI_REALTIME,
        ServiceProviders.GOOGLE_REALTIME,
    )
}
LLMConfig = Annotated[
Union[
OpenAILLMService,
@ -293,6 +361,14 @@ LLMConfig = Annotated[
Field(discriminator="provider"),
]
# Union of realtime service configurations, discriminated on the `provider`
# field. Only the Google configuration is currently a member; the OpenAI
# entry is commented out.
RealtimeConfig = Annotated[
Union[
# OpenAIRealtimeLLMConfiguration,
GoogleRealtimeLLMConfiguration,
],
Field(discriminator="provider"),
]
###################################################### TTS ########################################################################
@ -719,6 +795,7 @@ SPEACHES_STT_MODELS = [
"Systran/faster-distil-whisper-small.en",
"Systran/faster-whisper-large-v3",
]
# Example language codes offered for the Speaches STT `language` field.
SPEACHES_STT_LANGUAGES = [
    "en",
    "ar",
    "nl",
    "fr",
    "de",
    "hi",
    "it",
    "pt",
    "es",
]
@register_stt
@ -731,6 +808,13 @@ class SpeachesSTTConfiguration(BaseSTTConfiguration):
"allow_custom_input": True,
},
)
language: str = Field(
default="en",
json_schema_extra={
"examples": SPEACHES_STT_LANGUAGES,
"allow_custom_input": True,
},
)
base_url: str = Field(
default="http://localhost:8000/v1",
description="OpenAI-compatible STT endpoint (Speaches, etc.)",
@ -785,6 +869,6 @@ EmbeddingsConfig = Annotated[
]
ServiceConfig = Annotated[
Union[LLMConfig, TTSConfig, STTConfig, EmbeddingsConfig],
Union[LLMConfig, RealtimeConfig, TTSConfig, STTConfig, EmbeddingsConfig],
Field(discriminator="provider"),
]