Merge remote-tracking branch 'origin/main' into feat/user-onboarding

# Conflicts:
#	docs/api-reference/openapi.json
#	sdk/python/src/dograh_sdk/_generated_models.py
#	ui/src/client/index.ts
#	ui/src/components/AIModelConfigurationV2Editor.tsx
This commit is contained in:
Abhishek Kumar 2026-06-17 19:19:20 +05:30
commit 5559ed686f
44 changed files with 2155 additions and 321 deletions

View file

@ -49,6 +49,7 @@ class UserConfigurationValidator:
ServiceProviders.CAMB.value: self._check_camb_api_key,
ServiceProviders.AWS_BEDROCK.value: self._check_aws_bedrock_api_key,
ServiceProviders.SPEACHES.value: self._check_speaches_api_key,
ServiceProviders.HUGGINGFACE.value: self._check_huggingface_api_key,
ServiceProviders.GOOGLE_VERTEX.value: self._check_google_vertex_llm_api_key,
ServiceProviders.OPENAI_REALTIME.value: self._check_openai_api_key,
ServiceProviders.GROK_REALTIME.value: self._check_grok_realtime_api_key,
@ -60,6 +61,7 @@ class UserConfigurationValidator:
ServiceProviders.GLADIA.value: self._check_gladia_api_key,
ServiceProviders.RIME.value: self._check_rime_api_key,
ServiceProviders.MINIMAX.value: self._check_minimax_api_key,
ServiceProviders.SMALLEST.value: self._check_smallest_api_key,
}
async def validate(
@ -360,6 +362,14 @@ class UserConfigurationValidator:
raise ValueError("base_url is required for Speaches services")
return True
def _check_huggingface_api_key(self, model: str, api_key: str) -> bool:
if not api_key.startswith("hf_"):
raise ValueError(
"Invalid Hugging Face API token format. Use a token that starts with "
"'hf_' and has Inference Providers permission."
)
return True
def _check_google_vertex_realtime_api_key(self, model: str, service_config) -> bool:
if not getattr(service_config, "project_id", None):
raise ValueError("project_id is required for Google Vertex Realtime")
@ -389,6 +399,7 @@ class UserConfigurationValidator:
return True
def _check_minimax_api_key(self, model: str, api_key: str) -> bool:
# Always accepts: neither `model` nor `api_key` is inspected here.
# MiniMax doesn't publish a cheap key-validation endpoint; trust the key
# at save time and surface auth errors at first call (same as Rime/Sarvam).
return True
def _check_smallest_api_key(self, model: str, api_key: str) -> bool:
# Always accepts: neither `model` nor `api_key` is inspected here.
# NOTE(review): presumably the key is trusted at save time and auth errors
# surface at first call, as for MiniMax — confirm and document the reason.
return True

View file

@ -68,6 +68,7 @@ class ServiceProviders(str, Enum):
CAMB = "camb"
AWS_BEDROCK = "aws_bedrock"
SPEACHES = "speaches"
HUGGINGFACE = "huggingface"
ASSEMBLYAI = "assemblyai"
GLADIA = "gladia"
RIME = "rime"
@ -79,6 +80,7 @@ class ServiceProviders(str, Enum):
GOOGLE_REALTIME = "google_realtime"
GOOGLE_VERTEX_REALTIME = "google_vertex_realtime"
AZURE_REALTIME = "azure_realtime"
SMALLEST = "smallest"
class BaseServiceConfiguration(BaseModel):
@ -94,6 +96,7 @@ class BaseServiceConfiguration(BaseModel):
ServiceProviders.DOGRAH,
ServiceProviders.AWS_BEDROCK,
ServiceProviders.SPEACHES,
ServiceProviders.HUGGINGFACE,
ServiceProviders.ASSEMBLYAI,
ServiceProviders.GLADIA,
ServiceProviders.RIME,
@ -106,6 +109,7 @@ class BaseServiceConfiguration(BaseModel):
ServiceProviders.GOOGLE_VERTEX_REALTIME,
ServiceProviders.AZURE_REALTIME,
ServiceProviders.SARVAM,
ServiceProviders.SMALLEST,
]
api_key: str | list[str]
@ -255,6 +259,11 @@ SPEACHES_PROVIDER_MODEL_CONFIG = provider_model_config(
),
provider_docs_url="https://github.com/speaches-ai/speaches",
)
# Provider-level metadata for Hugging Face (label, description, docs URL);
# used as `model_config` by the Hugging Face LLM and STT configuration classes.
HUGGINGFACE_PROVIDER_MODEL_CONFIG = provider_model_config(
"Hugging Face",
description="Hosted Hugging Face Inference Providers API for usage-based inference.",
provider_docs_url="https://huggingface.co/docs/inference-providers/en/index",
)
AZURE_SPEECH_PROVIDER_MODEL_CONFIG = provider_model_config(
"Azure Speech Services",
description="Azure Cognitive Services Speech — TTS and STT via the Azure Speech SDK.",
@ -471,6 +480,35 @@ class SpeachesLLMConfiguration(BaseLLMConfiguration):
)
# Suggested chat models, surfaced as schema `examples` for the Hugging Face LLM
# `model` field. The part after ":" is the optional provider suffix mentioned
# in that field's description.
HUGGINGFACE_LLM_MODELS = [
"openai/gpt-oss-120b:cerebras",
"deepseek-ai/DeepSeek-R1:fastest",
"Qwen/Qwen3-Coder-480B-A35B-Instruct:fastest",
]
# LLM configuration for the Hugging Face provider.
@register_llm
class HuggingFaceLLMConfiguration(BaseLLMConfiguration):
model_config = HUGGINGFACE_PROVIDER_MODEL_CONFIG
# Pins this configuration to the Hugging Face provider value.
provider: Literal[ServiceProviders.HUGGINGFACE] = ServiceProviders.HUGGINGFACE
# Model id; HUGGINGFACE_LLM_MODELS are offered as examples and custom input
# is allowed.
model: str = Field(
default="openai/gpt-oss-120b:cerebras",
description="Hugging Face chat-completion model identifier, optionally with provider suffix.",
json_schema_extra={
"examples": HUGGINGFACE_LLM_MODELS,
"allow_custom_input": True,
},
)
# Router endpoint; defaults to the Hugging Face OpenAI-compatible /v1 URL.
base_url: str = Field(
default="https://router.huggingface.co/v1",
description="Hugging Face OpenAI-compatible chat-completions router base URL.",
)
# Optional billing target; None leaves it unset.
bill_to: str | None = Field(
default=None,
description="Optional Hugging Face organization or user to bill using X-HF-Bill-To.",
)
MINIMAX_MODELS = [
"MiniMax-M2.7",
"MiniMax-M2.7-highspeed",
@ -741,6 +779,7 @@ LLMConfig = Annotated[
DograhLLMService,
AWSBedrockLLMConfiguration,
SpeachesLLMConfiguration,
HuggingFaceLLMConfiguration,
MiniMaxLLMConfiguration,
SarvamLLMConfiguration,
],
@ -907,6 +946,7 @@ class DograhTTSService(BaseTTSConfiguration):
voice: str = Field(
default="default",
description="Voice preset.",
json_schema_extra={"allow_custom_input": True},
)
speed: float = Field(default=1.0, ge=0.5, le=2.0, description="Speed of the voice.")
@ -961,6 +1001,12 @@ class SarvamTTSConfiguration(BaseTTSConfiguration):
description="BCP-47 Indian-language code (e.g. hi-IN, en-IN).",
json_schema_extra={"examples": SARVAM_LANGUAGES},
)
speed: float = Field(
default=1.0,
ge=0.5,
le=2.0,
description="Speech speed multiplier.",
)
CAMB_TTS_MODELS = ["mars-flash", "mars-pro", "mars-instruct"]
@ -1120,6 +1166,80 @@ class AzureSpeechTTSConfiguration(BaseTTSConfiguration):
)
# Provider-level metadata for Smallest AI (label, description, docs URL);
# used as `model_config` by the Smallest AI TTS and STT configuration classes.
SMALLEST_PROVIDER_MODEL_CONFIG = provider_model_config(
"Smallest AI",
description="Smallest AI ultralow-latency TTS (Waves) and STT (Pulse) APIs.",
provider_docs_url="https://smallest.ai/docs",
)
# The lists below are surfaced as schema `examples` on SmallestAITTSConfiguration.
SMALLEST_TTS_MODELS = ["lightning_v3.1", "lightning_v3.1_pro"]
SMALLEST_TTS_VOICES = [
"sophia",
"avery",
"liam",
"lucas",
"olivia",
"ryan",
"freya",
"william",
"devansh",
"arjun",
"niharika",
"maya",
"dhruv",
"mia",
"maithili",
]
# NOTE(review): 16 codes are listed here, while the TTS `model` field
# description speaks of 12 languages for lightning_v3.1 — confirm which is right.
SMALLEST_TTS_LANGUAGES = [
"en",
"hi",
"fr",
"de",
"es",
"it",
"nl",
"pl",
"ru",
"ar",
"bn",
"gu",
"he",
"kn",
"mr",
"ta",
]
# TTS configuration for the Smallest AI provider.
@register_tts
class SmallestAITTSConfiguration(BaseTTSConfiguration):
model_config = SMALLEST_PROVIDER_MODEL_CONFIG
# Pins this configuration to the Smallest AI provider value.
provider: Literal[ServiceProviders.SMALLEST] = ServiceProviders.SMALLEST
# Model id; SMALLEST_TTS_MODELS are listed as examples (custom input is not
# flagged as allowed for this field).
model: str = Field(
default="lightning_v3.1",
description="Smallest AI TTS model. lightning_v3.1_pro is the premium pool (American, British, Indian accents); lightning_v3.1 is the standard pool with 217 voices across 12 languages.",
json_schema_extra={"examples": SMALLEST_TTS_MODELS},
)
# Voice id; SMALLEST_TTS_VOICES are examples and custom input is allowed.
voice: str = Field(
default="sophia",
description="Smallest AI voice ID.",
json_schema_extra={"examples": SMALLEST_TTS_VOICES, "allow_custom_input": True},
)
# Language code; SMALLEST_TTS_LANGUAGES are examples and custom input is allowed.
language: str = Field(
default="en",
description="ISO 639-1 language code for synthesis.",
json_schema_extra={
"examples": SMALLEST_TTS_LANGUAGES,
"allow_custom_input": True,
},
)
# Validated to the closed range [0.5, 2.0]; 1.0 is the default.
speed: float = Field(
default=1.0,
ge=0.5,
le=2.0,
description="Speech speed multiplier (0.5 to 2.0).",
)
TTSConfig = Annotated[
Union[
DeepgramTTSConfiguration,
@ -1134,6 +1254,7 @@ TTSConfig = Annotated[
SpeachesTTSConfiguration,
MiniMaxTTSConfiguration,
AzureSpeechTTSConfiguration,
SmallestAITTSConfiguration,
],
Field(discriminator="provider"),
]
@ -1334,6 +1455,38 @@ class SpeachesSTTConfiguration(BaseSTTConfiguration):
)
# Suggested speech-recognition models, surfaced as schema `examples` for the
# Hugging Face STT `model` field.
HUGGINGFACE_STT_MODELS = [
"openai/whisper-large-v3-turbo",
"openai/whisper-large-v3",
]
# STT configuration for the Hugging Face provider.
@register_stt
class HuggingFaceSTTConfiguration(BaseSTTConfiguration):
model_config = HUGGINGFACE_PROVIDER_MODEL_CONFIG
# Pins this configuration to the Hugging Face provider value.
provider: Literal[ServiceProviders.HUGGINGFACE] = ServiceProviders.HUGGINGFACE
# Model id; HUGGINGFACE_STT_MODELS are offered as examples and custom input
# is allowed.
model: str = Field(
default="openai/whisper-large-v3-turbo",
description="Hugging Face ASR model identifier served through Inference Providers.",
json_schema_extra={
"examples": HUGGINGFACE_STT_MODELS,
"allow_custom_input": True,
},
)
# Router endpoint; note the default differs from the LLM configuration's /v1 URL.
base_url: str = Field(
default="https://router.huggingface.co/hf-inference",
description="Hugging Face Inference Providers router base URL.",
)
# Optional billing target; None leaves it unset.
bill_to: str | None = Field(
default=None,
description="Optional Hugging Face organization or user to bill using X-HF-Bill-To.",
)
# Off by default; opt-in request for timestamp chunks.
return_timestamps: bool = Field(
default=False,
description="Request timestamp chunks when supported by the selected provider/model.",
)
ASSEMBLYAI_STT_MODELS = ["u3-rt-pro"]
ASSEMBLYAI_STT_LANGUAGES = ["en", "es", "de", "fr", "pt", "it"]
@ -1396,6 +1549,62 @@ class AzureSpeechSTTConfiguration(BaseSTTConfiguration):
)
# The lists below are surfaced as schema `examples` on SmallestAISTTConfiguration.
SMALLEST_STT_MODELS = ["pulse"]
# NOTE(review): 32 codes are listed here, while the STT `model` field
# description speaks of 38 languages — confirm whether entries are missing.
SMALLEST_STT_LANGUAGES = [
"en",
"hi",
"fr",
"de",
"es",
"it",
"nl",
"pl",
"ru",
"pt",
"bn",
"gu",
"kn",
"ml",
"mr",
"ta",
"te",
"pa",
"or",
"bg",
"cs",
"da",
"et",
"fi",
"hu",
"lt",
"lv",
"mt",
"ro",
"sk",
"sv",
"uk",
]
# STT configuration for the Smallest AI provider.
@register_stt
class SmallestAISTTConfiguration(BaseSTTConfiguration):
model_config = SMALLEST_PROVIDER_MODEL_CONFIG
# Pins this configuration to the Smallest AI provider value.
provider: Literal[ServiceProviders.SMALLEST] = ServiceProviders.SMALLEST
# Model id; SMALLEST_STT_MODELS are listed as examples (custom input is not
# flagged as allowed for this field).
model: str = Field(
default="pulse",
description="Smallest AI STT model. Supports 38 languages with real-time streaming.",
json_schema_extra={"examples": SMALLEST_STT_MODELS},
)
# Language code; SMALLEST_STT_LANGUAGES are examples and custom input is allowed.
language: str = Field(
default="en",
description="ISO 639-1 language code for transcription.",
json_schema_extra={
"examples": SMALLEST_STT_LANGUAGES,
"allow_custom_input": True,
},
)
STTConfig = Annotated[
Union[
DeepgramSTTConfiguration,
@ -1406,9 +1615,11 @@ STTConfig = Annotated[
SpeechmaticsSTTConfiguration,
SarvamSTTConfiguration,
SpeachesSTTConfiguration,
HuggingFaceSTTConfiguration,
AssemblyAISTTConfiguration,
GladiaSTTConfiguration,
AzureSpeechSTTConfiguration,
SmallestAISTTConfiguration,
],
Field(discriminator="provider"),
]

View file

@ -9,8 +9,8 @@ from api.services.integrations import IntegrationRuntimeSession
from api.services.pipecat.audio_config import AudioConfig
from api.services.pipecat.audio_playback import play_audio_loop
from api.services.pipecat.in_memory_buffers import (
InMemoryAudioBuffer,
InMemoryLogsBuffer,
InMemoryRecordingBuffers,
)
from api.services.pipecat.pipeline_metrics_aggregator import PipelineMetricsAggregator
from api.services.pipecat.tracing_config import get_trace_url
@ -40,11 +40,11 @@ async def _capture_call_event(
"workflow_run_id": workflow_run_id,
"workflow_id": workflow_run.workflow_id if workflow_run else None,
"call_type": workflow_run.mode if workflow_run else None,
"call_direction": (workflow_run.initial_context or {}).get(
"direction", "outbound"
)
if workflow_run
else None,
"call_direction": (
(workflow_run.initial_context or {}).get("direction", "outbound")
if workflow_run
else None
),
}
if extra_properties:
properties.update(extra_properties)
@ -73,7 +73,7 @@ def register_event_handlers(
"""Register all event handlers for transport and task events.
Returns:
in_memory_audio_buffer for use by other handlers.
In-memory recording buffers for use by other handlers.
"""
# Initialize in-memory buffers with proper audio configuration
sample_rate = audio_config.pipeline_sample_rate if audio_config else 16000
@ -84,7 +84,7 @@ def register_event_handlers(
f"with sample_rate={sample_rate}Hz, channels={num_channels}"
)
in_memory_audio_buffer = InMemoryAudioBuffer(
in_memory_audio_buffers = InMemoryRecordingBuffers(
workflow_run_id=workflow_run_id,
sample_rate=sample_rate,
num_channels=num_channels,
@ -363,14 +363,32 @@ def register_event_handlers(
# Write buffers to temp files and enqueue combined processing task
audio_temp_path = None
user_audio_temp_path = None
bot_audio_temp_path = None
transcript_temp_path = None
try:
if not in_memory_audio_buffer.is_empty:
audio_temp_path = await in_memory_audio_buffer.write_to_temp_file()
if not in_memory_audio_buffers.mixed.is_empty:
audio_temp_path = (
await in_memory_audio_buffers.mixed.write_to_temp_file()
)
else:
logger.debug("Audio buffer is empty, skipping upload")
if not in_memory_audio_buffers.user.is_empty:
user_audio_temp_path = (
await in_memory_audio_buffers.user.write_to_temp_file()
)
else:
logger.debug("User audio buffer is empty, skipping upload")
if not in_memory_audio_buffers.bot.is_empty:
bot_audio_temp_path = (
await in_memory_audio_buffers.bot.write_to_temp_file()
)
else:
logger.debug("Bot audio buffer is empty, skipping upload")
transcript_temp_path = in_memory_logs_buffer.write_transcript_to_temp_file()
if not transcript_temp_path:
logger.debug("No transcript events in logs buffer, skipping upload")
@ -385,16 +403,18 @@ def register_event_handlers(
workflow_run_id,
audio_temp_path,
transcript_temp_path,
user_audio_temp_path,
bot_audio_temp_path,
)
# Return the buffer so it can be passed to other handlers
return in_memory_audio_buffer
return in_memory_audio_buffers
def register_audio_data_handler(
audio_buffer: AudioBufferProcessor,
workflow_run_id,
in_memory_buffer: InMemoryAudioBuffer,
in_memory_buffers: InMemoryRecordingBuffers,
):
"""Register event handler for audio data"""
logger.info(f"Registering audio data handler for workflow run {workflow_run_id}")
@ -404,9 +424,19 @@ def register_audio_data_handler(
if not audio:
return
# Use in-memory buffer
try:
await in_memory_buffer.append(audio)
await in_memory_buffers.mixed.append(audio)
except MemoryError as e:
logger.error(f"Memory buffer full: {e}")
# Could implement overflow to disk here if needed
logger.error(f"Mixed audio buffer full: {e}")
@audio_buffer.event_handler("on_track_audio_data")
async def on_track_audio_data(
    buffer, user_audio, bot_audio, sample_rate, num_channels
):
    """Append per-track audio chunks to the user and bot in-memory buffers.

    Empty chunks are skipped. `buffer`, `sample_rate` and `num_channels` are
    accepted from the event but not used. A MemoryError raised by either
    append is logged and ends handling of this event.
    """
    try:
        # User first, then bot — same order as the event arguments.
        for track_buffer, chunk in (
            (in_memory_buffers.user, user_audio),
            (in_memory_buffers.bot, bot_audio),
        ):
            if chunk:
                await track_buffer.append(chunk)
    except MemoryError as e:
        logger.error(f"Track audio buffer full: {e}")

View file

@ -75,6 +75,27 @@ class InMemoryAudioBuffer:
return self._total_size
class InMemoryRecordingBuffers:
    """Groups the mixed recording buffer with separate user and bot mono buffers."""

    def __init__(self, workflow_run_id: int, sample_rate: int, num_channels: int = 1):
        def _make_buffer(channels: int):
            # All three buffers share the run id and sample rate; only the
            # channel count differs.
            return InMemoryAudioBuffer(
                workflow_run_id=workflow_run_id,
                sample_rate=sample_rate,
                num_channels=channels,
            )

        # The mixed recording keeps the caller-supplied channel count.
        self.mixed = _make_buffer(num_channels)
        # Per-speaker tracks are always single-channel.
        self.user = _make_buffer(1)
        self.bot = _make_buffer(1)
class InMemoryLogsBuffer:
"""Buffer real-time feedback events in memory during a call, then save to workflow run logs."""

View file

@ -39,8 +39,17 @@ from pipecat.services.google.vertex.llm import (
GoogleVertexLLMSettings,
)
from pipecat.services.groq.llm import GroqLLMService, GroqLLMSettings
from pipecat.services.huggingface.llm import (
HuggingFaceLLMService,
HuggingFaceLLMSettings,
)
from pipecat.services.huggingface.stt import (
HuggingFaceSTTService,
HuggingFaceSTTSettings,
)
from pipecat.services.minimax.llm import MiniMaxLLMService
from pipecat.services.minimax.tts import MiniMaxTTSSettings
from pipecat.services.openai._constants import OPENAI_SAMPLE_RATE
from pipecat.services.openai.base_llm import OpenAILLMSettings
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.services.openai.stt import (
@ -53,6 +62,8 @@ from pipecat.services.rime.tts import RimeTTSService, RimeTTSSettings
from pipecat.services.sarvam.llm import SarvamLLMService, SarvamLLMSettings
from pipecat.services.sarvam.stt import SarvamSTTService, SarvamSTTSettings
from pipecat.services.sarvam.tts import SarvamTTSService, SarvamTTSSettings
from pipecat.services.smallest.stt import SmallestSTTService, SmallestSTTSettings
from pipecat.services.smallest.tts import SmallestTTSService, SmallestTTSSettings
from pipecat.services.speaches.llm import SpeachesLLMService, SpeachesLLMSettings
from pipecat.services.speaches.stt import SpeachesSTTService, SpeachesSTTSettings
from pipecat.services.speaches.tts import SpeachesTTSService, SpeachesTTSSettings
@ -218,6 +229,22 @@ def create_stt_service(
),
sample_rate=audio_config.transport_in_sample_rate,
)
elif user_config.stt.provider == ServiceProviders.HUGGINGFACE.value:
base_url = (
getattr(user_config.stt, "base_url", None)
or "https://router.huggingface.co/hf-inference"
)
_validate_runtime_service_url(base_url, "base_url")
return HuggingFaceSTTService(
api_key=user_config.stt.api_key,
base_url=base_url,
bill_to=getattr(user_config.stt, "bill_to", None),
settings=HuggingFaceSTTSettings(
model=user_config.stt.model,
return_timestamps=getattr(user_config.stt, "return_timestamps", False),
),
sample_rate=audio_config.transport_in_sample_rate,
)
elif user_config.stt.provider == ServiceProviders.ASSEMBLYAI.value:
language = getattr(user_config.stt, "language", None)
settings_kwargs = {"model": user_config.stt.model, "language": language}
@ -284,6 +311,20 @@ def create_stt_service(
settings=AzureSTTSettings(language=pipecat_language),
sample_rate=audio_config.transport_in_sample_rate,
)
elif user_config.stt.provider == ServiceProviders.SMALLEST.value:
language_code = getattr(user_config.stt, "language", None) or "en"
try:
pipecat_language = Language(language_code)
except ValueError:
pipecat_language = Language.EN
return SmallestSTTService(
api_key=user_config.stt.api_key,
settings=SmallestSTTSettings(
model=user_config.stt.model,
language=pipecat_language,
),
sample_rate=audio_config.transport_in_sample_rate,
)
else:
raise HTTPException(
status_code=400, detail=f"Invalid STT provider {user_config.stt.provider}"
@ -320,6 +361,7 @@ def create_tts_service(
kwargs["base_url"] = base_url
return OpenAITTSService(
api_key=user_config.tts.api_key,
sample_rate=OPENAI_SAMPLE_RATE,
settings=OpenAITTSSettings(model=user_config.tts.model),
text_filters=[xml_function_tag_filter],
skip_aggregator_types=["recording_router", "recording"],
@ -493,13 +535,17 @@ def create_tts_service(
pipecat_language = language_mapping.get(language, Language.HI)
voice = getattr(user_config.tts, "voice", None) or "anushka"
speed = getattr(user_config.tts, "speed", None)
settings_kwargs = {
"model": user_config.tts.model,
"voice": voice,
"language": pipecat_language,
}
if speed and speed != 1.0:
settings_kwargs["pace"] = speed
return SarvamTTSService(
api_key=user_config.tts.api_key,
settings=SarvamTTSSettings(
model=user_config.tts.model,
voice=voice,
language=pipecat_language,
),
settings=SarvamTTSSettings(**settings_kwargs),
text_filters=[xml_function_tag_filter],
skip_aggregator_types=["recording_router", "recording"],
silence_time_s=1.0,
@ -560,6 +606,28 @@ def create_tts_service(
skip_aggregator_types=["recording_router", "recording"],
silence_time_s=1.0,
)
elif user_config.tts.provider == ServiceProviders.SMALLEST.value:
language_code = getattr(user_config.tts, "language", None) or "en"
try:
pipecat_language = Language(language_code)
except ValueError:
pipecat_language = Language.EN
speed = getattr(user_config.tts, "speed", None)
model = user_config.tts.model.replace("lightning-v", "lightning_v")
settings_kwargs = SmallestTTSSettings(
model=model,
voice=user_config.tts.voice,
language=pipecat_language,
)
if speed and speed != 1.0:
settings_kwargs.speed = speed
return SmallestTTSService(
api_key=user_config.tts.api_key,
settings=settings_kwargs,
text_filters=[xml_function_tag_filter],
skip_aggregator_types=["recording_router", "recording"],
silence_time_s=1.0,
)
else:
raise HTTPException(
status_code=400, detail=f"Invalid TTS provider {user_config.tts.provider}"
@ -581,6 +649,7 @@ def create_llm_service_from_provider(
location: str | None = None,
credentials: str | None = None,
temperature: float | None = None,
bill_to: str | None = None,
):
"""Create an LLM service from explicit provider/model/api_key.
@ -663,6 +732,15 @@ def create_llm_service_from_provider(
api_key=api_key or "none",
settings=SpeachesLLMSettings(model=model),
)
elif provider == ServiceProviders.HUGGINGFACE.value:
base_url = base_url or "https://router.huggingface.co/v1"
_validate_runtime_service_url(base_url, "base_url")
return HuggingFaceLLMService(
api_key=api_key,
base_url=base_url,
bill_to=bill_to,
settings=HuggingFaceLLMSettings(model=model, temperature=0.1),
)
elif provider == ServiceProviders.MINIMAX.value:
base_url = base_url or "https://api.minimax.io/v1"
_validate_runtime_service_url(base_url, "base_url")
@ -875,6 +953,9 @@ def create_llm_service(user_config, correlation_id: str | None = None):
kwargs["endpoint"] = user_config.llm.endpoint
elif provider == ServiceProviders.SPEACHES.value:
kwargs["base_url"] = user_config.llm.base_url
elif provider == ServiceProviders.HUGGINGFACE.value:
kwargs["base_url"] = user_config.llm.base_url
kwargs["bill_to"] = user_config.llm.bill_to
elif provider == ServiceProviders.AWS_BEDROCK.value:
kwargs["aws_access_key"] = user_config.llm.aws_access_key
kwargs["aws_secret_key"] = user_config.llm.aws_secret_key

View file

@ -718,6 +718,8 @@ class TriggerNodeData(BaseNodeData):
"rsvp": "{{gathered_context.rsvp}}",
"duration": "{{cost_info.call_duration_seconds}}",
"recording_url": "{{recording_url}}",
"user_recording_url": "{{user_recording_url}}",
"bot_recording_url": "{{bot_recording_url}}",
"transcript_url": "{{transcript_url}}",
},
},