mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-07 07:55:16 +02:00
feat: add openai realtime models (#298)
* feat: add openai realtime models * chore: bump pipecat * fix: resample telephony audio for openai realtime * fix: sampling rate fix for openai realtime * chore: clean up dead code
This commit is contained in:
parent
45b00cd5d0
commit
2381a803ad
45 changed files with 1991 additions and 173 deletions
|
|
@ -29,6 +29,20 @@ class ToolParameter(BaseModel):
|
|||
)
|
||||
|
||||
|
||||
class PresetToolParameter(BaseModel):
|
||||
"""A parameter injected by Dograh at runtime."""
|
||||
|
||||
name: str = Field(description="Parameter name (used as key in request body)")
|
||||
type: str = Field(description="Parameter type: string, number, or boolean")
|
||||
value_template: str = Field(
|
||||
description="Fixed value or template, e.g. {{initial_context.phone_number}}"
|
||||
)
|
||||
required: bool = Field(
|
||||
default=True,
|
||||
description="Whether the parameter must resolve to a non-empty value",
|
||||
)
|
||||
|
||||
|
||||
class HttpApiConfig(BaseModel):
|
||||
"""Configuration for HTTP API tools."""
|
||||
|
||||
|
|
@ -43,6 +57,10 @@ class HttpApiConfig(BaseModel):
|
|||
parameters: Optional[List[ToolParameter]] = Field(
|
||||
default=None, description="Parameters that the tool accepts from LLM"
|
||||
)
|
||||
preset_parameters: Optional[List[PresetToolParameter]] = Field(
|
||||
default=None,
|
||||
description="Parameters injected by Dograh from fixed values or workflow context templates",
|
||||
)
|
||||
timeout_ms: Optional[int] = Field(
|
||||
default=5000, description="Request timeout in milliseconds"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -20,8 +20,8 @@ from api.utils.common import get_backend_endpoints
|
|||
if TYPE_CHECKING:
|
||||
# Type-only — importing api.services.telephony eagerly triggers the
|
||||
# provider package init, which can pull in this module via the routes
|
||||
# chain and create a circular import. Runtime calls below go through
|
||||
# ``factory.get_telephony_provider`` (lazy import inside the method).
|
||||
# chain and create a circular import. Runtime calls below lazy-import the
|
||||
# factory helpers inside methods instead.
|
||||
from api.services.telephony.base import TelephonyProvider
|
||||
|
||||
|
||||
|
|
@ -31,12 +31,6 @@ class CampaignCallDispatcher:
|
|||
def __init__(self):
|
||||
self.default_concurrent_limit = int(DEFAULT_ORG_CONCURRENCY_LIMIT)
|
||||
|
||||
async def get_telephony_provider(self, organization_id: int) -> "TelephonyProvider":
|
||||
"""Get telephony provider instance for specific organization (default config)."""
|
||||
from api.services.telephony.factory import get_default_telephony_provider
|
||||
|
||||
return await get_default_telephony_provider(organization_id)
|
||||
|
||||
async def get_provider_for_campaign(self, campaign) -> "TelephonyProvider":
|
||||
"""Get the telephony provider pinned to this campaign's config. Falls back
|
||||
to the org's default config for legacy campaigns whose
|
||||
|
|
|
|||
|
|
@ -49,6 +49,7 @@ class UserConfigurationValidator:
|
|||
ServiceProviders.SPEACHES.value: self._check_speaches_api_key,
|
||||
ServiceProviders.OPENAI_REALTIME.value: self._check_openai_api_key,
|
||||
ServiceProviders.GOOGLE_REALTIME.value: self._check_google_api_key,
|
||||
ServiceProviders.GOOGLE_VERTEX_REALTIME.value: self._check_google_vertex_realtime_api_key,
|
||||
ServiceProviders.ASSEMBLYAI.value: self._check_assemblyai_api_key,
|
||||
ServiceProviders.GLADIA.value: self._check_gladia_api_key,
|
||||
ServiceProviders.RIME.value: self._check_rime_api_key,
|
||||
|
|
@ -116,6 +117,22 @@ class UserConfigurationValidator:
|
|||
return [{"model": service_name, "message": str(e)}]
|
||||
return []
|
||||
|
||||
# Vertex Realtime uses service-account credentials (or ADC) instead of api_key
|
||||
if provider == ServiceProviders.GOOGLE_VERTEX_REALTIME.value:
|
||||
try:
|
||||
if not self._check_google_vertex_realtime_api_key(
|
||||
provider, service_config
|
||||
):
|
||||
return [
|
||||
{
|
||||
"model": service_name,
|
||||
"message": f"Invalid {provider} configuration",
|
||||
}
|
||||
]
|
||||
except ValueError as e:
|
||||
return [{"model": service_name, "message": str(e)}]
|
||||
return []
|
||||
|
||||
# AWS Bedrock uses AWS credentials instead of api_key
|
||||
if provider == ServiceProviders.AWS_BEDROCK.value:
|
||||
try:
|
||||
|
|
@ -216,6 +233,13 @@ class UserConfigurationValidator:
|
|||
raise ValueError("base_url is required for Speaches services")
|
||||
return True
|
||||
|
||||
def _check_google_vertex_realtime_api_key(self, model: str, service_config) -> bool:
|
||||
if not getattr(service_config, "project_id", None):
|
||||
raise ValueError("project_id is required for Google Vertex Realtime")
|
||||
if not getattr(service_config, "location", None):
|
||||
raise ValueError("location is required for Google Vertex Realtime")
|
||||
return True
|
||||
|
||||
def _check_aws_bedrock_api_key(self, model: str, service_config) -> bool:
|
||||
if not service_config.aws_access_key or not service_config.aws_secret_key:
|
||||
raise ValueError("AWS access key and secret key are required for Bedrock")
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ class ServiceProviders(str, Enum):
|
|||
RIME = "rime"
|
||||
OPENAI_REALTIME = "openai_realtime"
|
||||
GOOGLE_REALTIME = "google_realtime"
|
||||
GOOGLE_VERTEX_REALTIME = "google_vertex_realtime"
|
||||
|
||||
|
||||
class BaseServiceConfiguration(BaseModel):
|
||||
|
|
@ -53,6 +54,7 @@ class BaseServiceConfiguration(BaseModel):
|
|||
ServiceProviders.RIME,
|
||||
ServiceProviders.OPENAI_REALTIME,
|
||||
ServiceProviders.GOOGLE_REALTIME,
|
||||
ServiceProviders.GOOGLE_VERTEX_REALTIME,
|
||||
# ServiceProviders.SARVAM,
|
||||
]
|
||||
api_key: str | list[str]
|
||||
|
|
@ -205,6 +207,7 @@ class OpenAILLMService(BaseLLMConfiguration):
|
|||
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
|
||||
model: str = Field(
|
||||
default="gpt-4.1",
|
||||
description="OpenAI chat model to use.",
|
||||
json_schema_extra={"examples": OPENAI_MODELS, "allow_custom_input": True},
|
||||
)
|
||||
|
||||
|
|
@ -214,6 +217,7 @@ class GoogleLLMService(BaseLLMConfiguration):
|
|||
provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE
|
||||
model: str = Field(
|
||||
default="gemini-2.0-flash",
|
||||
description="Gemini model on Google AI Studio (not Vertex).",
|
||||
json_schema_extra={"examples": GOOGLE_MODELS, "allow_custom_input": True},
|
||||
)
|
||||
|
||||
|
|
@ -223,6 +227,7 @@ class GroqLLMService(BaseLLMConfiguration):
|
|||
provider: Literal[ServiceProviders.GROQ] = ServiceProviders.GROQ
|
||||
model: str = Field(
|
||||
default="llama-3.3-70b-versatile",
|
||||
description="Groq-hosted model identifier.",
|
||||
json_schema_extra={"examples": GROQ_MODELS, "allow_custom_input": True},
|
||||
)
|
||||
|
||||
|
|
@ -232,10 +237,14 @@ class OpenRouterLLMConfiguration(BaseLLMConfiguration):
|
|||
provider: Literal[ServiceProviders.OPENROUTER] = ServiceProviders.OPENROUTER
|
||||
model: str = Field(
|
||||
default="openai/gpt-4.1",
|
||||
description="OpenRouter model slug in 'vendor/model' form.",
|
||||
json_schema_extra={"examples": OPENROUTER_MODELS, "allow_custom_input": True},
|
||||
)
|
||||
|
||||
base_url: str = Field(default="https://openrouter.ai/api/v1")
|
||||
base_url: str = Field(
|
||||
default="https://openrouter.ai/api/v1",
|
||||
description="Override only if proxying OpenRouter through your own gateway.",
|
||||
)
|
||||
|
||||
|
||||
@register_llm
|
||||
|
|
@ -243,10 +252,13 @@ class AzureLLMService(BaseLLMConfiguration):
|
|||
provider: Literal[ServiceProviders.AZURE] = ServiceProviders.AZURE
|
||||
model: str = Field(
|
||||
default="gpt-4.1-mini",
|
||||
description="Azure deployment name (not the upstream OpenAI model id).",
|
||||
json_schema_extra={"examples": AZURE_MODELS, "allow_custom_input": True},
|
||||
)
|
||||
|
||||
endpoint: str
|
||||
endpoint: str = Field(
|
||||
description="Azure OpenAI resource endpoint (e.g. https://<resource>.openai.azure.com).",
|
||||
)
|
||||
|
||||
|
||||
@register_llm
|
||||
|
|
@ -254,6 +266,7 @@ class DograhLLMService(BaseLLMConfiguration):
|
|||
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
|
||||
model: str = Field(
|
||||
default="default",
|
||||
description="Dograh-hosted model tier.",
|
||||
json_schema_extra={"examples": DOGRAH_LLM_MODELS, "allow_custom_input": True},
|
||||
)
|
||||
|
||||
|
|
@ -263,12 +276,25 @@ class AWSBedrockLLMConfiguration(BaseLLMConfiguration):
|
|||
provider: Literal[ServiceProviders.AWS_BEDROCK] = ServiceProviders.AWS_BEDROCK
|
||||
model: str = Field(
|
||||
default="us.amazon.nova-pro-v1:0",
|
||||
description="Bedrock model ID — include the region inference-profile prefix (e.g. 'us.').",
|
||||
json_schema_extra={"examples": AWS_BEDROCK_MODELS, "allow_custom_input": True},
|
||||
)
|
||||
aws_access_key: str = Field(default="")
|
||||
aws_secret_key: str = Field(default="")
|
||||
aws_region: str = Field(default="us-east-1")
|
||||
api_key: str | list[str] | None = Field(default=None)
|
||||
aws_access_key: str = Field(
|
||||
default="",
|
||||
description="AWS access key ID with bedrock:InvokeModel permission.",
|
||||
)
|
||||
aws_secret_key: str = Field(
|
||||
default="",
|
||||
description="AWS secret access key paired with the access key ID.",
|
||||
)
|
||||
aws_region: str = Field(
|
||||
default="us-east-1",
|
||||
description="AWS region where the Bedrock model is available.",
|
||||
)
|
||||
api_key: str | list[str] | None = Field(
|
||||
default=None,
|
||||
description="Not used for Bedrock — authentication is via the AWS credentials above. Leave blank.",
|
||||
)
|
||||
|
||||
|
||||
SPEACHES_LLM_MODELS = ["llama3", "mistral", "phi3", "qwen2", "gemma2", "deepseek-r1"]
|
||||
|
|
@ -279,6 +305,7 @@ class SpeachesLLMConfiguration(BaseLLMConfiguration):
|
|||
provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
|
||||
model: str = Field(
|
||||
default="llama3",
|
||||
description="Model name as exposed by your OpenAI-compatible server.",
|
||||
json_schema_extra={
|
||||
"examples": SPEACHES_LLM_MODELS,
|
||||
"allow_custom_input": True,
|
||||
|
|
@ -286,12 +313,15 @@ class SpeachesLLMConfiguration(BaseLLMConfiguration):
|
|||
)
|
||||
base_url: str = Field(
|
||||
default="http://localhost:11434/v1",
|
||||
description="OpenAI-compatible endpoint (Ollama, vLLM, etc.)",
|
||||
description="OpenAI-compatible endpoint (Ollama, vLLM, etc.).",
|
||||
)
|
||||
api_key: str | list[str] | None = Field(
|
||||
default=None,
|
||||
description="Usually not required for self-hosted endpoints. Leave blank unless your server enforces one.",
|
||||
)
|
||||
api_key: str | list[str] | None = Field(default=None)
|
||||
|
||||
|
||||
OPENAI_REALTIME_MODELS = ["gpt-4o-realtime-preview", "gpt-4o-mini-realtime-preview"]
|
||||
OPENAI_REALTIME_MODELS = ["gpt-realtime-2"]
|
||||
OPENAI_REALTIME_VOICES = [
|
||||
"alloy",
|
||||
"ash",
|
||||
|
|
@ -304,22 +334,27 @@ OPENAI_REALTIME_VOICES = [
|
|||
]
|
||||
|
||||
|
||||
# @register_service(ServiceType.REALTIME)
|
||||
# class OpenAIRealtimeLLMConfiguration(BaseLLMConfiguration):
|
||||
# provider: Literal[ServiceProviders.OPENAI_REALTIME] = (
|
||||
# ServiceProviders.OPENAI_REALTIME
|
||||
# )
|
||||
# model: str = Field(
|
||||
# default="gpt-4o-realtime-preview",
|
||||
# json_schema_extra={
|
||||
# "examples": OPENAI_REALTIME_MODELS,
|
||||
# "allow_custom_input": True,
|
||||
# },
|
||||
# )
|
||||
# voice: str = Field(
|
||||
# default="alloy",
|
||||
# json_schema_extra={"examples": OPENAI_REALTIME_VOICES},
|
||||
# )
|
||||
@register_service(ServiceType.REALTIME)
|
||||
class OpenAIRealtimeLLMConfiguration(BaseLLMConfiguration):
|
||||
provider: Literal[ServiceProviders.OPENAI_REALTIME] = (
|
||||
ServiceProviders.OPENAI_REALTIME
|
||||
)
|
||||
model: str = Field(
|
||||
default="gpt-realtime-2",
|
||||
description="OpenAI realtime (speech-to-speech) model.",
|
||||
json_schema_extra={
|
||||
"examples": OPENAI_REALTIME_MODELS,
|
||||
"allow_custom_input": True,
|
||||
},
|
||||
)
|
||||
voice: str = Field(
|
||||
default="alloy",
|
||||
description="Voice the model speaks in.",
|
||||
json_schema_extra={
|
||||
"examples": OPENAI_REALTIME_VOICES,
|
||||
"allow_custom_input": True,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
GOOGLE_REALTIME_MODELS = ["gemini-3.1-flash-live-preview"]
|
||||
|
|
@ -360,6 +395,7 @@ class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration):
|
|||
)
|
||||
model: str = Field(
|
||||
default="gemini-3.1-flash-live-preview",
|
||||
description="Gemini Live model on Google AI Studio (not Vertex).",
|
||||
json_schema_extra={
|
||||
"examples": GOOGLE_REALTIME_MODELS,
|
||||
"allow_custom_input": True,
|
||||
|
|
@ -367,6 +403,7 @@ class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration):
|
|||
)
|
||||
voice: str = Field(
|
||||
default="Puck",
|
||||
description="Voice the model speaks in.",
|
||||
json_schema_extra={
|
||||
"examples": GOOGLE_REALTIME_VOICES,
|
||||
"allow_custom_input": True,
|
||||
|
|
@ -374,6 +411,7 @@ class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration):
|
|||
)
|
||||
language: str = Field(
|
||||
default="en",
|
||||
description="ISO 639-1 language code.",
|
||||
json_schema_extra={
|
||||
"examples": GOOGLE_REALTIME_LANGUAGES,
|
||||
"allow_custom_input": True,
|
||||
|
|
@ -381,9 +419,68 @@ class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration):
|
|||
)
|
||||
|
||||
|
||||
GOOGLE_VERTEX_REALTIME_MODELS = [
|
||||
"google/gemini-live-2.5-flash-native-audio",
|
||||
]
|
||||
GOOGLE_VERTEX_REALTIME_VOICES = GOOGLE_REALTIME_VOICES
|
||||
GOOGLE_VERTEX_REALTIME_LANGUAGES = GOOGLE_REALTIME_LANGUAGES
|
||||
|
||||
|
||||
@register_service(ServiceType.REALTIME)
|
||||
class GoogleVertexRealtimeLLMConfiguration(BaseLLMConfiguration):
|
||||
provider: Literal[ServiceProviders.GOOGLE_VERTEX_REALTIME] = (
|
||||
ServiceProviders.GOOGLE_VERTEX_REALTIME
|
||||
)
|
||||
model: str = Field(
|
||||
default="google/gemini-live-2.5-flash-native-audio",
|
||||
description="Vertex AI publisher/model identifier.",
|
||||
json_schema_extra={
|
||||
"examples": GOOGLE_VERTEX_REALTIME_MODELS,
|
||||
"allow_custom_input": True,
|
||||
},
|
||||
)
|
||||
voice: str = Field(
|
||||
default="Charon",
|
||||
description="Voice the model speaks in.",
|
||||
json_schema_extra={
|
||||
"examples": GOOGLE_VERTEX_REALTIME_VOICES,
|
||||
"allow_custom_input": True,
|
||||
},
|
||||
)
|
||||
language: str = Field(
|
||||
default="en",
|
||||
description="BCP-47 language code (e.g. 'en-US').",
|
||||
json_schema_extra={
|
||||
"examples": GOOGLE_VERTEX_REALTIME_LANGUAGES,
|
||||
"allow_custom_input": True,
|
||||
},
|
||||
)
|
||||
project_id: str = Field(description="Google Cloud project ID for Vertex AI.")
|
||||
location: str = Field(
|
||||
default="us-east4",
|
||||
description="GCP region for the Vertex AI endpoint (e.g. 'us-east4').",
|
||||
)
|
||||
credentials: str | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"Paste the entire service-account JSON file contents. If omitted, "
|
||||
"falls back to Application Default Credentials (ADC)."
|
||||
),
|
||||
json_schema_extra={"multiline": True},
|
||||
)
|
||||
api_key: str | list[str] | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"Not used for Vertex AI — authentication is via the service account "
|
||||
"in `credentials` (or ADC). Leave blank."
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
REALTIME_PROVIDERS = {
|
||||
ServiceProviders.OPENAI_REALTIME.value,
|
||||
ServiceProviders.GOOGLE_REALTIME.value,
|
||||
ServiceProviders.GOOGLE_VERTEX_REALTIME.value,
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -403,8 +500,9 @@ LLMConfig = Annotated[
|
|||
|
||||
RealtimeConfig = Annotated[
|
||||
Union[
|
||||
# OpenAIRealtimeLLMConfiguration,
|
||||
OpenAIRealtimeLLMConfiguration,
|
||||
GoogleRealtimeLLMConfiguration,
|
||||
GoogleVertexRealtimeLLMConfiguration,
|
||||
],
|
||||
Field(discriminator="provider"),
|
||||
]
|
||||
|
|
@ -415,7 +513,10 @@ RealtimeConfig = Annotated[
|
|||
@register_tts
|
||||
class DeepgramTTSConfiguration(BaseServiceConfiguration):
|
||||
provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
|
||||
voice: str = "aura-2-helena-en"
|
||||
voice: str = Field(
|
||||
default="aura-2-helena-en",
|
||||
description="Deepgram voice ID (model is inferred from the 'aura-N' prefix).",
|
||||
)
|
||||
|
||||
@computed_field
|
||||
@property
|
||||
|
|
@ -437,10 +538,14 @@ ELEVENLABS_TTS_MODELS = ["eleven_flash_v2_5"]
|
|||
@register_tts
|
||||
class ElevenlabsTTSConfiguration(BaseServiceConfiguration):
|
||||
provider: Literal[ServiceProviders.ELEVENLABS] = ServiceProviders.ELEVENLABS
|
||||
voice: str = "21m00Tcm4TlvDq8ikWAM" # Rachel voice ID
|
||||
speed: float = Field(default=1.0, ge=0.1, le=2.0, description="Speed of the voice")
|
||||
voice: str = Field(
|
||||
default="21m00Tcm4TlvDq8ikWAM",
|
||||
description="ElevenLabs voice ID from your Voice Library.",
|
||||
)
|
||||
speed: float = Field(default=1.0, ge=0.1, le=2.0, description="Speed of the voice.")
|
||||
model: str = Field(
|
||||
default="eleven_flash_v2_5",
|
||||
description="ElevenLabs TTS model.",
|
||||
json_schema_extra={"examples": ELEVENLABS_TTS_MODELS},
|
||||
)
|
||||
base_url: str = Field(
|
||||
|
|
@ -460,9 +565,14 @@ OPENAI_TTS_MODELS = ["gpt-4o-mini-tts"]
|
|||
class OpenAITTSService(BaseTTSConfiguration):
|
||||
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
|
||||
model: str = Field(
|
||||
default="gpt-4o-mini-tts", json_schema_extra={"examples": OPENAI_TTS_MODELS}
|
||||
default="gpt-4o-mini-tts",
|
||||
description="OpenAI TTS model.",
|
||||
json_schema_extra={"examples": OPENAI_TTS_MODELS},
|
||||
)
|
||||
voice: str = Field(
|
||||
default="alloy",
|
||||
description="OpenAI TTS voice name.",
|
||||
)
|
||||
voice: str = "alloy"
|
||||
|
||||
|
||||
DOGRAH_TTS_MODELS = ["default"]
|
||||
|
|
@ -472,10 +582,15 @@ DOGRAH_TTS_MODELS = ["default"]
|
|||
class DograhTTSService(BaseTTSConfiguration):
|
||||
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
|
||||
model: str = Field(
|
||||
default="default", json_schema_extra={"examples": DOGRAH_TTS_MODELS}
|
||||
default="default",
|
||||
description="Dograh TTS tier.",
|
||||
json_schema_extra={"examples": DOGRAH_TTS_MODELS},
|
||||
)
|
||||
voice: str = "default"
|
||||
speed: float = Field(default=1.0, ge=0.5, le=2.0, description="Speed of the voice")
|
||||
voice: str = Field(
|
||||
default="default",
|
||||
description="Voice preset.",
|
||||
)
|
||||
speed: float = Field(default=1.0, ge=0.5, le=2.0, description="Speed of the voice.")
|
||||
|
||||
|
||||
CARTESIA_TTS_MODELS = ["sonic-3"]
|
||||
|
|
@ -485,15 +600,20 @@ CARTESIA_TTS_MODELS = ["sonic-3"]
|
|||
class CartesiaTTSConfiguration(BaseTTSConfiguration):
|
||||
provider: Literal[ServiceProviders.CARTESIA] = ServiceProviders.CARTESIA
|
||||
model: str = Field(
|
||||
default="sonic-3", json_schema_extra={"examples": CARTESIA_TTS_MODELS}
|
||||
default="sonic-3",
|
||||
description="Cartesia TTS model.",
|
||||
json_schema_extra={"examples": CARTESIA_TTS_MODELS},
|
||||
)
|
||||
voice: str = Field(default="3faa81ae-d3d8-4ab1-9e44-e50e46d33c30")
|
||||
speed: float = Field(default=1.0, ge=0.6, le=1.5, description="Speed of the voice")
|
||||
voice: str = Field(
|
||||
default="3faa81ae-d3d8-4ab1-9e44-e50e46d33c30",
|
||||
description="Cartesia voice UUID from your Cartesia dashboard.",
|
||||
)
|
||||
speed: float = Field(default=1.0, ge=0.6, le=1.5, description="Speed of the voice.")
|
||||
volume: float = Field(
|
||||
default=1.0,
|
||||
ge=0.5,
|
||||
le=2.0,
|
||||
description="Volume multiplier for generated speech",
|
||||
description="Volume multiplier for generated speech.",
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -568,10 +688,13 @@ SARVAM_LANGUAGES = [
|
|||
class SarvamTTSConfiguration(BaseTTSConfiguration):
|
||||
provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
|
||||
model: str = Field(
|
||||
default="bulbul:v2", json_schema_extra={"examples": SARVAM_TTS_MODELS}
|
||||
default="bulbul:v2",
|
||||
description="Sarvam TTS model (voice list depends on this).",
|
||||
json_schema_extra={"examples": SARVAM_TTS_MODELS},
|
||||
)
|
||||
voice: str = Field(
|
||||
default="anushka",
|
||||
description="Sarvam voice name; must match the selected model's voice list.",
|
||||
json_schema_extra={
|
||||
"examples": SARVAM_V2_VOICES,
|
||||
"model_options": {
|
||||
|
|
@ -581,7 +704,9 @@ class SarvamTTSConfiguration(BaseTTSConfiguration):
|
|||
},
|
||||
)
|
||||
language: str = Field(
|
||||
default="hi-IN", json_schema_extra={"examples": SARVAM_LANGUAGES}
|
||||
default="hi-IN",
|
||||
description="BCP-47 Indian-language code (e.g. hi-IN, en-IN).",
|
||||
json_schema_extra={"examples": SARVAM_LANGUAGES},
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -592,10 +717,12 @@ CAMB_TTS_MODELS = ["mars-flash", "mars-pro", "mars-instruct"]
|
|||
class CambTTSConfiguration(BaseTTSConfiguration):
|
||||
provider: Literal[ServiceProviders.CAMB] = ServiceProviders.CAMB
|
||||
model: str = Field(
|
||||
default="mars-flash", json_schema_extra={"examples": CAMB_TTS_MODELS}
|
||||
default="mars-flash",
|
||||
description="Camb.ai TTS model.",
|
||||
json_schema_extra={"examples": CAMB_TTS_MODELS},
|
||||
)
|
||||
voice: str = Field(default="147320", description="Camb.ai voice ID")
|
||||
language: str = Field(default="en-us", description="BCP-47 language code")
|
||||
voice: str = Field(default="147320", description="Camb.ai voice ID.")
|
||||
language: str = Field(default="en-us", description="BCP-47 language code.")
|
||||
|
||||
|
||||
RIME_TTS_MODELS = ["arcana", "mistv3", "mistv2", "mist"]
|
||||
|
|
@ -607,17 +734,19 @@ class RimeTTSConfiguration(BaseTTSConfiguration):
|
|||
provider: Literal[ServiceProviders.RIME] = ServiceProviders.RIME
|
||||
model: str = Field(
|
||||
default="arcana",
|
||||
description="Rime TTS model.",
|
||||
json_schema_extra={"examples": RIME_TTS_MODELS, "allow_custom_input": True},
|
||||
)
|
||||
voice: str = Field(
|
||||
default="celeste",
|
||||
description="Rime voice ID",
|
||||
description="Rime voice ID.",
|
||||
)
|
||||
speed: float = Field(
|
||||
default=1.0, ge=0.5, le=2.0, description="Speech speed multiplier"
|
||||
default=1.0, ge=0.5, le=2.0, description="Speech speed multiplier."
|
||||
)
|
||||
language: str = Field(
|
||||
default="en",
|
||||
description="ISO 639-1 language code.",
|
||||
json_schema_extra={"examples": RIME_TTS_LANGUAGES, "allow_custom_input": True},
|
||||
)
|
||||
|
||||
|
|
@ -630,6 +759,7 @@ class SpeachesTTSConfiguration(BaseTTSConfiguration):
|
|||
provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
|
||||
model: str = Field(
|
||||
default="kokoro",
|
||||
description="Model name as served by your TTS endpoint (e.g. Kokoro-FastAPI).",
|
||||
json_schema_extra={
|
||||
"examples": SPEACHES_TTS_MODELS,
|
||||
"allow_custom_input": True,
|
||||
|
|
@ -638,16 +768,19 @@ class SpeachesTTSConfiguration(BaseTTSConfiguration):
|
|||
voice: str = Field(
|
||||
default="af_heart",
|
||||
json_schema_extra={"allow_custom_input": True},
|
||||
description="Voice ID for the TTS engine",
|
||||
description="Voice ID for the TTS engine.",
|
||||
)
|
||||
base_url: str = Field(
|
||||
default="http://localhost:8000/v1",
|
||||
description="OpenAI-compatible TTS endpoint (Kokoro-FastAPI, etc.)",
|
||||
description="OpenAI-compatible TTS endpoint (Kokoro-FastAPI, etc.).",
|
||||
)
|
||||
speed: float = Field(
|
||||
default=1.0, ge=0.25, le=4.0, description="Speech speed (0.25 to 4.0)"
|
||||
default=1.0, ge=0.25, le=4.0, description="Speech speed (0.25 to 4.0)."
|
||||
)
|
||||
api_key: str | list[str] | None = Field(
|
||||
default=None,
|
||||
description="Usually not required for self-hosted TTS. Leave blank unless enforced.",
|
||||
)
|
||||
api_key: str | list[str] | None = Field(default=None)
|
||||
|
||||
|
||||
TTSConfig = Annotated[
|
||||
|
|
@ -758,10 +891,13 @@ DEEPGRAM_LANGUAGES = [
|
|||
class DeepgramSTTConfiguration(BaseSTTConfiguration):
|
||||
provider: Literal[ServiceProviders.DEEPGRAM] = ServiceProviders.DEEPGRAM
|
||||
model: str = Field(
|
||||
default="nova-3-general", json_schema_extra={"examples": DEEPGRAM_STT_MODELS}
|
||||
default="nova-3-general",
|
||||
description="Deepgram STT model.",
|
||||
json_schema_extra={"examples": DEEPGRAM_STT_MODELS},
|
||||
)
|
||||
language: str = Field(
|
||||
default="multi",
|
||||
description="Language code; 'multi' enables auto-detect (Nova-3 only).",
|
||||
json_schema_extra={
|
||||
"examples": DEEPGRAM_LANGUAGES,
|
||||
"model_options": {
|
||||
|
|
@ -779,7 +915,9 @@ CARTESIA_STT_MODELS = ["ink-whisper"]
|
|||
class CartesiaSTTConfiguration(BaseSTTConfiguration):
|
||||
provider: Literal[ServiceProviders.CARTESIA] = ServiceProviders.CARTESIA
|
||||
model: str = Field(
|
||||
default="ink-whisper", json_schema_extra={"examples": CARTESIA_STT_MODELS}
|
||||
default="ink-whisper",
|
||||
description="Cartesia STT model.",
|
||||
json_schema_extra={"examples": CARTESIA_STT_MODELS},
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -790,7 +928,9 @@ OPENAI_STT_MODELS = ["gpt-4o-transcribe"]
|
|||
class OpenAISTTConfiguration(BaseSTTConfiguration):
|
||||
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
|
||||
model: str = Field(
|
||||
default="gpt-4o-transcribe", json_schema_extra={"examples": OPENAI_STT_MODELS}
|
||||
default="gpt-4o-transcribe",
|
||||
description="OpenAI transcription model.",
|
||||
json_schema_extra={"examples": OPENAI_STT_MODELS},
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -803,10 +943,14 @@ DOGRAH_STT_LANGUAGES = DEEPGRAM_LANGUAGES
|
|||
class DograhSTTService(BaseSTTConfiguration):
|
||||
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
|
||||
model: str = Field(
|
||||
default="default", json_schema_extra={"examples": DOGRAH_STT_MODELS}
|
||||
default="default",
|
||||
description="Dograh STT tier.",
|
||||
json_schema_extra={"examples": DOGRAH_STT_MODELS},
|
||||
)
|
||||
language: str = Field(
|
||||
default="multi", json_schema_extra={"examples": DOGRAH_STT_LANGUAGES}
|
||||
default="multi",
|
||||
description="Language code; use 'multi' for auto-detect.",
|
||||
json_schema_extra={"examples": DOGRAH_STT_LANGUAGES},
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -818,10 +962,14 @@ SARVAM_STT_MODELS = ["saarika:v2.5", "saaras:v2"]
|
|||
class SarvamSTTConfiguration(BaseSTTConfiguration):
|
||||
provider: Literal[ServiceProviders.SARVAM] = ServiceProviders.SARVAM
|
||||
model: str = Field(
|
||||
default="saarika:v2.5", json_schema_extra={"examples": SARVAM_STT_MODELS}
|
||||
default="saarika:v2.5",
|
||||
description="Sarvam STT model.",
|
||||
json_schema_extra={"examples": SARVAM_STT_MODELS},
|
||||
)
|
||||
language: str = Field(
|
||||
default="hi-IN", json_schema_extra={"examples": SARVAM_LANGUAGES}
|
||||
default="hi-IN",
|
||||
description="BCP-47 Indian-language code.",
|
||||
json_schema_extra={"examples": SARVAM_LANGUAGES},
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -857,10 +1005,13 @@ SPEECHMATICS_STT_LANGUAGES = [
|
|||
class SpeechmaticsSTTConfiguration(BaseSTTConfiguration):
|
||||
provider: Literal[ServiceProviders.SPEECHMATICS] = ServiceProviders.SPEECHMATICS
|
||||
model: str = Field(
|
||||
default="enhanced", description="Operating point: standard or enhanced"
|
||||
default="enhanced",
|
||||
description="Speechmatics operating point: 'standard' or 'enhanced'.",
|
||||
)
|
||||
language: str = Field(
|
||||
default="en", json_schema_extra={"examples": SPEECHMATICS_STT_LANGUAGES}
|
||||
default="en",
|
||||
description="ISO 639-1 language code.",
|
||||
json_schema_extra={"examples": SPEECHMATICS_STT_LANGUAGES},
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -876,6 +1027,7 @@ class SpeachesSTTConfiguration(BaseSTTConfiguration):
|
|||
provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
|
||||
model: str = Field(
|
||||
default="Systran/faster-distil-whisper-small.en",
|
||||
description="Whisper model identifier as served by your STT endpoint.",
|
||||
json_schema_extra={
|
||||
"examples": SPEACHES_STT_MODELS,
|
||||
"allow_custom_input": True,
|
||||
|
|
@ -883,6 +1035,7 @@ class SpeachesSTTConfiguration(BaseSTTConfiguration):
|
|||
)
|
||||
language: str = Field(
|
||||
default="en",
|
||||
description="ISO 639-1 language code.",
|
||||
json_schema_extra={
|
||||
"examples": SPEACHES_STT_LANGUAGES,
|
||||
"allow_custom_input": True,
|
||||
|
|
@ -890,9 +1043,12 @@ class SpeachesSTTConfiguration(BaseSTTConfiguration):
|
|||
)
|
||||
base_url: str = Field(
|
||||
default="http://localhost:8000/v1",
|
||||
description="OpenAI-compatible STT endpoint (Speaches, etc.)",
|
||||
description="OpenAI-compatible STT endpoint (Speaches, etc.).",
|
||||
)
|
||||
api_key: str | list[str] | None = Field(
|
||||
default=None,
|
||||
description="Usually not required for self-hosted STT. Leave blank unless enforced.",
|
||||
)
|
||||
api_key: str | list[str] | None = Field(default=None)
|
||||
|
||||
|
||||
ASSEMBLYAI_STT_MODELS = ["u3-rt-pro"]
|
||||
|
|
@ -904,10 +1060,12 @@ class AssemblyAISTTConfiguration(BaseSTTConfiguration):
|
|||
provider: Literal[ServiceProviders.ASSEMBLYAI] = ServiceProviders.ASSEMBLYAI
|
||||
model: str = Field(
|
||||
default="u3-rt-pro",
|
||||
description="AssemblyAI realtime STT model.",
|
||||
json_schema_extra={"examples": ASSEMBLYAI_STT_MODELS},
|
||||
)
|
||||
language: str = Field(
|
||||
default="en",
|
||||
description="ISO 639-1 language code.",
|
||||
json_schema_extra={"examples": ASSEMBLYAI_STT_LANGUAGES},
|
||||
)
|
||||
|
||||
|
|
@ -1022,10 +1180,12 @@ class GladiaSTTConfiguration(BaseSTTConfiguration):
|
|||
provider: Literal[ServiceProviders.GLADIA] = ServiceProviders.GLADIA
|
||||
model: str = Field(
|
||||
default="solaria-1",
|
||||
description="Gladia STT model.",
|
||||
json_schema_extra={"examples": GLADIA_STT_MODELS},
|
||||
)
|
||||
language: str = Field(
|
||||
default="en",
|
||||
description="ISO 639-1 language code.",
|
||||
json_schema_extra={"examples": GLADIA_STT_LANGUAGES},
|
||||
)
|
||||
|
||||
|
|
@ -1055,6 +1215,7 @@ class OpenAIEmbeddingsConfiguration(BaseEmbeddingsConfiguration):
|
|||
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
|
||||
model: str = Field(
|
||||
default="text-embedding-3-small",
|
||||
description="OpenAI embedding model.",
|
||||
json_schema_extra={"examples": OPENAI_EMBEDDING_MODELS},
|
||||
)
|
||||
|
||||
|
|
@ -1067,10 +1228,14 @@ class OpenRouterEmbeddingsConfiguration(BaseEmbeddingsConfiguration):
|
|||
provider: Literal[ServiceProviders.OPENROUTER] = ServiceProviders.OPENROUTER
|
||||
model: str = Field(
|
||||
default="openai/text-embedding-3-small",
|
||||
description="OpenRouter-hosted embedding model slug.",
|
||||
json_schema_extra={"examples": OPENROUTER_EMBEDDING_MODELS},
|
||||
)
|
||||
|
||||
base_url: str = Field(default="https://openrouter.ai/api/v1")
|
||||
base_url: str = Field(
|
||||
default="https://openrouter.ai/api/v1",
|
||||
description="Override only if proxying OpenRouter through your own gateway.",
|
||||
)
|
||||
|
||||
|
||||
EmbeddingsConfig = Annotated[
|
||||
|
|
|
|||
9
api/services/pipecat/realtime/__init__.py
Normal file
9
api/services/pipecat/realtime/__init__.py
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
"""Dograh-specific subclasses of pipecat realtime LLM services.
|
||||
|
||||
Each subclass wires Dograh engine integration quirks (user-mute gating,
|
||||
TTSSpeakFrame greeting trigger, node-transition handling, function-call
|
||||
deferral, etc.) onto the corresponding pipecat realtime service.
|
||||
|
||||
The pipecat fork's services stay close to upstream — Dograh behavior lives
|
||||
here.
|
||||
"""
|
||||
237
api/services/pipecat/realtime/gemini_live.py
Normal file
237
api/services/pipecat/realtime/gemini_live.py
Normal file
|
|
@ -0,0 +1,237 @@
|
|||
"""Dograh subclass of pipecat's Gemini Live LLM service.
|
||||
|
||||
Layers Dograh engine integration quirks onto upstream-pristine
|
||||
:class:`GeminiLiveLLMService`:
|
||||
|
||||
- **Deferred connect.** Connection is held back until ``system_instruction``
|
||||
is set via :meth:`_update_settings`, so pre-call-fetch template variables
|
||||
land before the live session opens.
|
||||
- **Reconnect on node transitions.** Gemini Live cannot update
|
||||
``system_instruction`` mid-session, so a setting change triggers a
|
||||
reconnect (deferred until the bot turn ends if currently responding).
|
||||
- **Function-call deferral.** Tool calls emitted mid-turn are queued and run
|
||||
when the bot stops speaking, to avoid racing the turn's audio.
|
||||
- **User-mute audio gating.** ``UserMuteStarted/StoppedFrame`` from the
|
||||
user aggregator gates whether incoming audio is forwarded to Gemini.
|
||||
- **TTSSpeakFrame as greeting trigger.** The engine queues a TTSSpeakFrame
|
||||
to kick off the first response after node setup; the service intercepts
|
||||
it and runs the initial-context path.
|
||||
- **Finalize-pending on transcriptions.** Marks the transcription emitted
|
||||
immediately after VAD-stop as finalized, distinguishing it from
|
||||
mid-turn partials.
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
BotStoppedSpeakingFrame,
|
||||
Frame,
|
||||
TranscriptionFrame,
|
||||
TTSSpeakFrame,
|
||||
UserMuteStartedFrame,
|
||||
UserMuteStoppedFrame,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.services.llm_service import FunctionCallFromLLM
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
from pipecat.utils.tracing.service_decorators import traced_gemini_live
|
||||
|
||||
|
||||
class DograhGeminiLiveLLMService(GeminiLiveLLMService):
|
||||
"""Gemini Live with Dograh engine integration quirks. See module docstring."""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
# User-mute state, driven by broadcast UserMute{Started,Stopped}Frames.
|
||||
# Audio is not forwarded to Gemini while muted.
|
||||
self._user_is_muted: bool = False
|
||||
# Guards initial-response triggering against double-firing across the
|
||||
# initial TTSSpeakFrame and any LLMContextFrame that may arrive.
|
||||
self._handled_initial_context: bool = False
|
||||
# When a system_instruction change arrives mid-bot-turn, the reconnect
|
||||
# is queued and drained when the turn ends.
|
||||
self._reconnect_pending: bool = False
|
||||
# Function calls emitted by Gemini mid-bot-turn are deferred here and
|
||||
# invoked when the turn ends, so they don't race the turn's audio.
|
||||
self._pending_function_calls: list[FunctionCallFromLLM] = []
|
||||
# Tracks whether the next transcription to arrive should be marked as
|
||||
# the finalized transcription for the current user turn.
|
||||
self._finalize_pending: bool = False
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Hooks from upstream GeminiLiveLLMService
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _should_connect_on_start(self) -> bool:
|
||||
# Hold the connection until the engine sets a system_instruction. This
|
||||
# lets pre-call fetch populate template variables first.
|
||||
return bool(self._settings.system_instruction)
|
||||
|
||||
async def _handle_changed_settings(self, changed: dict[str, Any]) -> set[str]:
|
||||
if "system_instruction" not in changed:
|
||||
return set()
|
||||
if not self._session:
|
||||
# First-time setting after deferred-connect.
|
||||
await self._connect()
|
||||
elif self._bot_is_responding:
|
||||
# Bot is mid-turn — drain the reconnect when it ends so we don't
|
||||
# cut the bot off mid-utterance.
|
||||
self._reconnect_pending = True
|
||||
else:
|
||||
await self._reconnect()
|
||||
return {"system_instruction"}
|
||||
|
||||
async def _run_or_defer_function_calls(
|
||||
self, function_calls_llm: list[FunctionCallFromLLM]
|
||||
):
|
||||
if self._bot_is_responding:
|
||||
# Latest batch wins; Gemini emits tool calls as one batch per
|
||||
# tool_call message, so this overwrite is intentional.
|
||||
self._pending_function_calls = function_calls_llm
|
||||
logger.debug(
|
||||
f"{self}: deferring {len(function_calls_llm)} function call(s) "
|
||||
"until bot turn ends"
|
||||
)
|
||||
return
|
||||
await super()._run_or_defer_function_calls(function_calls_llm)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# State-transition side effects
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _set_bot_is_responding(self, responding: bool):
|
||||
was_responding = self._bot_is_responding
|
||||
await super()._set_bot_is_responding(responding)
|
||||
if was_responding and not responding:
|
||||
await self._run_pending_function_calls()
|
||||
if self._reconnect_pending:
|
||||
self._reconnect_pending = False
|
||||
await self._reconnect()
|
||||
|
||||
async def _run_pending_function_calls(self):
|
||||
"""Run any function calls deferred during the bot's last turn."""
|
||||
if not self._pending_function_calls:
|
||||
return
|
||||
fcs = self._pending_function_calls
|
||||
self._pending_function_calls = []
|
||||
logger.debug(
|
||||
f"{self}: executing {len(fcs)} deferred function call(s) "
|
||||
"after bot turn ended"
|
||||
)
|
||||
await self.run_function_calls(fcs)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Frame handling: mute, TTSSpeakFrame, BotStoppedSpeakingFrame flush
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
if isinstance(frame, UserMuteStartedFrame):
|
||||
self._user_is_muted = True
|
||||
await self.push_frame(frame, direction)
|
||||
return
|
||||
if isinstance(frame, UserMuteStoppedFrame):
|
||||
self._user_is_muted = False
|
||||
await self.push_frame(frame, direction)
|
||||
return
|
||||
if isinstance(frame, TTSSpeakFrame):
|
||||
# Greeting trigger: the engine queues a TTSSpeakFrame to start the
|
||||
# bot's first turn after node setup. Gemini Live renders its own
|
||||
# audio, so we don't pass the frame through — we re-enter
|
||||
# _handle_context to kick off the initial response.
|
||||
if not self._handled_initial_context:
|
||||
await self._handle_context(self._context)
|
||||
else:
|
||||
logger.warning(
|
||||
f"{self}: TTSSpeakFrame after initial context already "
|
||||
"handled — Gemini Live owns audio generation, ignoring"
|
||||
)
|
||||
return
|
||||
if isinstance(frame, BotStoppedSpeakingFrame):
|
||||
# Belt-and-suspenders: the main drain happens in
|
||||
# _set_bot_is_responding(False), but if Gemini delays turn_complete
|
||||
# past the audible end of the turn, flushing here ensures pending
|
||||
# function calls fire promptly.
|
||||
await self._run_pending_function_calls()
|
||||
# Fall through to super for the actual push.
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
async def _send_user_audio(self, frame):
|
||||
if self._user_is_muted:
|
||||
return
|
||||
await super()._send_user_audio(frame)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Context lifecycle: Dograh pre-populates self._context via the engine,
|
||||
# so upstream's "first arrival === self._context is None" check doesn't
|
||||
# work. We gate on _handled_initial_context instead and skip the
|
||||
# init-instruction reconciliation (Dograh updates system_instruction at
|
||||
# runtime via _update_settings, not via init).
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _handle_context(self, context: LLMContext):
|
||||
if not self._handled_initial_context:
|
||||
self._handled_initial_context = True
|
||||
self._context = context
|
||||
await self._create_initial_response()
|
||||
else:
|
||||
self._context = context
|
||||
await self._process_completed_function_calls(send_new_results=True)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Session lifecycle: drop upstream's automatic reconnect-seed and
|
||||
# initial-context-seed paths. The TTSSpeakFrame trigger and the
|
||||
# function-call-result LLMContextFrame are the only paths that should
|
||||
# kick off bot turns in the Dograh flow.
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@traced_gemini_live(operation="llm_setup")
|
||||
async def _handle_session_ready(self, session):
|
||||
logger.debug(
|
||||
f"In _handle_session_ready self._run_llm_when_session_ready: {self._run_llm_when_session_ready}"
|
||||
)
|
||||
self._session = session
|
||||
self._ready_for_realtime_input = True
|
||||
if self._run_llm_when_session_ready:
|
||||
# Context arrived before session was ready — fulfil the queued
|
||||
# initial response now.
|
||||
self._run_llm_when_session_ready = False
|
||||
await self._create_initial_response()
|
||||
await self._drain_pending_tool_results()
|
||||
# Otherwise: no automatic seed. Reconnect after a session-resumption
|
||||
# update relies on the server-side restored state; reconnects without
|
||||
# a handle (e.g. node transitions before any handle was issued) are
|
||||
# followed by a function-call-result LLMContextFrame which feeds the
|
||||
# updated-context branch in _handle_context.
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Transcription: broadcast (so downstream voicemail detector and
|
||||
# logs buffer both see it) and set finalized= for turn-boundary
|
||||
# semantics.
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _handle_user_started_speaking(self, frame):
|
||||
await super()._handle_user_started_speaking(frame)
|
||||
# A new VAD start invalidates any pending finalize from a prior stop
|
||||
# that hasn't been paired with a transcription yet.
|
||||
self._finalize_pending = False
|
||||
|
||||
async def _handle_user_stopped_speaking(self, frame):
|
||||
await super()._handle_user_stopped_speaking(frame)
|
||||
self._finalize_pending = True
|
||||
|
||||
async def _push_user_transcription(self, text: str, result=None):
|
||||
await self._handle_user_transcription(text, True, self._settings.language)
|
||||
finalized = self._finalize_pending
|
||||
self._finalize_pending = False
|
||||
await self.broadcast_frame(
|
||||
TranscriptionFrame,
|
||||
text=text,
|
||||
user_id="",
|
||||
timestamp=time_now_iso8601(),
|
||||
result=result,
|
||||
finalized=finalized,
|
||||
)
|
||||
42
api/services/pipecat/realtime/gemini_live_vertex.py
Normal file
42
api/services/pipecat/realtime/gemini_live_vertex.py
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
"""Dograh subclass of pipecat's Gemini Live Vertex AI LLM service.
|
||||
|
||||
Diamond inheritance: combines the Dograh engine-integration overrides from
|
||||
:class:`DograhGeminiLiveLLMService` with the Vertex-specific tweaks from
|
||||
upstream's :class:`GeminiLiveVertexLLMService` (no history config,
|
||||
``NON_BLOCKING`` tools disabled, service-account credentials).
|
||||
|
||||
MRO::
|
||||
|
||||
DograhGeminiLiveVertexLLMService
|
||||
-> DograhGeminiLiveLLMService
|
||||
-> GeminiLiveVertexLLMService
|
||||
-> GeminiLiveLLMService
|
||||
-> LLMService
|
||||
-> ...
|
||||
"""
|
||||
|
||||
from api.services.pipecat.realtime.gemini_live import DograhGeminiLiveLLMService
|
||||
from pipecat.services.google.gemini_live.vertex.llm import (
|
||||
GeminiLiveVertexLLMService,
|
||||
)
|
||||
|
||||
|
||||
class DograhGeminiLiveVertexLLMService(
|
||||
DograhGeminiLiveLLMService,
|
||||
GeminiLiveVertexLLMService,
|
||||
):
|
||||
"""Vertex AI variant of Gemini Live with Dograh integration quirks."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
# Guard against MRO regressions: a future refactor that flips inheritance
|
||||
# order or breaks the diamond would silently bypass the Dograh overrides.
|
||||
_mro = DograhGeminiLiveVertexLLMService.__mro__
|
||||
assert _mro[1] is DograhGeminiLiveLLMService, (
|
||||
f"Expected DograhGeminiLiveLLMService at MRO[1], got {_mro[1]}"
|
||||
)
|
||||
assert _mro[2] is GeminiLiveVertexLLMService, (
|
||||
f"Expected GeminiLiveVertexLLMService at MRO[2], got {_mro[2]}"
|
||||
)
|
||||
del _mro
|
||||
274
api/services/pipecat/realtime/openai_realtime.py
Normal file
274
api/services/pipecat/realtime/openai_realtime.py
Normal file
|
|
@ -0,0 +1,274 @@
|
|||
"""Dograh subclass of pipecat's OpenAI Realtime LLM service.
|
||||
|
||||
Layers Dograh engine integration quirks onto upstream-pristine
|
||||
:class:`OpenAIRealtimeLLMService`. Substantially smaller than the Gemini
|
||||
subclass because OpenAI Realtime supports runtime ``session.update`` for
|
||||
both ``system_instruction`` and tools — no reconnect/defer-tool-call
|
||||
machinery needed.
|
||||
|
||||
Adds:
|
||||
|
||||
- **User-mute audio gating** via ``UserMuteStarted/StoppedFrame``.
|
||||
- **TTSSpeakFrame as initial-response trigger** so the engine's greeting
|
||||
flow kicks off the bot's first response.
|
||||
- **One-off LLMMessagesAppendFrame handling** for ephemeral realtime prompts
|
||||
like user-idle checks, without mutating Dograh's local ``LLMContext``.
|
||||
- **finalized=True on TranscriptionFrame** for parity with the Gemini
|
||||
service (every OpenAI transcription via the ``completed`` event is
|
||||
final by construction).
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
BotStartedSpeakingFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
Frame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMMessagesAppendFrame,
|
||||
TranscriptionFrame,
|
||||
TTSSpeakFrame,
|
||||
UserMuteStartedFrame,
|
||||
UserMuteStoppedFrame,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.llm_service import FunctionCallFromLLM
|
||||
from pipecat.services.openai.realtime import events
|
||||
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
|
||||
class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
|
||||
"""OpenAI Realtime with Dograh engine integration quirks. See module docstring."""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._user_is_muted: bool = False
|
||||
# Dograh pre-populates self._context via the engine before the first
|
||||
# LLMContextFrame arrives, so upstream's "first arrival means
|
||||
# self._context is None" check no longer works.
|
||||
self._handled_initial_context: bool = False
|
||||
# Track bot speech locally so tool calls can be deferred until the bot
|
||||
# has finished speaking, matching Dograh's Gemini Live behavior.
|
||||
self._bot_is_speaking: bool = False
|
||||
self._deferred_function_calls: list[FunctionCallFromLLM] = []
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Frame handling: mute, TTSSpeakFrame as greeting trigger
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
if isinstance(frame, UserMuteStartedFrame):
|
||||
self._user_is_muted = True
|
||||
await self.push_frame(frame, direction)
|
||||
return
|
||||
if isinstance(frame, UserMuteStoppedFrame):
|
||||
self._user_is_muted = False
|
||||
await self.push_frame(frame, direction)
|
||||
return
|
||||
if isinstance(frame, TTSSpeakFrame):
|
||||
# Greeting trigger: the engine queues a TTSSpeakFrame after node
|
||||
# setup. OpenAI Realtime renders its own audio, so we don't pass
|
||||
# the frame to TTS. Route through _handle_context so the initial
|
||||
# response and later tool-result turns share the same context
|
||||
# lifecycle even when Dograh has already pre-populated self._context.
|
||||
if not self._handled_initial_context:
|
||||
await self._handle_context(self._context)
|
||||
else:
|
||||
logger.warning(
|
||||
f"{self}: TTSSpeakFrame after initial context already "
|
||||
"handled — OpenAI Realtime owns audio generation, ignoring"
|
||||
)
|
||||
# Don't forward the frame; the audio path is owned by the realtime
|
||||
# service itself.
|
||||
return
|
||||
if isinstance(frame, LLMMessagesAppendFrame):
|
||||
await self._handle_messages_append(frame)
|
||||
return
|
||||
if isinstance(frame, BotStartedSpeakingFrame):
|
||||
self._bot_is_speaking = True
|
||||
elif isinstance(frame, BotStoppedSpeakingFrame):
|
||||
self._bot_is_speaking = False
|
||||
await self._run_pending_function_calls()
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
async def _handle_messages_append(self, frame: LLMMessagesAppendFrame):
|
||||
"""Consume a one-off append frame without mutating the local LLMContext."""
|
||||
if self._disconnecting:
|
||||
return
|
||||
|
||||
if not self._api_session_ready:
|
||||
if frame.run_llm:
|
||||
logger.debug(
|
||||
f"{self}: LLMMessagesAppendFrame received before session ready; "
|
||||
"deferring response until the session is initialized"
|
||||
)
|
||||
self._run_llm_when_api_session_ready = True
|
||||
return
|
||||
|
||||
appended_any = False
|
||||
for message in frame.messages:
|
||||
item = self._message_to_conversation_item(message)
|
||||
if item is None:
|
||||
continue
|
||||
evt = events.ConversationItemCreateEvent(item=item)
|
||||
self._messages_added_manually[evt.item.id] = True
|
||||
await self.send_client_event(evt)
|
||||
appended_any = True
|
||||
|
||||
if frame.run_llm and appended_any:
|
||||
await self._send_manual_response_create()
|
||||
|
||||
async def _handle_context(self, context: LLMContext):
|
||||
if not self._handled_initial_context:
|
||||
if context is None:
|
||||
logger.warning(
|
||||
f"{self}: received initial context trigger before context was set"
|
||||
)
|
||||
return
|
||||
self._handled_initial_context = True
|
||||
self._context = context
|
||||
await self._create_response()
|
||||
else:
|
||||
self._context = context
|
||||
await self._process_completed_function_calls(send_new_results=True)
|
||||
|
||||
async def _send_user_audio(self, frame):
|
||||
if self._user_is_muted:
|
||||
return
|
||||
await super()._send_user_audio(frame)
|
||||
|
||||
def _message_to_conversation_item(
|
||||
self, message: dict[str, Any]
|
||||
) -> events.ConversationItem | None:
|
||||
if not isinstance(message, dict):
|
||||
logger.warning(
|
||||
f"{self}: skipping unsupported appended message payload {message!r}"
|
||||
)
|
||||
return None
|
||||
|
||||
role = message.get("role")
|
||||
if role not in {"user", "system", "developer"}:
|
||||
logger.warning(
|
||||
f"{self}: skipping unsupported appended message role {role!r}"
|
||||
)
|
||||
return None
|
||||
|
||||
text = self._extract_text_content(message.get("content"))
|
||||
if not text:
|
||||
logger.warning(
|
||||
f"{self}: skipping appended message with unsupported content {message!r}"
|
||||
)
|
||||
return None
|
||||
|
||||
item_role = "system" if role in {"system", "developer"} else "user"
|
||||
return events.ConversationItem(
|
||||
type="message",
|
||||
role=item_role,
|
||||
content=[events.ItemContent(type="input_text", text=text)],
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _extract_text_content(content: Any) -> str | None:
|
||||
if isinstance(content, str):
|
||||
return content
|
||||
if isinstance(content, list):
|
||||
parts: list[str] = []
|
||||
for part in content:
|
||||
if not isinstance(part, dict):
|
||||
return None
|
||||
if part.get("type") != "text":
|
||||
return None
|
||||
text = part.get("text")
|
||||
if not isinstance(text, str):
|
||||
return None
|
||||
parts.append(text)
|
||||
return "\n".join(parts) if parts else None
|
||||
return None
|
||||
|
||||
async def _send_manual_response_create(self):
|
||||
"""Trigger inference after manually appending conversation items."""
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
await self.start_processing_metrics()
|
||||
await self.start_ttfb_metrics()
|
||||
await self.send_client_event(
|
||||
events.ResponseCreateEvent(
|
||||
response=events.ResponseProperties(
|
||||
output_modalities=self._get_enabled_modalities()
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
async def _run_pending_function_calls(self):
|
||||
if not self._deferred_function_calls:
|
||||
return
|
||||
function_calls = self._deferred_function_calls
|
||||
self._deferred_function_calls = []
|
||||
logger.debug(
|
||||
f"{self}: executing {len(function_calls)} deferred function call(s) "
|
||||
"after bot turn ended"
|
||||
)
|
||||
await self.run_function_calls(function_calls)
|
||||
|
||||
async def _handle_evt_function_call_arguments_done(self, evt):
|
||||
"""Process or defer tool calls until the bot finishes speaking."""
|
||||
try:
|
||||
args = json.loads(evt.arguments)
|
||||
|
||||
function_call_item = self._pending_function_calls.get(evt.call_id)
|
||||
if function_call_item:
|
||||
del self._pending_function_calls[evt.call_id]
|
||||
|
||||
function_calls = [
|
||||
FunctionCallFromLLM(
|
||||
context=self._context,
|
||||
tool_call_id=evt.call_id,
|
||||
function_name=function_call_item.name,
|
||||
arguments=args,
|
||||
)
|
||||
]
|
||||
|
||||
if self._bot_is_speaking:
|
||||
self._deferred_function_calls.extend(function_calls)
|
||||
logger.debug(
|
||||
f"{self}: deferring function call {function_call_item.name} "
|
||||
"until bot stops speaking"
|
||||
)
|
||||
else:
|
||||
await self.run_function_calls(function_calls)
|
||||
logger.debug(f"Processed function call: {function_call_item.name}")
|
||||
else:
|
||||
logger.warning(
|
||||
f"No tracked function call found for call_id: {evt.call_id}"
|
||||
)
|
||||
logger.warning(
|
||||
f"Available pending calls: {list(self._pending_function_calls.keys())}"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to process function call arguments: {e}")
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Transcription: broadcast with finalized=True for parity with the
|
||||
# Gemini service (consumers that check `finalized` should see True
|
||||
# for every completed-transcription event from OpenAI).
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def handle_evt_input_audio_transcription_completed(self, evt):
|
||||
await self._call_event_handler(
|
||||
"on_conversation_item_updated", evt.item_id, None
|
||||
)
|
||||
await self.broadcast_frame(
|
||||
TranscriptionFrame,
|
||||
text=evt.transcript,
|
||||
user_id="",
|
||||
timestamp=time_now_iso8601(),
|
||||
result=evt,
|
||||
finalized=True,
|
||||
)
|
||||
await self._handle_user_transcription(evt.transcript, True, Language.EN)
|
||||
|
|
@ -86,6 +86,43 @@ from pipecat.utils.run_context import set_current_org_id, set_current_run_id
|
|||
ensure_tracing()
|
||||
|
||||
|
||||
def _create_realtime_user_turn_config(provider: str):
|
||||
"""Return user turn strategies and optional local VAD for realtime providers."""
|
||||
if provider in {
|
||||
ServiceProviders.GOOGLE_REALTIME.value,
|
||||
ServiceProviders.GOOGLE_VERTEX_REALTIME.value,
|
||||
}:
|
||||
# Let Gemini Live own barge-in via its server-side VAD, but keep local
|
||||
# Silero VAD for early user-turn start and speaking-state tracking.
|
||||
return (
|
||||
UserTurnStrategies(
|
||||
start=[VADUserTurnStartStrategy(enable_interruptions=False)],
|
||||
stop=[SpeechTimeoutUserTurnStopStrategy()],
|
||||
),
|
||||
SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
)
|
||||
|
||||
if provider == ServiceProviders.OPENAI_REALTIME.value:
|
||||
# OpenAI Realtime already emits speaking-state frames and interruption
|
||||
# events from the provider, so the aggregator should follow those
|
||||
# external signals rather than run its own local VAD.
|
||||
return (
|
||||
UserTurnStrategies(
|
||||
start=[ExternalUserTurnStartStrategy()],
|
||||
stop=[ExternalUserTurnStopStrategy()],
|
||||
),
|
||||
None,
|
||||
)
|
||||
|
||||
return (
|
||||
UserTurnStrategies(
|
||||
start=[VADUserTurnStartStrategy()],
|
||||
stop=[SpeechTimeoutUserTurnStopStrategy()],
|
||||
),
|
||||
SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
)
|
||||
|
||||
|
||||
async def run_pipeline_telephony(
|
||||
websocket,
|
||||
*,
|
||||
|
|
@ -138,6 +175,20 @@ async def run_pipeline_telephony(
|
|||
"telephony_configuration_id"
|
||||
)
|
||||
|
||||
# Resolve effective user config here so the transport can tune its
|
||||
# bot-stopped-speaking fallback based on is_realtime; pass the resolved
|
||||
# values into _run_pipeline so it doesn't fetch them again.
|
||||
from api.services.configuration.resolve import resolve_effective_config
|
||||
|
||||
user_config = await db_client.get_user_configurations(user_id)
|
||||
run_configs = (
|
||||
(workflow_run.definition.workflow_configurations or {}) if workflow_run else {}
|
||||
)
|
||||
user_config = resolve_effective_config(
|
||||
user_config, run_configs.get("model_overrides")
|
||||
)
|
||||
is_realtime = bool(user_config.is_realtime and user_config.realtime is not None)
|
||||
|
||||
spec = telephony_registry.get(provider_name)
|
||||
audio_config = create_audio_config(provider_name)
|
||||
|
||||
|
|
@ -148,6 +199,7 @@ async def run_pipeline_telephony(
|
|||
workflow.organization_id,
|
||||
ambient_noise_config=ambient_noise_config,
|
||||
telephony_configuration_id=telephony_configuration_id,
|
||||
is_realtime=is_realtime,
|
||||
**transport_kwargs,
|
||||
)
|
||||
|
||||
|
|
@ -158,6 +210,8 @@ async def run_pipeline_telephony(
|
|||
workflow_run_id,
|
||||
user_id,
|
||||
audio_config=audio_config,
|
||||
workflow_run=workflow_run,
|
||||
resolved_user_config=user_config,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
|
|
@ -198,11 +252,27 @@ async def run_pipeline_smallwebrtc(
|
|||
# Create audio configuration for WebRTC
|
||||
audio_config = create_audio_config(WorkflowRunMode.SMALLWEBRTC.value)
|
||||
|
||||
# Resolve workflow_run + effective user_config here so the transport can
|
||||
# tune its bot-stopped-speaking fallback based on is_realtime. _run_pipeline
|
||||
# reuses these via kwargs so we don't fetch twice.
|
||||
from api.services.configuration.resolve import resolve_effective_config
|
||||
|
||||
workflow_run = await db_client.get_workflow_run(workflow_run_id, user_id)
|
||||
user_config = await db_client.get_user_configurations(user_id)
|
||||
run_configs = (
|
||||
(workflow_run.definition.workflow_configurations or {}) if workflow_run else {}
|
||||
)
|
||||
user_config = resolve_effective_config(
|
||||
user_config, run_configs.get("model_overrides")
|
||||
)
|
||||
is_realtime = bool(user_config.is_realtime and user_config.realtime is not None)
|
||||
|
||||
transport = await create_webrtc_transport(
|
||||
webrtc_connection,
|
||||
workflow_run_id,
|
||||
audio_config,
|
||||
ambient_noise_config,
|
||||
is_realtime=is_realtime,
|
||||
)
|
||||
await _run_pipeline(
|
||||
transport,
|
||||
|
|
@ -212,6 +282,8 @@ async def run_pipeline_smallwebrtc(
|
|||
call_context_vars=call_context_vars,
|
||||
audio_config=audio_config,
|
||||
user_provider_id=user_provider_id,
|
||||
workflow_run=workflow_run,
|
||||
resolved_user_config=user_config,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -223,6 +295,8 @@ async def _run_pipeline(
|
|||
call_context_vars: dict = {},
|
||||
audio_config: AudioConfig = None,
|
||||
user_provider_id: str | None = None,
|
||||
workflow_run=None,
|
||||
resolved_user_config=None,
|
||||
) -> None:
|
||||
"""
|
||||
Run the pipeline with the given transport and configuration
|
||||
|
|
@ -232,9 +306,12 @@ async def _run_pipeline(
|
|||
workflow_id: The ID of the workflow
|
||||
workflow_run_id: The ID of the workflow run
|
||||
user_id: The ID of the user
|
||||
mode: The mode of the pipeline (twilio or smallwebrtc)
|
||||
workflow_run: Pre-fetched workflow run row. Fetched here if None.
|
||||
resolved_user_config: User configuration with model_overrides already
|
||||
applied. Fetched and resolved here if None.
|
||||
"""
|
||||
workflow_run = await db_client.get_workflow_run(workflow_run_id, user_id)
|
||||
if workflow_run is None:
|
||||
workflow_run = await db_client.get_workflow_run(workflow_run_id, user_id)
|
||||
|
||||
# If the workflow run is already completed, we don't need to run it again
|
||||
if workflow_run.is_completed:
|
||||
|
|
@ -246,9 +323,6 @@ async def _run_pipeline(
|
|||
if call_context_vars:
|
||||
merged_call_context_vars = {**merged_call_context_vars, **call_context_vars}
|
||||
|
||||
# Get user configuration
|
||||
user_config = await db_client.get_user_configurations(user_id)
|
||||
|
||||
# Get workflow for metadata (name, organization_id, call_disposition_codes)
|
||||
workflow = await db_client.get_workflow(workflow_id, user_id)
|
||||
if not workflow:
|
||||
|
|
@ -286,11 +360,17 @@ async def _run_pipeline(
|
|||
term.strip() for term in dictionary.split(",") if term.strip()
|
||||
]
|
||||
|
||||
# Resolve model overrides from the version onto global user config
|
||||
from api.services.configuration.resolve import resolve_effective_config
|
||||
# Resolve model overrides from the version onto global user config (skip
|
||||
# when the caller already resolved it).
|
||||
if resolved_user_config is None:
|
||||
from api.services.configuration.resolve import resolve_effective_config
|
||||
|
||||
model_overrides = run_configs.get("model_overrides")
|
||||
user_config = resolve_effective_config(user_config, model_overrides)
|
||||
user_config = await db_client.get_user_configurations(user_id)
|
||||
user_config = resolve_effective_config(
|
||||
user_config, run_configs.get("model_overrides")
|
||||
)
|
||||
else:
|
||||
user_config = resolved_user_config
|
||||
|
||||
# Detect realtime mode (speech-to-speech services like OpenAI Realtime, Gemini Live)
|
||||
is_realtime = user_config.is_realtime and user_config.realtime is not None
|
||||
|
|
@ -453,23 +533,20 @@ async def _run_pipeline(
|
|||
correct_aggregation_callback=engine.create_aggregation_correction_callback(),
|
||||
)
|
||||
|
||||
user_mute_strategies = [
|
||||
MuteUntilFirstBotCompleteUserMuteStrategy(),
|
||||
FunctionCallUserMuteStrategy(),
|
||||
CallbackUserMuteStrategy(should_mute_callback=engine.should_mute_user),
|
||||
]
|
||||
user_vad_analyzer = SileroVADAnalyzer(params=VADParams(stop_secs=0.2))
|
||||
|
||||
# Configure turn strategies based on STT provider, model, and workflow configuration
|
||||
if is_realtime:
|
||||
# Realtime services do server-side turn detection for response generation,
|
||||
# but we still need a client-side stop strategy so the user aggregator emits
|
||||
# UserStoppedSpeakingFrame. Without it, downstream consumers (e.g. voicemail
|
||||
# detector) and Gemini Live's _finalize_pending flag never see a turn end.
|
||||
user_turn_strategies = UserTurnStrategies(
|
||||
start=[VADUserTurnStartStrategy()],
|
||||
stop=[SpeechTimeoutUserTurnStopStrategy()],
|
||||
# Realtime services still need user-turn tracking even when the model
|
||||
# itself owns speech generation and interruption behavior.
|
||||
user_turn_strategies, user_vad_analyzer = _create_realtime_user_turn_config(
|
||||
user_config.realtime.provider
|
||||
)
|
||||
|
||||
# Lets not start the pipeline as muted for Realtime
|
||||
# - CallbackUserMuteStrategy: mutes based on engine's _mute_pipeline state
|
||||
user_mute_strategies = [
|
||||
FunctionCallUserMuteStrategy(),
|
||||
CallbackUserMuteStrategy(should_mute_callback=engine.should_mute_user),
|
||||
]
|
||||
else:
|
||||
# Deepgram Flux uses external turn detection (VAD + External start/stop)
|
||||
# Other models use configurable turn detection strategy
|
||||
|
|
@ -510,18 +587,11 @@ async def _run_pipeline(
|
|||
stop=[SpeechTimeoutUserTurnStopStrategy()],
|
||||
)
|
||||
|
||||
# - CallbackUserMuteStrategy: mutes based on engine's _mute_pipeline state
|
||||
user_mute_strategies = [
|
||||
MuteUntilFirstBotCompleteUserMuteStrategy(),
|
||||
FunctionCallUserMuteStrategy(),
|
||||
CallbackUserMuteStrategy(should_mute_callback=engine.should_mute_user),
|
||||
]
|
||||
|
||||
user_params = LLMUserAggregatorParams(
|
||||
user_turn_strategies=user_turn_strategies,
|
||||
user_mute_strategies=user_mute_strategies,
|
||||
user_idle_timeout=max_user_idle_timeout,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
vad_analyzer=user_vad_analyzer,
|
||||
)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context, assistant_params=assistant_params, user_params=user_params
|
||||
|
|
@ -562,15 +632,14 @@ async def _run_pipeline(
|
|||
)
|
||||
engine.set_fetch_recording_audio(fetch_audio)
|
||||
|
||||
# Voicemail detection works in both modes. In realtime mode the detector sits
|
||||
# after the realtime LLM and consumes the TranscriptionFrames it broadcasts;
|
||||
# the LLM gate / TTS gate are not used (the realtime LLM responds to audio
|
||||
# directly, not LLMContextFrames), so on detection we rely on
|
||||
# end_call_with_reason to drop the call.
|
||||
voicemail_config = (workflow.workflow_configurations or {}).get(
|
||||
"voicemail_detection", {}
|
||||
)
|
||||
if voicemail_config.get("enabled", False):
|
||||
if is_realtime and voicemail_config.get("enabled", False):
|
||||
logger.info(
|
||||
f"Disabling voicemail detection for realtime workflow run {workflow_run_id}"
|
||||
)
|
||||
if voicemail_config.get("enabled", False) and not is_realtime:
|
||||
logger.info(f"Voicemail detection enabled for workflow run {workflow_run_id}")
|
||||
# Create a separate LLM instance for the voicemail sub-pipeline
|
||||
# (can't share with main pipeline as it would mess up frame linking)
|
||||
|
|
|
|||
|
|
@ -493,6 +493,9 @@ def create_realtime_llm_service(user_config, audio_config: "AudioConfig"):
|
|||
)
|
||||
|
||||
if provider == ServiceProviders.OPENAI_REALTIME.value:
|
||||
from api.services.pipecat.realtime.openai_realtime import (
|
||||
DograhOpenAIRealtimeLLMService,
|
||||
)
|
||||
from pipecat.services.openai.realtime.events import (
|
||||
AudioConfiguration,
|
||||
AudioInput,
|
||||
|
|
@ -500,11 +503,10 @@ def create_realtime_llm_service(user_config, audio_config: "AudioConfig"):
|
|||
InputAudioTranscription,
|
||||
SessionProperties,
|
||||
)
|
||||
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
|
||||
|
||||
return OpenAIRealtimeLLMService(
|
||||
return DograhOpenAIRealtimeLLMService(
|
||||
api_key=api_key,
|
||||
settings=OpenAIRealtimeLLMService.Settings(
|
||||
settings=DograhOpenAIRealtimeLLMService.Settings(
|
||||
model=model,
|
||||
session_properties=SessionProperties(
|
||||
audio=AudioConfiguration(
|
||||
|
|
@ -519,7 +521,9 @@ def create_realtime_llm_service(user_config, audio_config: "AudioConfig"):
|
|||
),
|
||||
)
|
||||
elif provider == ServiceProviders.GOOGLE_REALTIME.value:
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from api.services.pipecat.realtime.gemini_live import (
|
||||
DograhGeminiLiveLLMService,
|
||||
)
|
||||
|
||||
# Gemini Live enables input/output audio transcription by default
|
||||
# in its _connect() method — no need to configure it explicitly.
|
||||
|
|
@ -529,9 +533,30 @@ def create_realtime_llm_service(user_config, audio_config: "AudioConfig"):
|
|||
}
|
||||
if language:
|
||||
settings_kwargs["language"] = language
|
||||
return GeminiLiveLLMService(
|
||||
return DograhGeminiLiveLLMService(
|
||||
api_key=api_key,
|
||||
settings=GeminiLiveLLMService.Settings(**settings_kwargs),
|
||||
settings=DograhGeminiLiveLLMService.Settings(**settings_kwargs),
|
||||
)
|
||||
elif provider == ServiceProviders.GOOGLE_VERTEX_REALTIME.value:
|
||||
from api.services.pipecat.realtime.gemini_live_vertex import (
|
||||
DograhGeminiLiveVertexLLMService,
|
||||
)
|
||||
|
||||
project_id = getattr(realtime_config, "project_id", None)
|
||||
location = getattr(realtime_config, "location", None) or "us-east4"
|
||||
credentials = getattr(realtime_config, "credentials", None)
|
||||
|
||||
settings_kwargs = {
|
||||
"model": model,
|
||||
"voice": voice or "Charon",
|
||||
}
|
||||
if language:
|
||||
settings_kwargs["language"] = language
|
||||
return DograhGeminiLiveVertexLLMService(
|
||||
credentials=credentials,
|
||||
project_id=project_id,
|
||||
location=location,
|
||||
settings=DograhGeminiLiveVertexLLMService.Settings(**settings_kwargs),
|
||||
)
|
||||
else:
|
||||
raise HTTPException(
|
||||
|
|
|
|||
25
api/services/pipecat/transport_params.py
Normal file
25
api/services/pipecat/transport_params.py
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
"""Shared helpers for tuning pipecat ``TransportParams`` per run mode.
|
||||
|
||||
These live outside ``transport_setup.py`` (which is non-telephony only) so
|
||||
that both the WebRTC factory there and the telephony provider factories
|
||||
under ``api.services.telephony.providers/<name>/transport.py`` can call
|
||||
into the same place.
|
||||
"""
|
||||
|
||||
# Realtime (speech-to-speech) LLMs don't emit ``TTSStoppedFrame``, so the
|
||||
# bot-stopped-speaking signal relies on the output-queue-drained fallback.
|
||||
# The default 3s tail leaves a long gap before the assistant aggregator
|
||||
# closes its turn; 0.5s keeps the conversation snappy without cutting into
|
||||
# the bot's own audio (audio chunks arrive far more frequently than this).
|
||||
REALTIME_BOT_VAD_STOP_SECS = 0.5
|
||||
|
||||
|
||||
def realtime_param_overrides(is_realtime: bool) -> dict:
|
||||
"""Return kwargs to splat into ``TransportParams`` for the given run mode.
|
||||
|
||||
Currently this only tunes ``bot_vad_stop_secs``; new realtime-specific
|
||||
knobs should be added here so each transport stays a thin shim.
|
||||
"""
|
||||
if not is_realtime:
|
||||
return {}
|
||||
return {"bot_vad_stop_secs": REALTIME_BOT_VAD_STOP_SECS}
|
||||
|
|
@ -6,6 +6,7 @@ This module hosts only the shared, non-telephony transports (WebRTC).
|
|||
|
||||
from api.services.pipecat.audio_config import AudioConfig
|
||||
from api.services.pipecat.audio_mixer import build_audio_out_mixer
|
||||
from api.services.pipecat.transport_params import realtime_param_overrides
|
||||
from pipecat.transports.base_transport import TransportParams
|
||||
from pipecat.transports.smallwebrtc.connection import SmallWebRTCConnection
|
||||
from pipecat.transports.smallwebrtc.transport import SmallWebRTCTransport
|
||||
|
|
@ -16,6 +17,7 @@ async def create_webrtc_transport(
|
|||
workflow_run_id: int,
|
||||
audio_config: AudioConfig,
|
||||
ambient_noise_config: dict | None = None,
|
||||
is_realtime: bool = False,
|
||||
):
|
||||
"""Create a transport for WebRTC connections."""
|
||||
mixer = await build_audio_out_mixer(
|
||||
|
|
@ -30,5 +32,6 @@ async def create_webrtc_transport(
|
|||
audio_in_sample_rate=audio_config.transport_in_sample_rate,
|
||||
audio_out_sample_rate=audio_config.transport_out_sample_rate,
|
||||
audio_out_mixer=mixer,
|
||||
**realtime_param_overrides(is_realtime),
|
||||
),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -27,7 +27,9 @@ async def _fetch_telephony_cost(workflow_run) -> dict | None:
|
|||
logger.warning("Workflow not found for workflow run")
|
||||
raise Exception("Workflow not found")
|
||||
|
||||
provider = await get_telephony_provider_for_run(workflow_run, workflow.organization_id)
|
||||
provider = await get_telephony_provider_for_run(
|
||||
workflow_run, workflow.organization_id
|
||||
)
|
||||
call_cost_info = await provider.get_call_cost(call_id)
|
||||
|
||||
if call_cost_info.get("status") == "error":
|
||||
|
|
|
|||
|
|
@ -13,10 +13,16 @@ Business Logic → TelephonyProvider (Interface) → Concrete Provider (Twilio,
|
|||
### Using the Provider in Code
|
||||
|
||||
```python
|
||||
from api.services.telephony.factory import get_telephony_provider
|
||||
from api.services.telephony.factory import (
|
||||
get_default_telephony_provider,
|
||||
get_telephony_provider_by_id,
|
||||
)
|
||||
|
||||
# Get provider based on organization config
|
||||
provider = await get_telephony_provider(organization_id)
|
||||
# Get the org's default outbound provider
|
||||
provider = await get_default_telephony_provider(organization_id)
|
||||
|
||||
# Or resolve a specific telephony configuration row
|
||||
provider = await get_telephony_provider_by_id(config_id, organization_id)
|
||||
|
||||
# Initiate a call
|
||||
result = await provider.initiate_call(
|
||||
|
|
@ -47,7 +53,7 @@ See the [Custom Provider Guide](https://docs.dograh.com/integrations/telephony/c
|
|||
|
||||
Quick checklist:
|
||||
1. Create `providers/your_provider.py` implementing `TelephonyProvider`
|
||||
2. Update `factory.py` to include your provider
|
||||
2. Register the package in `providers/__init__.py` and add its schemas to `api/schemas/telephony_config.py`
|
||||
3. Write unit tests
|
||||
4. Update documentation
|
||||
|
||||
|
|
@ -107,7 +113,7 @@ class MockProvider(TelephonyProvider):
|
|||
# Implement other required methods...
|
||||
|
||||
# In tests
|
||||
@patch('api.services.telephony.factory.get_telephony_provider')
|
||||
@patch('api.services.telephony.factory.get_default_telephony_provider')
|
||||
async def test_call_initiation(mock_get_provider):
|
||||
mock_get_provider.return_value = MockProvider()
|
||||
# Test your business logic
|
||||
|
|
@ -141,8 +147,8 @@ await service.initiate_call(...)
|
|||
|
||||
New code:
|
||||
```python
|
||||
from api.services.telephony.factory import get_telephony_provider
|
||||
provider = await get_telephony_provider(org_id)
|
||||
from api.services.telephony.factory import get_default_telephony_provider
|
||||
provider = await get_default_telephony_provider(org_id)
|
||||
await provider.initiate_call(...)
|
||||
```
|
||||
|
||||
|
|
@ -164,4 +170,4 @@ await provider.initiate_call(...)
|
|||
- [User Documentation](https://docs.dograh.com/integrations/telephony/overview)
|
||||
- [Twilio Integration](https://docs.dograh.com/integrations/telephony/twilio)
|
||||
- [Custom Providers](https://docs.dograh.com/integrations/telephony/custom)
|
||||
- [Webhooks Guide](https://docs.dograh.com/integrations/telephony/webhooks)
|
||||
- [Webhooks Guide](https://docs.dograh.com/integrations/telephony/webhooks)
|
||||
|
|
|
|||
|
|
@ -6,9 +6,7 @@ resolution paths exist:
|
|||
* by config id — the canonical path used by outbound (test calls, campaigns,
|
||||
API triggers) and by the websocket transport once a workflow run has
|
||||
``initial_context.telephony_configuration_id`` stamped on it.
|
||||
* by org default — used as a fallback when no specific config is requested
|
||||
(e.g. the legacy ``/telephony-config`` endpoint, the back-compat
|
||||
``get_telephony_provider(organization_id)`` shim).
|
||||
* by org default — used as a fallback when no specific config is requested.
|
||||
* for inbound — given a detected provider and an account-id from the webhook,
|
||||
iterate the org's configs of that provider and return the one whose stored
|
||||
account-id credential matches.
|
||||
|
|
@ -196,28 +194,6 @@ async def load_credentials_for_transport(
|
|||
return config
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Back-compat shims
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def load_telephony_config(organization_id: int) -> Dict[str, Any]:
|
||||
"""Deprecated: returns the org's default config.
|
||||
|
||||
Existing callers that don't carry a config id continue to work via this
|
||||
shim. New code should pass an explicit telephony_configuration_id."""
|
||||
return await load_default_telephony_config(organization_id)
|
||||
|
||||
|
||||
async def get_telephony_provider(organization_id: int) -> TelephonyProvider:
|
||||
"""Deprecated: returns a provider for the org's default config.
|
||||
|
||||
See ``load_telephony_config`` above. New code should call
|
||||
``get_telephony_provider_by_id`` with the resolved config id.
|
||||
"""
|
||||
return await get_default_telephony_provider(organization_id)
|
||||
|
||||
|
||||
async def get_all_telephony_providers() -> List[Type[TelephonyProvider]]:
|
||||
"""All registered provider classes — used by inbound webhook detection."""
|
||||
return [spec.provider_cls for spec in registry.all_specs()]
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ If you find yourself editing anything else, re-read the registry plumbing first:
|
|||
|
||||
| Want to change... | Source of truth |
|
||||
| --- | --- |
|
||||
| Outbound provider lookup | `factory.get_telephony_provider*` reads `registry.get(name).provider_cls` |
|
||||
| Outbound provider lookup | `factory.get_default_telephony_provider`, `get_telephony_provider_by_id`, and `get_telephony_provider_for_run` read `registry.get(name).provider_cls` |
|
||||
| Stored credentials → constructor dict | `ProviderSpec.config_loader` |
|
||||
| Audio sample rate / VAD rate | `ProviderSpec.transport_sample_rate` (full `AudioConfig` is built in `pipecat/audio_config.py::create_audio_config`) |
|
||||
| Which transport runs in `run_pipeline_telephony` | `ProviderSpec.transport_factory` |
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ from pipecat.transports.websocket.fastapi import (
|
|||
|
||||
from api.services.pipecat.audio_config import AudioConfig
|
||||
from api.services.pipecat.audio_mixer import build_audio_out_mixer
|
||||
from api.services.pipecat.transport_params import realtime_param_overrides
|
||||
from api.services.telephony.factory import load_credentials_for_transport
|
||||
|
||||
from .serializers import AsteriskFrameSerializer
|
||||
|
|
@ -22,6 +23,7 @@ async def create_transport(
|
|||
*,
|
||||
ambient_noise_config: dict | None = None,
|
||||
telephony_configuration_id: int | None = None,
|
||||
is_realtime: bool = False,
|
||||
channel_id: str,
|
||||
):
|
||||
"""Create a transport for Asterisk ARI connections."""
|
||||
|
|
@ -65,5 +67,6 @@ async def create_transport(
|
|||
audio_out_sample_rate=audio_config.transport_out_sample_rate,
|
||||
audio_out_mixer=mixer,
|
||||
serializer=serializer,
|
||||
**realtime_param_overrides(is_realtime),
|
||||
),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ from pipecat.transports.websocket.fastapi import (
|
|||
|
||||
from api.services.pipecat.audio_config import AudioConfig
|
||||
from api.services.pipecat.audio_mixer import build_audio_out_mixer
|
||||
from api.services.pipecat.transport_params import realtime_param_overrides
|
||||
from api.services.telephony.factory import load_credentials_for_transport
|
||||
|
||||
from .serializers import CloudonixFrameSerializer
|
||||
|
|
@ -22,6 +23,7 @@ async def create_transport(
|
|||
*,
|
||||
ambient_noise_config: dict | None = None,
|
||||
telephony_configuration_id: int | None = None,
|
||||
is_realtime: bool = False,
|
||||
call_id: str,
|
||||
stream_sid: str,
|
||||
bearer_token: str | None = None,
|
||||
|
|
@ -69,5 +71,6 @@ async def create_transport(
|
|||
audio_out_mixer=mixer,
|
||||
serializer=serializer,
|
||||
audio_out_10ms_chunks=2,
|
||||
**realtime_param_overrides(is_realtime),
|
||||
),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ from pipecat.transports.websocket.fastapi import (
|
|||
|
||||
from api.services.pipecat.audio_config import AudioConfig
|
||||
from api.services.pipecat.audio_mixer import build_audio_out_mixer
|
||||
from api.services.pipecat.transport_params import realtime_param_overrides
|
||||
from api.services.telephony.factory import load_credentials_for_transport
|
||||
|
||||
from .serializers import PlivoFrameSerializer
|
||||
|
|
@ -21,6 +22,7 @@ async def create_transport(
|
|||
*,
|
||||
ambient_noise_config: dict | None = None,
|
||||
telephony_configuration_id: int | None = None,
|
||||
is_realtime: bool = False,
|
||||
stream_id: str,
|
||||
call_id: str,
|
||||
):
|
||||
|
|
@ -61,5 +63,6 @@ async def create_transport(
|
|||
audio_out_sample_rate=audio_config.transport_out_sample_rate,
|
||||
audio_out_mixer=mixer,
|
||||
serializer=serializer,
|
||||
**realtime_param_overrides(is_realtime),
|
||||
),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ from pipecat.transports.websocket.fastapi import (
|
|||
|
||||
from api.services.pipecat.audio_config import AudioConfig
|
||||
from api.services.pipecat.audio_mixer import build_audio_out_mixer
|
||||
from api.services.pipecat.transport_params import realtime_param_overrides
|
||||
from api.services.telephony.factory import load_credentials_for_transport
|
||||
|
||||
from .serializers import TelnyxFrameSerializer
|
||||
|
|
@ -22,6 +23,7 @@ async def create_transport(
|
|||
*,
|
||||
ambient_noise_config: dict | None = None,
|
||||
telephony_configuration_id: int | None = None,
|
||||
is_realtime: bool = False,
|
||||
stream_id: str,
|
||||
call_control_id: str,
|
||||
encoding: str = "PCMU",
|
||||
|
|
@ -64,5 +66,6 @@ async def create_transport(
|
|||
audio_out_sample_rate=audio_config.transport_out_sample_rate,
|
||||
audio_out_mixer=mixer,
|
||||
serializer=serializer,
|
||||
**realtime_param_overrides(is_realtime),
|
||||
),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ from pipecat.transports.websocket.fastapi import (
|
|||
|
||||
from api.services.pipecat.audio_config import AudioConfig
|
||||
from api.services.pipecat.audio_mixer import build_audio_out_mixer
|
||||
from api.services.pipecat.transport_params import realtime_param_overrides
|
||||
from api.services.telephony.factory import load_credentials_for_transport
|
||||
|
||||
from .serializers import TwilioFrameSerializer
|
||||
|
|
@ -22,6 +23,7 @@ async def create_transport(
|
|||
*,
|
||||
ambient_noise_config: dict | None = None,
|
||||
telephony_configuration_id: int | None = None,
|
||||
is_realtime: bool = False,
|
||||
stream_sid: str,
|
||||
call_sid: str,
|
||||
):
|
||||
|
|
@ -60,5 +62,6 @@ async def create_transport(
|
|||
audio_out_sample_rate=audio_config.transport_out_sample_rate,
|
||||
audio_out_mixer=mixer,
|
||||
serializer=serializer,
|
||||
**realtime_param_overrides(is_realtime),
|
||||
),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ from pipecat.transports.websocket.fastapi import (
|
|||
|
||||
from api.services.pipecat.audio_config import AudioConfig
|
||||
from api.services.pipecat.audio_mixer import build_audio_out_mixer
|
||||
from api.services.pipecat.transport_params import realtime_param_overrides
|
||||
from api.services.telephony.factory import load_credentials_for_transport
|
||||
|
||||
from .serializers import VobizFrameSerializer
|
||||
|
|
@ -27,6 +28,7 @@ async def create_transport(
|
|||
*,
|
||||
ambient_noise_config: dict | None = None,
|
||||
telephony_configuration_id: int | None = None,
|
||||
is_realtime: bool = False,
|
||||
stream_id: str,
|
||||
call_id: str,
|
||||
):
|
||||
|
|
@ -72,6 +74,7 @@ async def create_transport(
|
|||
audio_out_sample_rate=audio_config.transport_out_sample_rate,
|
||||
audio_out_mixer=mixer,
|
||||
serializer=serializer,
|
||||
**realtime_param_overrides(is_realtime),
|
||||
),
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ from pipecat.transports.websocket.fastapi import (
|
|||
|
||||
from api.services.pipecat.audio_config import AudioConfig
|
||||
from api.services.pipecat.audio_mixer import build_audio_out_mixer
|
||||
from api.services.pipecat.transport_params import realtime_param_overrides
|
||||
from api.services.telephony.factory import load_credentials_for_transport
|
||||
|
||||
from .serializers import VonageFrameSerializer
|
||||
|
|
@ -20,6 +21,7 @@ async def create_transport(
|
|||
*,
|
||||
ambient_noise_config: dict | None = None,
|
||||
telephony_configuration_id: int | None = None,
|
||||
is_realtime: bool = False,
|
||||
call_uuid: str,
|
||||
):
|
||||
"""Create a transport for Vonage connections."""
|
||||
|
|
@ -59,5 +61,6 @@ async def create_transport(
|
|||
audio_out_sample_rate=audio_config.transport_out_sample_rate,
|
||||
audio_out_mixer=mixer,
|
||||
serializer=serializer,
|
||||
**realtime_param_overrides(is_realtime),
|
||||
),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -81,8 +81,8 @@ class ProviderSpec:
|
|||
stored config JSON and as the WorkflowRunMode value.
|
||||
provider_cls: The TelephonyProvider subclass.
|
||||
config_loader: Normalizes raw stored config into the dict shape the
|
||||
provider constructor expects. Replaces the if/elif chain in the
|
||||
old factory.load_telephony_config().
|
||||
provider constructor expects. Replaces the old factory if/elif
|
||||
chain.
|
||||
transport_factory: Async callable that creates the pipecat transport
|
||||
for an accepted WebSocket. Provider-specific kwargs (stream_sid,
|
||||
call_sid, etc.) are forwarded as ``**kwargs``.
|
||||
|
|
|
|||
|
|
@ -321,6 +321,7 @@ class CustomToolManager:
|
|||
tool=tool,
|
||||
arguments=function_call_params.arguments,
|
||||
call_context_vars=self._engine._call_context_vars,
|
||||
gathered_context_vars=self._engine._gathered_context,
|
||||
organization_id=await self.get_organization_id(),
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
"""Custom tool execution for user-defined HTTP API tools."""
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
|
|
@ -8,6 +9,7 @@ from loguru import logger
|
|||
|
||||
from api.db import db_client
|
||||
from api.utils.credential_auth import build_auth_header
|
||||
from api.utils.template_renderer import render_template
|
||||
|
||||
# Map tool parameter types to JSON schema types
|
||||
TYPE_MAP = {
|
||||
|
|
@ -84,10 +86,94 @@ def tool_to_function_schema(tool: Any) -> Dict[str, Any]:
|
|||
}
|
||||
|
||||
|
||||
def _coerce_parameter_value(value: Any, param_type: str) -> Any:
|
||||
"""Coerce a rendered preset parameter into the configured JSON type."""
|
||||
|
||||
if value is None:
|
||||
return None
|
||||
|
||||
if param_type == "string":
|
||||
if isinstance(value, str):
|
||||
return value
|
||||
if isinstance(value, (dict, list)):
|
||||
return json.dumps(value)
|
||||
return str(value)
|
||||
|
||||
if param_type == "number":
|
||||
if isinstance(value, (int, float)) and not isinstance(value, bool):
|
||||
return value
|
||||
|
||||
rendered = str(value).strip()
|
||||
if rendered == "":
|
||||
return None
|
||||
|
||||
if re.fullmatch(r"[-+]?\d+", rendered):
|
||||
return int(rendered)
|
||||
|
||||
return float(rendered)
|
||||
|
||||
if param_type == "boolean":
|
||||
if isinstance(value, bool):
|
||||
return value
|
||||
|
||||
if isinstance(value, (int, float)):
|
||||
return bool(value)
|
||||
|
||||
rendered = str(value).strip().lower()
|
||||
if rendered in {"true", "1", "yes", "y", "on"}:
|
||||
return True
|
||||
if rendered in {"false", "0", "no", "n", "off"}:
|
||||
return False
|
||||
|
||||
raise ValueError(f"Cannot convert '{value}' to boolean")
|
||||
|
||||
return value
|
||||
|
||||
|
||||
def _resolve_preset_parameters(
|
||||
config: Dict[str, Any],
|
||||
call_context_vars: Optional[Dict[str, Any]],
|
||||
gathered_context_vars: Optional[Dict[str, Any]],
|
||||
) -> Dict[str, Any]:
|
||||
"""Resolve fixed/template-backed parameters before executing the HTTP request."""
|
||||
|
||||
preset_parameters = config.get("preset_parameters", []) or []
|
||||
if not preset_parameters:
|
||||
return {}
|
||||
|
||||
initial_context = dict(call_context_vars or {})
|
||||
render_context: Dict[str, Any] = {
|
||||
**initial_context,
|
||||
"initial_context": initial_context,
|
||||
"gathered_context": dict(gathered_context_vars or {}),
|
||||
}
|
||||
|
||||
resolved: Dict[str, Any] = {}
|
||||
for param in preset_parameters:
|
||||
param_name = (param.get("name") or "").strip()
|
||||
if not param_name:
|
||||
continue
|
||||
|
||||
rendered = render_template(param.get("value_template", ""), render_context)
|
||||
if rendered in (None, ""):
|
||||
if param.get("required", True):
|
||||
raise ValueError(
|
||||
f"Preset parameter '{param_name}' resolved to an empty value"
|
||||
)
|
||||
continue
|
||||
|
||||
resolved[param_name] = _coerce_parameter_value(
|
||||
rendered, param.get("type", "string")
|
||||
)
|
||||
|
||||
return resolved
|
||||
|
||||
|
||||
async def execute_http_tool(
|
||||
tool: Any,
|
||||
arguments: Dict[str, Any],
|
||||
call_context_vars: Optional[Dict[str, Any]] = None,
|
||||
gathered_context_vars: Optional[Dict[str, Any]] = None,
|
||||
organization_id: Optional[int] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Execute an HTTP API tool.
|
||||
|
|
@ -95,7 +181,8 @@ async def execute_http_tool(
|
|||
Args:
|
||||
tool: ToolModel instance
|
||||
arguments: Arguments passed by the LLM (parameter name -> value)
|
||||
call_context_vars: Additional context variables from the call (unused for now)
|
||||
call_context_vars: Initial context variables available at runtime
|
||||
gathered_context_vars: Variables extracted during the conversation
|
||||
organization_id: Organization ID for credential lookup
|
||||
|
||||
Returns:
|
||||
|
|
@ -133,17 +220,31 @@ async def execute_http_tool(
|
|||
timeout_ms = config.get("timeout_ms", 5000)
|
||||
timeout_seconds = timeout_ms / 1000
|
||||
|
||||
try:
|
||||
preset_arguments = _resolve_preset_parameters(
|
||||
config, call_context_vars, gathered_context_vars
|
||||
)
|
||||
except ValueError as e:
|
||||
logger.error(f"Custom tool '{tool.name}' preset parameter error: {e}")
|
||||
return {"status": "error", "error": str(e)}
|
||||
|
||||
resolved_arguments = {**(arguments or {}), **preset_arguments}
|
||||
|
||||
# Build request: JSON body for POST/PUT/PATCH, query params for GET/DELETE
|
||||
body = None
|
||||
params = None
|
||||
if method in ("POST", "PUT", "PATCH"):
|
||||
body = arguments
|
||||
elif method in ("GET", "DELETE") and arguments:
|
||||
params = arguments
|
||||
body = resolved_arguments
|
||||
elif method in ("GET", "DELETE") and resolved_arguments:
|
||||
params = resolved_arguments
|
||||
|
||||
logger.info(
|
||||
f"Executing custom tool '{tool.name}' ({tool.tool_uuid}): {method} {url}"
|
||||
)
|
||||
if preset_arguments:
|
||||
logger.debug(
|
||||
f"Resolved preset parameters for '{tool.name}': {list(preset_arguments.keys())}"
|
||||
)
|
||||
logger.debug(f"Request body: {body}, params: {params}")
|
||||
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -140,6 +140,45 @@ class TestToolToFunctionSchema:
|
|||
assert "duration_minutes" in required
|
||||
assert "is_priority" not in required
|
||||
|
||||
def test_preset_parameters_are_not_exposed_to_llm_schema(self):
|
||||
"""Test that preset parameters are injected at runtime, not shown to the LLM."""
|
||||
tool = MockToolModel(
|
||||
tool_uuid="test-uuid-preset",
|
||||
name="Lookup Customer",
|
||||
description="Lookup a customer using contextual identifiers",
|
||||
category="http_api",
|
||||
definition={
|
||||
"schema_version": 1,
|
||||
"type": "http_api",
|
||||
"config": {
|
||||
"method": "POST",
|
||||
"url": "https://api.example.com/customers/lookup",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "customer_name",
|
||||
"type": "string",
|
||||
"description": "Customer name spoken by the caller",
|
||||
"required": True,
|
||||
}
|
||||
],
|
||||
"preset_parameters": [
|
||||
{
|
||||
"name": "phone_number",
|
||||
"type": "string",
|
||||
"value_template": "{{initial_context.phone_number}}",
|
||||
"required": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
schema = tool_to_function_schema(tool)
|
||||
props = schema["function"]["parameters"]["properties"]
|
||||
|
||||
assert "customer_name" in props
|
||||
assert "phone_number" not in props
|
||||
|
||||
def test_tool_name_sanitization(self):
|
||||
"""Test that tool names with special characters are sanitized."""
|
||||
tool = MockToolModel(
|
||||
|
|
@ -255,6 +294,108 @@ class TestExecuteHttpTool:
|
|||
assert result["status_code"] == 201
|
||||
assert result["data"]["id"] == 123
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_post_request_injects_preset_parameters(self):
|
||||
"""Test that preset parameters are resolved from runtime context."""
|
||||
tool = MockToolModel(
|
||||
tool_uuid="test-uuid-preset",
|
||||
name="Create Lead",
|
||||
description="Create a lead with caller context",
|
||||
category="http_api",
|
||||
definition={
|
||||
"schema_version": 1,
|
||||
"type": "http_api",
|
||||
"config": {
|
||||
"method": "POST",
|
||||
"url": "https://api.example.com/leads",
|
||||
"timeout_ms": 5000,
|
||||
"preset_parameters": [
|
||||
{
|
||||
"name": "phone_number",
|
||||
"type": "string",
|
||||
"value_template": "{{initial_context.phone_number}}",
|
||||
"required": True,
|
||||
},
|
||||
{
|
||||
"name": "customer_id",
|
||||
"type": "number",
|
||||
"value_template": "{{gathered_context.customer_id}}",
|
||||
"required": True,
|
||||
},
|
||||
{
|
||||
"name": "is_vip",
|
||||
"type": "boolean",
|
||||
"value_template": "{{initial_context.is_vip}}",
|
||||
"required": False,
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
arguments = {"name": "John"}
|
||||
|
||||
with patch(
|
||||
"api.services.workflow.tools.custom_tool.httpx.AsyncClient"
|
||||
) as mock_client_class:
|
||||
mock_client = AsyncMock()
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 201
|
||||
mock_response.json.return_value = {"id": 123}
|
||||
mock_client.request.return_value = mock_response
|
||||
mock_client_class.return_value.__aenter__.return_value = mock_client
|
||||
|
||||
result = await execute_http_tool(
|
||||
tool,
|
||||
arguments,
|
||||
call_context_vars={
|
||||
"phone_number": "+14155550123",
|
||||
"is_vip": "true",
|
||||
},
|
||||
gathered_context_vars={"customer_id": "42"},
|
||||
)
|
||||
|
||||
call_kwargs = mock_client.request.call_args.kwargs
|
||||
assert call_kwargs["json"] == {
|
||||
"name": "John",
|
||||
"phone_number": "+14155550123",
|
||||
"customer_id": 42,
|
||||
"is_vip": True,
|
||||
}
|
||||
assert result["status"] == "success"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_missing_required_preset_parameter_returns_error(self):
|
||||
"""Test that required preset parameters fail before the HTTP request."""
|
||||
tool = MockToolModel(
|
||||
tool_uuid="test-uuid-preset-error",
|
||||
name="Create Lead",
|
||||
description="Create a lead with caller context",
|
||||
category="http_api",
|
||||
definition={
|
||||
"schema_version": 1,
|
||||
"type": "http_api",
|
||||
"config": {
|
||||
"method": "POST",
|
||||
"url": "https://api.example.com/leads",
|
||||
"timeout_ms": 5000,
|
||||
"preset_parameters": [
|
||||
{
|
||||
"name": "phone_number",
|
||||
"type": "string",
|
||||
"value_template": "{{initial_context.phone_number}}",
|
||||
"required": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
result = await execute_http_tool(tool, {"name": "John"}, call_context_vars={})
|
||||
|
||||
assert result["status"] == "error"
|
||||
assert "phone_number" in result["error"]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_get_request_sends_query_params(self):
|
||||
"""Test that GET requests send arguments as query parameters."""
|
||||
|
|
|
|||
86
api/tests/test_gemini_live_reconnect_tool_results.py
Normal file
86
api/tests/test_gemini_live_reconnect_tool_results.py
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
import json
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import AsyncMock
|
||||
|
||||
import pytest
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
|
||||
from api.services.pipecat.realtime.gemini_live import DograhGeminiLiveLLMService
|
||||
|
||||
|
||||
class _TestDograhGeminiLiveLLMService(DograhGeminiLiveLLMService):
|
||||
"""Dograh Gemini service with client creation stubbed for unit tests."""
|
||||
|
||||
def create_client(self):
|
||||
self._client = SimpleNamespace(
|
||||
aio=SimpleNamespace(live=SimpleNamespace(connect=None))
|
||||
)
|
||||
|
||||
|
||||
class _FakeSession:
|
||||
def __init__(self):
|
||||
self.send_tool_response = AsyncMock()
|
||||
self.send_realtime_input = AsyncMock()
|
||||
self.close = AsyncMock()
|
||||
|
||||
|
||||
def _make_service() -> _TestDograhGeminiLiveLLMService:
|
||||
service = _TestDograhGeminiLiveLLMService(api_key="test-key")
|
||||
service.stop_all_metrics = AsyncMock()
|
||||
service.start_ttfb_metrics = AsyncMock()
|
||||
service.cancel_task = AsyncMock()
|
||||
service.push_error = AsyncMock()
|
||||
return service
|
||||
|
||||
|
||||
def _make_tool_result_context(tool_call_id: str) -> LLMContext:
|
||||
return LLMContext(
|
||||
messages=[
|
||||
{
|
||||
"role": "tool",
|
||||
"content": json.dumps({"status": "done"}),
|
||||
"tool_call_id": tool_call_id,
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_updated_context_during_reconnect_keeps_result_pending_until_session_ready():
|
||||
service = _make_service()
|
||||
service._handled_initial_context = True
|
||||
service._tool_call_id_to_name = {"call-transition": "transition_to_next_node"}
|
||||
service._session = _FakeSession()
|
||||
|
||||
context = _make_tool_result_context("call-transition")
|
||||
|
||||
await service._disconnect()
|
||||
await service._handle_context(context)
|
||||
|
||||
# A reconnect gap should not count as successful delivery to Gemini.
|
||||
assert "call-transition" not in service._completed_tool_calls
|
||||
|
||||
session = _FakeSession()
|
||||
await service._handle_session_ready(session)
|
||||
|
||||
session.send_tool_response.assert_awaited_once()
|
||||
sent_response = session.send_tool_response.await_args.kwargs["function_responses"]
|
||||
assert sent_response.id == "call-transition"
|
||||
assert sent_response.name == "transition_to_next_node"
|
||||
assert "call-transition" in service._completed_tool_calls
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_disconnect_does_not_forget_previously_delivered_tool_results():
|
||||
service = _make_service()
|
||||
service._context = _make_tool_result_context("call-transition")
|
||||
service._completed_tool_calls = {"call-transition"}
|
||||
service._tool_call_id_to_name = {"call-transition": "transition_to_next_node"}
|
||||
service._session = _FakeSession()
|
||||
service._tool_result = AsyncMock()
|
||||
|
||||
await service._disconnect()
|
||||
await service._process_completed_function_calls(send_new_results=True)
|
||||
|
||||
service._tool_result.assert_not_awaited()
|
||||
assert service._completed_tool_calls == {"call-transition"}
|
||||
98
api/tests/test_openai_realtime_initial_context.py
Normal file
98
api/tests/test_openai_realtime_initial_context.py
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
from types import SimpleNamespace
|
||||
from unittest.mock import AsyncMock
|
||||
|
||||
import pytest
|
||||
from pipecat.frames.frames import TTSSpeakFrame
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
|
||||
from api.services.pipecat.realtime.openai_realtime import (
|
||||
DograhOpenAIRealtimeLLMService,
|
||||
)
|
||||
|
||||
|
||||
def _make_service() -> DograhOpenAIRealtimeLLMService:
|
||||
service = DograhOpenAIRealtimeLLMService(api_key="test-key")
|
||||
service._create_response = AsyncMock()
|
||||
service._process_completed_function_calls = AsyncMock()
|
||||
return service
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_initial_context_triggers_response_when_context_was_prepopulated():
|
||||
service = _make_service()
|
||||
context = LLMContext()
|
||||
service._context = context
|
||||
|
||||
await service._handle_context(context)
|
||||
|
||||
assert service._handled_initial_context is True
|
||||
assert service._context is context
|
||||
service._create_response.assert_awaited_once()
|
||||
service._process_completed_function_calls.assert_not_awaited()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_updated_context_uses_tool_result_path_after_initial_context():
|
||||
service = _make_service()
|
||||
context = LLMContext()
|
||||
service._handled_initial_context = True
|
||||
|
||||
await service._handle_context(context)
|
||||
|
||||
assert service._context is context
|
||||
service._create_response.assert_not_awaited()
|
||||
service._process_completed_function_calls.assert_awaited_once_with(
|
||||
send_new_results=True
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tts_greeting_uses_initial_context_handler():
|
||||
service = _make_service()
|
||||
service._context = LLMContext()
|
||||
service._handle_context = AsyncMock()
|
||||
|
||||
await service.process_frame(
|
||||
TTSSpeakFrame("hello", append_to_context=True),
|
||||
FrameDirection.DOWNSTREAM,
|
||||
)
|
||||
|
||||
service._handle_context.assert_awaited_once_with(service._context)
|
||||
service._create_response.assert_not_awaited()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_function_call_executes_immediately_when_bot_is_not_speaking():
|
||||
service = _make_service()
|
||||
service._context = LLMContext()
|
||||
service.run_function_calls = AsyncMock()
|
||||
service._pending_function_calls["call-1"] = SimpleNamespace(name="customer_support")
|
||||
|
||||
await service._handle_evt_function_call_arguments_done(
|
||||
SimpleNamespace(call_id="call-1", arguments='{"department":"sales"}')
|
||||
)
|
||||
|
||||
service.run_function_calls.assert_awaited_once()
|
||||
assert service._deferred_function_calls == []
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_function_call_is_deferred_until_bot_stops_speaking():
|
||||
service = _make_service()
|
||||
service._context = LLMContext()
|
||||
service.run_function_calls = AsyncMock()
|
||||
service._bot_is_speaking = True
|
||||
service._pending_function_calls["call-1"] = SimpleNamespace(name="customer_support")
|
||||
|
||||
await service._handle_evt_function_call_arguments_done(
|
||||
SimpleNamespace(call_id="call-1", arguments='{"department":"sales"}')
|
||||
)
|
||||
|
||||
service.run_function_calls.assert_not_awaited()
|
||||
assert len(service._deferred_function_calls) == 1
|
||||
|
||||
await service._run_pending_function_calls()
|
||||
|
||||
service.run_function_calls.assert_awaited_once()
|
||||
assert service._deferred_function_calls == []
|
||||
59
api/tests/test_realtime_message_append.py
Normal file
59
api/tests/test_realtime_message_append.py
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
from types import SimpleNamespace
|
||||
from unittest.mock import AsyncMock
|
||||
|
||||
import pytest
|
||||
from pipecat.frames.frames import LLMMessagesAppendFrame
|
||||
from pipecat.services.openai.realtime import events
|
||||
|
||||
from api.services.pipecat.realtime.openai_realtime import (
|
||||
DograhOpenAIRealtimeLLMService,
|
||||
)
|
||||
from api.services.workflow.pipecat_engine_callbacks import UserIdleHandler
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_openai_realtime_messages_append_frame_sends_conversation_item():
|
||||
service = DograhOpenAIRealtimeLLMService(api_key="test")
|
||||
service._api_session_ready = True
|
||||
service.send_client_event = AsyncMock()
|
||||
service._send_manual_response_create = AsyncMock()
|
||||
|
||||
await service._handle_messages_append(
|
||||
LLMMessagesAppendFrame(
|
||||
[{"role": "user", "content": "Are you still there?"}],
|
||||
run_llm=True,
|
||||
)
|
||||
)
|
||||
|
||||
service.send_client_event.assert_awaited_once()
|
||||
event = service.send_client_event.await_args.args[0]
|
||||
assert isinstance(event, events.ConversationItemCreateEvent)
|
||||
assert event.item.role == "user"
|
||||
assert event.item.type == "message"
|
||||
assert event.item.content == [
|
||||
events.ItemContent(type="input_text", text="Are you still there?")
|
||||
]
|
||||
service._send_manual_response_create.assert_awaited_once()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_user_idle_handler_uses_realtime_append_path():
|
||||
engine = SimpleNamespace(
|
||||
llm=SimpleNamespace(),
|
||||
end_call_with_reason=AsyncMock(),
|
||||
)
|
||||
aggregator = SimpleNamespace(push_frame=AsyncMock())
|
||||
handler = UserIdleHandler(engine)
|
||||
|
||||
await handler.handle_idle(aggregator)
|
||||
|
||||
aggregator.push_frame.assert_awaited_once()
|
||||
frame = aggregator.push_frame.await_args.args[0]
|
||||
assert isinstance(frame, LLMMessagesAppendFrame)
|
||||
assert frame.run_llm is True
|
||||
assert frame.messages == [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "The user has been quiet. Politely and briefly ask if they're still there in the language that the user has been speaking so far.",
|
||||
}
|
||||
]
|
||||
61
api/tests/test_run_pipeline_realtime_turn_config.py
Normal file
61
api/tests/test_run_pipeline_realtime_turn_config.py
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.turns.user_start import (
|
||||
ExternalUserTurnStartStrategy,
|
||||
)
|
||||
from pipecat.turns.user_start.vad_user_turn_start_strategy import (
|
||||
VADUserTurnStartStrategy,
|
||||
)
|
||||
from pipecat.turns.user_stop import (
|
||||
ExternalUserTurnStopStrategy,
|
||||
SpeechTimeoutUserTurnStopStrategy,
|
||||
)
|
||||
|
||||
from api.services.configuration.registry import ServiceProviders
|
||||
from api.services.pipecat.run_pipeline import _create_realtime_user_turn_config
|
||||
|
||||
|
||||
def test_gemini_realtime_uses_local_vad_without_local_interruptions():
|
||||
strategies, vad_analyzer = _create_realtime_user_turn_config(
|
||||
ServiceProviders.GOOGLE_REALTIME.value
|
||||
)
|
||||
|
||||
assert isinstance(vad_analyzer, SileroVADAnalyzer)
|
||||
assert len(strategies.start) == 1
|
||||
assert isinstance(strategies.start[0], VADUserTurnStartStrategy)
|
||||
assert strategies.start[0]._enable_interruptions is False
|
||||
assert len(strategies.stop) == 1
|
||||
assert isinstance(strategies.stop[0], SpeechTimeoutUserTurnStopStrategy)
|
||||
|
||||
|
||||
def test_gemini_vertex_realtime_uses_same_turn_config_as_gemini_live():
|
||||
strategies, vad_analyzer = _create_realtime_user_turn_config(
|
||||
ServiceProviders.GOOGLE_VERTEX_REALTIME.value
|
||||
)
|
||||
|
||||
assert isinstance(vad_analyzer, SileroVADAnalyzer)
|
||||
assert len(strategies.start) == 1
|
||||
assert isinstance(strategies.start[0], VADUserTurnStartStrategy)
|
||||
assert strategies.start[0]._enable_interruptions is False
|
||||
|
||||
|
||||
def test_openai_realtime_uses_provider_turn_frames_without_local_vad():
|
||||
strategies, vad_analyzer = _create_realtime_user_turn_config(
|
||||
ServiceProviders.OPENAI_REALTIME.value
|
||||
)
|
||||
|
||||
assert vad_analyzer is None
|
||||
assert len(strategies.start) == 1
|
||||
assert isinstance(strategies.start[0], ExternalUserTurnStartStrategy)
|
||||
assert strategies.start[0]._enable_interruptions is False
|
||||
assert len(strategies.stop) == 1
|
||||
assert isinstance(strategies.stop[0], ExternalUserTurnStopStrategy)
|
||||
|
||||
|
||||
def test_unknown_realtime_providers_keep_local_vad():
|
||||
strategies, vad_analyzer = _create_realtime_user_turn_config("other_realtime")
|
||||
|
||||
assert isinstance(vad_analyzer, SileroVADAnalyzer)
|
||||
assert len(strategies.start) == 1
|
||||
assert isinstance(strategies.start[0], VADUserTurnStartStrategy)
|
||||
assert len(strategies.stop) == 1
|
||||
assert isinstance(strategies.stop[0], SpeechTimeoutUserTurnStopStrategy)
|
||||
|
|
@ -66,7 +66,7 @@ class TestUnregisteredFunctionCall:
|
|||
|
||||
# Pipecat's missing-function handler returns a string error.
|
||||
assert isinstance(result_frame.result, str)
|
||||
assert "not registered" in result_frame.result
|
||||
assert "not currently available" in result_frame.result
|
||||
assert "nonexistent_tool" in result_frame.result
|
||||
|
||||
# In-progress frame should also be emitted before the result so mute
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -120,4 +120,68 @@ To use Gemini 3.1 Live with Dograh, you need a Google Gemini API key. Follow the
|
|||
|
||||
<Note>
|
||||
When using a Realtime provider like Gemini Live, you do not need to configure separate TTS and STT services — the realtime model handles speech in and out. However, you **must** still configure an **LLM** under the LLM tab: it powers variable extraction and QA analysis, which the realtime service does not perform.
|
||||
</Note>
|
||||
|
||||
## Gemini Live on Vertex AI
|
||||
|
||||
If you want to run Gemini Live through your own Google Cloud project — for billing consolidation, VPC controls, regional residency, or enterprise IAM — Dograh also supports Gemini Live via **Vertex AI** as a separate provider (`google_vertex_realtime`). The default model is `google/gemini-live-2.5-flash-native-audio`.
|
||||
|
||||
Unlike Google AI Studio (which uses a single Gemini API key), Vertex AI authenticates with a **service account** belonging to your Google Cloud project.
|
||||
|
||||
### Prerequisites
|
||||
|
||||
1. A Google Cloud project with billing enabled.
|
||||
2. The Vertex AI API enabled on that project:
|
||||
|
||||
```bash
|
||||
gcloud services enable aiplatform.googleapis.com --project=YOUR_PROJECT_ID
|
||||
```
|
||||
|
||||
3. A service account with the **Vertex AI User** role (`roles/aiplatform.user`) on the project:
|
||||
|
||||
```bash
|
||||
gcloud projects add-iam-policy-binding YOUR_PROJECT_ID \
|
||||
--member="serviceAccount:YOUR_SA@YOUR_PROJECT_ID.iam.gserviceaccount.com" \
|
||||
--role="roles/aiplatform.user"
|
||||
```
|
||||
|
||||
4. A **JSON** key for that service account (P12 keys are not supported).
|
||||
|
||||
### Creating the service account key
|
||||
|
||||
1. In the GCP Console, go to **IAM & Admin → Service Accounts**.
|
||||
2. Pick an existing service account (or create a new one).
|
||||
3. Open the **Keys** tab → **Add Key → Create new key**.
|
||||
4. Choose **JSON** as the key type and click **Create**.
|
||||
5. The key file will download to your computer — store it securely and treat it as a secret.
|
||||
|
||||
<Note>
|
||||
Always pick **JSON**, not P12. The Vertex AI client libraries used by Dograh only accept service-account JSON keys; P12 is a legacy format retained for older Google Workspace integrations.
|
||||
</Note>
|
||||
|
||||
### Configuring Vertex AI Realtime in Dograh
|
||||
|
||||
1. Go to **Model Configurations** in your Dograh dashboard.
|
||||
2. Enable the **Realtime** toggle.
|
||||
3. Under the **Realtime** section, select `google_vertex_realtime` as the provider.
|
||||
4. Fill in the fields:
|
||||
|
||||
| Field | What to put in |
|
||||
|---|---|
|
||||
| **Model** | Vertex publisher/model id, e.g. `google/gemini-live-2.5-flash-native-audio` |
|
||||
| **Voice** | One of the built-in voices (Puck, Charon, Kore, Fenrir, Aoede) |
|
||||
| **Language** | BCP-47 code (e.g. `en-US`) |
|
||||
| **Project Id** | The `project_id` value from your service-account JSON |
|
||||
| **Location** | GCP region where the model is available (e.g. `us-east4`) |
|
||||
| **Credentials** | Paste the **entire contents** of the service-account JSON file |
|
||||
| **API Key** | Leave blank — Vertex AI does not use API keys |
|
||||
|
||||
5. Save the configuration.
|
||||
|
||||
<Note>
|
||||
Paste the whole JSON file into the **Credentials** field — including `private_key`, `client_email`, and all other entries. Don't try to extract individual fields. If `Credentials` is left blank, Dograh falls back to **Application Default Credentials (ADC)** from the host environment, which is useful when running Dograh on a GCP VM or GKE pod with an attached service account.
|
||||
</Note>
|
||||
|
||||
<Note>
|
||||
IAM changes can take up to ~60 seconds to propagate. If you see `Permission 'aiplatform.endpoints.predict' denied`, wait a minute and retry — or double-check that the role was granted to the same service account whose JSON you pasted.
|
||||
</Note>
|
||||
|
|
@ -193,7 +193,6 @@ If your provider POSTs webhooks to Dograh (answer URL, status callbacks, hangup
|
|||
```python
|
||||
# providers/your_provider/routes.py
|
||||
from fastapi import APIRouter, Request
|
||||
from api.services.telephony.factory import get_telephony_provider
|
||||
from api.services.telephony.status_processor import (
|
||||
StatusCallbackRequest,
|
||||
_process_status_update,
|
||||
|
|
@ -286,7 +285,7 @@ register(SPEC)
|
|||
| Field | Used by |
|
||||
| --- | --- |
|
||||
| `name` | Stored as the discriminator on every `TelephonyConfiguration` row and as the `WorkflowRunMode` value |
|
||||
| `provider_cls` | `factory.get_telephony_provider*` |
|
||||
| `provider_cls` | `factory.get_default_telephony_provider`, `get_telephony_provider_by_id`, `get_telephony_provider_for_run` |
|
||||
| `config_loader` | `factory._normalize_with_phone_numbers` (replaces the old if/elif chain) |
|
||||
| `transport_factory` | `run_pipeline_telephony` |
|
||||
| `audio_config` | `create_audio_config()` and `run_pipeline_telephony` |
|
||||
|
|
@ -375,7 +374,7 @@ For end-to-end testing, save your provider through the telephony-configurations
|
|||
|
||||
## Best Practices
|
||||
|
||||
1. **Trust the registry** — never import another provider's class directly; resolve through `factory.get_telephony_provider*`.
|
||||
1. **Trust the registry** — never import another provider's class directly; resolve through the factory helpers (`get_default_telephony_provider`, `get_telephony_provider_by_id`, etc.).
|
||||
2. **Sensitive fields** — mark every credential field `sensitive=True` in `ProviderUIMetadata`. The save endpoint masks these on read and preserves the original when the client re-submits a masked value.
|
||||
3. **Inbound signature verification** — always validate inbound webhook signatures in `verify_inbound_signature`. Returning `True` when no signature header is present is acceptable; return `False` when a signature *is* present but invalid.
|
||||
4. **Transports load credentials lazily** — call `load_credentials_for_transport` with the `telephony_configuration_id` from the workflow run. Don't read the org's default config from `transport.py`.
|
||||
|
|
|
|||
2
pipecat
2
pipecat
|
|
@ -1 +1 @@
|
|||
Subproject commit 31199bd1f155a872690ff1de24b0fa5ccff49396
|
||||
Subproject commit 8590e5333d63eb69b78a193f9eeb2ff0584f9e9a
|
||||
|
|
@ -4,4 +4,6 @@ set -euo pipefail
|
|||
ruff check api --select I --select F401 --fix
|
||||
ruff format api
|
||||
|
||||
ruff format pipecat
|
||||
|
||||
(cd ui && npm run fix-lint)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
# generated by datamodel-codegen:
|
||||
# filename: dograh-openapi-XXXXXX.json.ZdsSaa9ytb
|
||||
# timestamp: 2026-05-16T09:21:15+00:00
|
||||
# filename: dograh-openapi-XXXXXX.json.UrxDUj7bBM
|
||||
# timestamp: 2026-05-16T12:21:33+00:00
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
|
|
|||
|
|
@ -11,6 +11,8 @@ import {
|
|||
KeyValueEditor,
|
||||
type KeyValueItem,
|
||||
ParameterEditor,
|
||||
PresetParameterEditor,
|
||||
type PresetToolParameter,
|
||||
type ToolParameter,
|
||||
UrlInput,
|
||||
} from "@/components/http";
|
||||
|
|
@ -35,6 +37,8 @@ export interface HttpApiToolConfigProps {
|
|||
onHeadersChange: (headers: KeyValueItem[]) => void;
|
||||
parameters: ToolParameter[];
|
||||
onParametersChange: (parameters: ToolParameter[]) => void;
|
||||
presetParameters: PresetToolParameter[];
|
||||
onPresetParametersChange: (parameters: PresetToolParameter[]) => void;
|
||||
timeoutMs: number;
|
||||
onTimeoutMsChange: (timeout: number) => void;
|
||||
customMessage: string;
|
||||
|
|
@ -61,6 +65,8 @@ export function HttpApiToolConfig({
|
|||
onHeadersChange,
|
||||
parameters,
|
||||
onParametersChange,
|
||||
presetParameters,
|
||||
onPresetParametersChange,
|
||||
timeoutMs,
|
||||
onTimeoutMsChange,
|
||||
customMessage,
|
||||
|
|
@ -182,7 +188,7 @@ export function HttpApiToolConfig({
|
|||
|
||||
<TabsContent value="parameters" className="space-y-4 mt-4">
|
||||
<div className="grid gap-2">
|
||||
<Label>Tool Parameters</Label>
|
||||
<Label>LLM Parameters</Label>
|
||||
<Label className="text-xs text-muted-foreground">
|
||||
Define the parameters that the LLM will provide when calling this tool.
|
||||
These will be sent as JSON body for POST/PUT/PATCH or as URL query params for GET/DELETE.
|
||||
|
|
@ -193,6 +199,18 @@ export function HttpApiToolConfig({
|
|||
/>
|
||||
</div>
|
||||
|
||||
<div className="grid gap-2 pt-4 border-t">
|
||||
<Label>Preset Parameters</Label>
|
||||
<Label className="text-xs text-muted-foreground">
|
||||
Add values that Dograh should inject at runtime. These are not exposed to the LLM and can use
|
||||
workflow templates like {`{{initial_context.phone_number}}`} or fixed literals.
|
||||
</Label>
|
||||
<PresetParameterEditor
|
||||
parameters={presetParameters}
|
||||
onChange={onPresetParametersChange}
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div className="grid gap-2 pt-4 border-t">
|
||||
<Label>Custom Headers</Label>
|
||||
<Label className="text-xs text-muted-foreground">
|
||||
|
|
|
|||
|
|
@ -11,7 +11,13 @@ import {
|
|||
} from "@/client/sdk.gen";
|
||||
import type { RecordingResponseSchema, ToolResponse, TransferCallConfig as APITransferCallConfig } from "@/client/types.gen";
|
||||
import type { EndCallConfig } from "@/client/types.gen";
|
||||
import { type HttpMethod, type KeyValueItem, type ToolParameter, validateUrl } from "@/components/http";
|
||||
import {
|
||||
type HttpMethod,
|
||||
type KeyValueItem,
|
||||
type PresetToolParameter,
|
||||
type ToolParameter,
|
||||
validateUrl,
|
||||
} from "@/components/http";
|
||||
import { Button } from "@/components/ui/button";
|
||||
import {
|
||||
Dialog,
|
||||
|
|
@ -41,6 +47,12 @@ interface HttpApiConfigWithParams {
|
|||
headers?: Record<string, string>;
|
||||
credential_uuid?: string;
|
||||
parameters?: ToolParameter[];
|
||||
preset_parameters?: Array<{
|
||||
name?: string;
|
||||
type?: PresetToolParameter["type"];
|
||||
value_template?: string;
|
||||
required?: boolean;
|
||||
}>;
|
||||
timeout_ms?: number;
|
||||
customMessage?: string;
|
||||
}
|
||||
|
|
@ -70,6 +82,7 @@ export default function ToolDetailPage() {
|
|||
const [credentialUuid, setCredentialUuid] = useState("");
|
||||
const [headers, setHeaders] = useState<KeyValueItem[]>([]);
|
||||
const [parameters, setParameters] = useState<ToolParameter[]>([]);
|
||||
const [presetParameters, setPresetParameters] = useState<PresetToolParameter[]>([]);
|
||||
const [timeoutMs, setTimeoutMs] = useState(5000);
|
||||
|
||||
// End Call form state
|
||||
|
|
@ -209,6 +222,19 @@ export default function ToolDetailPage() {
|
|||
} else {
|
||||
setParameters([]);
|
||||
}
|
||||
|
||||
if (config.preset_parameters && Array.isArray(config.preset_parameters)) {
|
||||
setPresetParameters(
|
||||
config.preset_parameters.map((p) => ({
|
||||
name: p.name || "",
|
||||
type: p.type || "string",
|
||||
valueTemplate: p.value_template || "",
|
||||
required: p.required ?? true,
|
||||
}))
|
||||
);
|
||||
} else {
|
||||
setPresetParameters([]);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
@ -263,6 +289,14 @@ export default function ToolDetailPage() {
|
|||
setError("All parameters must have a name");
|
||||
return;
|
||||
}
|
||||
|
||||
const invalidPresetParams = presetParameters.filter(
|
||||
(p) => !p.name.trim() || !p.valueTemplate.trim()
|
||||
);
|
||||
if (invalidPresetParams.length > 0) {
|
||||
setError("All preset parameters must have a name and a value");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
|
|
@ -325,6 +359,9 @@ export default function ToolDetailPage() {
|
|||
});
|
||||
|
||||
const validParameters = parameters.filter((p) => p.name.trim());
|
||||
const validPresetParameters = presetParameters.filter(
|
||||
(p) => p.name.trim() && p.valueTemplate.trim()
|
||||
);
|
||||
|
||||
requestBody = {
|
||||
name,
|
||||
|
|
@ -342,6 +379,15 @@ export default function ToolDetailPage() {
|
|||
: undefined,
|
||||
parameters:
|
||||
validParameters.length > 0 ? validParameters : undefined,
|
||||
preset_parameters:
|
||||
validPresetParameters.length > 0
|
||||
? validPresetParameters.map((p) => ({
|
||||
name: p.name,
|
||||
type: p.type,
|
||||
value_template: p.valueTemplate,
|
||||
required: p.required,
|
||||
}))
|
||||
: undefined,
|
||||
timeout_ms: timeoutMs,
|
||||
customMessage: customMessageType === 'text' ? (customMessage || undefined) : undefined,
|
||||
customMessageType,
|
||||
|
|
@ -394,8 +440,20 @@ export default function ToolDetailPage() {
|
|||
exampleBody[p.name] = `<${p.name}>`;
|
||||
}
|
||||
});
|
||||
presetParameters.forEach((p) => {
|
||||
if (p.type === "number") {
|
||||
exampleBody[p.name] = p.valueTemplate || 0;
|
||||
} else if (p.type === "boolean") {
|
||||
exampleBody[p.name] = p.valueTemplate || true;
|
||||
} else {
|
||||
exampleBody[p.name] = p.valueTemplate || `<${p.name}>`;
|
||||
}
|
||||
});
|
||||
|
||||
const hasBody = httpMethod !== "GET" && httpMethod !== "DELETE" && parameters.length > 0;
|
||||
const hasBody =
|
||||
httpMethod !== "GET" &&
|
||||
httpMethod !== "DELETE" &&
|
||||
(parameters.length > 0 || presetParameters.length > 0);
|
||||
|
||||
return `// ${tool.name}
|
||||
// ${tool.description || "HTTP API Tool"}
|
||||
|
|
@ -571,6 +629,8 @@ const data = await response.json();`;
|
|||
onHeadersChange={setHeaders}
|
||||
parameters={parameters}
|
||||
onParametersChange={setParameters}
|
||||
presetParameters={presetParameters}
|
||||
onPresetParametersChange={setPresetParameters}
|
||||
timeoutMs={timeoutMs}
|
||||
onTimeoutMsChange={setTimeoutMs}
|
||||
customMessage={customMessage}
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -1808,6 +1808,12 @@ export type HttpApiConfig = {
|
|||
* Parameters that the tool accepts from LLM
|
||||
*/
|
||||
parameters?: Array<ToolParameter> | null;
|
||||
/**
|
||||
* Preset Parameters
|
||||
*
|
||||
* Parameters injected by Dograh from fixed values or workflow context templates
|
||||
*/
|
||||
preset_parameters?: Array<PresetToolParameter> | null;
|
||||
/**
|
||||
* Timeout Ms
|
||||
*
|
||||
|
|
@ -2396,6 +2402,38 @@ export type PlivoConfigurationResponse = {
|
|||
from_numbers: Array<string>;
|
||||
};
|
||||
|
||||
/**
|
||||
* PresetToolParameter
|
||||
*
|
||||
* A parameter injected by Dograh at runtime.
|
||||
*/
|
||||
export type PresetToolParameter = {
|
||||
/**
|
||||
* Name
|
||||
*
|
||||
* Parameter name (used as key in request body)
|
||||
*/
|
||||
name: string;
|
||||
/**
|
||||
* Type
|
||||
*
|
||||
* Parameter type: string, number, or boolean
|
||||
*/
|
||||
type: string;
|
||||
/**
|
||||
* Value Template
|
||||
*
|
||||
* Fixed value or template, e.g. {{initial_context.phone_number}}
|
||||
*/
|
||||
value_template: string;
|
||||
/**
|
||||
* Required
|
||||
*
|
||||
* Whether the parameter must resolve to a non-empty value
|
||||
*/
|
||||
required?: boolean;
|
||||
};
|
||||
|
||||
/**
|
||||
* PresignedUploadUrlRequest
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ import { Label } from "@/components/ui/label";
|
|||
import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select";
|
||||
import { Switch } from "@/components/ui/switch";
|
||||
import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
|
||||
import { Textarea } from "@/components/ui/textarea";
|
||||
import { VoiceSelector } from "@/components/VoiceSelector";
|
||||
import { LANGUAGE_DISPLAY_NAMES } from "@/constants/languages";
|
||||
import { useUserConfig } from "@/context/UserConfigContext";
|
||||
|
|
@ -30,6 +31,7 @@ interface SchemaProperty {
|
|||
$ref?: string;
|
||||
description?: string;
|
||||
format?: string;
|
||||
multiline?: boolean;
|
||||
}
|
||||
|
||||
interface ProviderSchema {
|
||||
|
|
@ -501,18 +503,26 @@ export function ServiceConfigurationForm({
|
|||
|
||||
{currentProvider && providerSchema && configFields.length > 1 && (
|
||||
<div className="grid grid-cols-2 gap-4">
|
||||
{configFields.slice(1).map((field) => (
|
||||
<div key={field} className="space-y-2">
|
||||
<Label className="capitalize">{field.replace(/_/g, ' ')}</Label>
|
||||
{renderField(service, field, providerSchema)}
|
||||
</div>
|
||||
))}
|
||||
{configFields.slice(1).map((field) => {
|
||||
const fieldSchema = providerSchema.properties[field];
|
||||
const actualFieldSchema = fieldSchema?.$ref && providerSchema.$defs
|
||||
? providerSchema.$defs[fieldSchema.$ref.split('/').pop() || '']
|
||||
: fieldSchema;
|
||||
const fullWidth = actualFieldSchema?.multiline;
|
||||
return (
|
||||
<div key={field} className={`space-y-2 ${fullWidth ? "col-span-2" : ""}`}>
|
||||
<Label className="capitalize">{field.replace(/_/g, ' ')}</Label>
|
||||
{renderField(service, field, providerSchema)}
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{currentProvider && providerSchema && providerSchema.properties.api_key && (
|
||||
<div className="space-y-2">
|
||||
<Label>{mode === 'override' ? 'API Key (leave empty to use global)' : 'API Key(s)'}</Label>
|
||||
{renderFieldDescription("api_key", providerSchema)}
|
||||
{apiKeys[service].map((key, index) => (
|
||||
<div key={index} className="flex gap-2">
|
||||
<Input
|
||||
|
|
@ -564,7 +574,28 @@ export function ServiceConfigurationForm({
|
|||
);
|
||||
};
|
||||
|
||||
const renderFieldDescription = (field: string, providerSchema: ProviderSchema) => {
|
||||
const schema = providerSchema.properties[field];
|
||||
if (!schema) return null;
|
||||
const actualSchema = schema.$ref && providerSchema.$defs
|
||||
? providerSchema.$defs[schema.$ref.split('/').pop() || '']
|
||||
: schema;
|
||||
if (!actualSchema?.description) return null;
|
||||
return (
|
||||
<p className="text-xs text-muted-foreground">{actualSchema.description}</p>
|
||||
);
|
||||
};
|
||||
|
||||
const renderField = (service: ServiceSegment, field: string, providerSchema: ProviderSchema) => {
|
||||
return (
|
||||
<>
|
||||
{renderFieldInput(service, field, providerSchema)}
|
||||
{renderFieldDescription(field, providerSchema)}
|
||||
</>
|
||||
);
|
||||
};
|
||||
|
||||
const renderFieldInput = (service: ServiceSegment, field: string, providerSchema: ProviderSchema) => {
|
||||
const schema = providerSchema.properties[field];
|
||||
const actualSchema = schema.$ref && providerSchema.$defs
|
||||
? providerSchema.$defs[schema.$ref.split('/').pop() || '']
|
||||
|
|
@ -699,6 +730,19 @@ export function ServiceConfigurationForm({
|
|||
);
|
||||
}
|
||||
|
||||
if (actualSchema?.multiline) {
|
||||
return (
|
||||
<Textarea
|
||||
rows={6}
|
||||
className="font-mono text-xs"
|
||||
placeholder={`Enter ${field}`}
|
||||
{...register(`${service}_${field}`, {
|
||||
required: service !== "embeddings" && providerSchema.required?.includes(field),
|
||||
})}
|
||||
/>
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<Input
|
||||
type={actualSchema?.type === "number" ? "number" : "text"}
|
||||
|
|
|
|||
|
|
@ -2,5 +2,11 @@ export { CreateCredentialDialog } from "./create-credential-dialog";
|
|||
export { CredentialSelector } from "./credential-selector";
|
||||
export { type HttpMethod, HttpMethodSelector } from "./http-method-selector";
|
||||
export { KeyValueEditor, type KeyValueItem } from "./key-value-editor";
|
||||
export { ParameterEditor, type ParameterType,type ToolParameter } from "./parameter-editor";
|
||||
export { UrlInput, type UrlValidationResult,validateUrl } from "./url-input";
|
||||
export {
|
||||
ParameterEditor,
|
||||
type ParameterType,
|
||||
PresetParameterEditor,
|
||||
type PresetToolParameter,
|
||||
type ToolParameter,
|
||||
} from "./parameter-editor";
|
||||
export { UrlInput, type UrlValidationResult, validateUrl } from "./url-input";
|
||||
|
|
|
|||
|
|
@ -23,6 +23,13 @@ export interface ToolParameter {
|
|||
required: boolean;
|
||||
}
|
||||
|
||||
export interface PresetToolParameter {
|
||||
name: string;
|
||||
type: ParameterType;
|
||||
valueTemplate: string;
|
||||
required: boolean;
|
||||
}
|
||||
|
||||
interface ParameterEditorProps {
|
||||
parameters: ToolParameter[];
|
||||
onChange: (parameters: ToolParameter[]) => void;
|
||||
|
|
@ -165,3 +172,146 @@ export function ParameterEditor({
|
|||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
interface PresetParameterEditorProps {
|
||||
parameters: PresetToolParameter[];
|
||||
onChange: (parameters: PresetToolParameter[]) => void;
|
||||
disabled?: boolean;
|
||||
}
|
||||
|
||||
export function PresetParameterEditor({
|
||||
parameters,
|
||||
onChange,
|
||||
disabled = false,
|
||||
}: PresetParameterEditorProps) {
|
||||
const addParameter = () => {
|
||||
onChange([
|
||||
...parameters,
|
||||
{ name: "", type: "string", valueTemplate: "", required: true },
|
||||
]);
|
||||
};
|
||||
|
||||
const updateParameter = (
|
||||
index: number,
|
||||
field: keyof PresetToolParameter,
|
||||
value: string | boolean
|
||||
) => {
|
||||
const newParams = [...parameters];
|
||||
newParams[index] = { ...newParams[index], [field]: value };
|
||||
onChange(newParams);
|
||||
};
|
||||
|
||||
const removeParameter = (index: number) => {
|
||||
onChange(parameters.filter((_, i) => i !== index));
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="space-y-4">
|
||||
{parameters.length === 0 && (
|
||||
<div className="text-sm text-muted-foreground py-4 text-center border border-dashed rounded-md">
|
||||
No preset parameters defined. Add one to inject a fixed value or workflow context into the request.
|
||||
</div>
|
||||
)}
|
||||
|
||||
{parameters.map((param, index) => (
|
||||
<div
|
||||
key={index}
|
||||
className="border rounded-lg p-4 space-y-3 bg-muted/20"
|
||||
>
|
||||
<div className="flex items-center justify-between">
|
||||
<span className="text-sm font-medium text-muted-foreground">
|
||||
Preset Parameter {index + 1}
|
||||
</span>
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
onClick={() => removeParameter(index)}
|
||||
disabled={disabled}
|
||||
className="h-8 w-8"
|
||||
>
|
||||
<Trash2Icon className="h-4 w-4 text-muted-foreground hover:text-destructive" />
|
||||
</Button>
|
||||
</div>
|
||||
|
||||
<div className="grid grid-cols-2 gap-3">
|
||||
<div className="space-y-1.5">
|
||||
<Label className="text-xs">Name</Label>
|
||||
<Label className="text-xs text-muted-foreground">
|
||||
Key sent to the API, like "phone_number" or "customer_id"
|
||||
</Label>
|
||||
<Input
|
||||
placeholder="e.g., phone_number"
|
||||
value={param.name}
|
||||
onChange={(e) =>
|
||||
updateParameter(index, "name", e.target.value)
|
||||
}
|
||||
disabled={disabled}
|
||||
/>
|
||||
</div>
|
||||
<div className="space-y-1.5">
|
||||
<Label className="text-xs">Type</Label>
|
||||
<Label className="text-xs text-muted-foreground">
|
||||
JSON type to send to the API
|
||||
</Label>
|
||||
<Select
|
||||
value={param.type}
|
||||
onValueChange={(value: ParameterType) =>
|
||||
updateParameter(index, "type", value)
|
||||
}
|
||||
disabled={disabled}
|
||||
>
|
||||
<SelectTrigger>
|
||||
<SelectValue placeholder="Select type" />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
<SelectItem value="string">String</SelectItem>
|
||||
<SelectItem value="number">Number</SelectItem>
|
||||
<SelectItem value="boolean">Boolean</SelectItem>
|
||||
</SelectContent>
|
||||
</Select>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="space-y-1.5">
|
||||
<Label className="text-xs">Value or Template</Label>
|
||||
<Label className="text-xs text-muted-foreground">
|
||||
Use a fixed value or a template like {`{{initial_context.phone_number}}`} or {`{{gathered_context.customer_id}}`}
|
||||
</Label>
|
||||
<Input
|
||||
placeholder="e.g., {{initial_context.phone_number}}"
|
||||
value={param.valueTemplate}
|
||||
onChange={(e) =>
|
||||
updateParameter(index, "valueTemplate", e.target.value)
|
||||
}
|
||||
disabled={disabled}
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div className="flex items-center gap-2">
|
||||
<Switch
|
||||
id={`preset-required-${index}`}
|
||||
checked={param.required}
|
||||
onCheckedChange={(checked) =>
|
||||
updateParameter(index, "required", checked)
|
||||
}
|
||||
disabled={disabled}
|
||||
/>
|
||||
<Label htmlFor={`preset-required-${index}`} className="text-sm">
|
||||
Required
|
||||
</Label>
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
|
||||
<Button
|
||||
variant="outline"
|
||||
size="sm"
|
||||
onClick={addParameter}
|
||||
className="w-fit"
|
||||
disabled={disabled}
|
||||
>
|
||||
<PlusIcon className="h-4 w-4 mr-1" /> Add Preset Parameter
|
||||
</Button>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue