feat: add Assembly AI STT

This commit is contained in:
Abhishek Kumar 2026-04-03 07:10:37 +05:30
parent 66b085dde2
commit 501d06c00d
7 changed files with 94 additions and 8 deletions

View file

@ -49,6 +49,7 @@ class UserConfigurationValidator:
ServiceProviders.SPEACHES.value: self._check_speaches_api_key,
ServiceProviders.OPENAI_REALTIME.value: self._check_openai_api_key,
ServiceProviders.GOOGLE_REALTIME.value: self._check_google_api_key,
ServiceProviders.ASSEMBLYAI.value: self._check_assemblyai_api_key,
}
async def validate(
@ -217,3 +218,6 @@ class UserConfigurationValidator:
if not service_config.aws_access_key or not service_config.aws_secret_key:
raise ValueError("AWS access key and secret key are required for Bedrock")
return True
def _check_assemblyai_api_key(self, model: str, service_config) -> bool:
return True

View file

@ -29,6 +29,7 @@ class ServiceProviders(str, Enum):
CAMB = "camb"
AWS_BEDROCK = "aws_bedrock"
SPEACHES = "speaches"
ASSEMBLYAI = "assemblyai"
OPENAI_REALTIME = "openai_realtime"
GOOGLE_REALTIME = "google_realtime"
@ -45,6 +46,7 @@ class BaseServiceConfiguration(BaseModel):
ServiceProviders.DOGRAH,
ServiceProviders.AWS_BEDROCK,
ServiceProviders.SPEACHES,
ServiceProviders.ASSEMBLYAI,
ServiceProviders.OPENAI_REALTIME,
ServiceProviders.GOOGLE_REALTIME,
# ServiceProviders.SARVAM,
@ -318,7 +320,33 @@ OPENAI_REALTIME_VOICES = [
# Option lists for the Google realtime provider: model identifiers, voice
# names and language codes (presumably surfaced as selectable choices by the
# realtime configuration that follows; confirm against its usage).
GOOGLE_REALTIME_MODELS = ["gemini-3.1-flash-live-preview"]
GOOGLE_REALTIME_VOICES = ["Puck", "Charon", "Kore", "Fenrir", "Aoede"]
GOOGLE_REALTIME_LANGUAGES = ["en"]
# NOTE(review): the assignment below rebinds GOOGLE_REALTIME_LANGUAGES, so the
# English-only list on the previous line never takes effect; only this
# expanded, alphabetically ordered list of language codes is used.
GOOGLE_REALTIME_LANGUAGES = [
"ar",
"bn",
"de",
"en",
"es",
"fr",
"gu",
"hi",
"id",
"it",
"ja",
"kn",
"ko",
"ml",
"mr",
"nl",
"pl",
"pt",
"ru",
"ta",
"te",
"th",
"tr",
"vi",
"zh",
]
@register_service(ServiceType.REALTIME)
@ -830,6 +858,23 @@ class SpeachesSTTConfiguration(BaseSTTConfiguration):
api_key: str | list[str] | None = Field(default=None)
# Language codes and model identifiers advertised as JSON-schema examples
# for the AssemblyAI speech-to-text configuration below.
ASSEMBLYAI_STT_LANGUAGES = ["en", "es", "de", "fr", "pt", "it"]
ASSEMBLYAI_STT_MODELS = ["u3-rt-pro"]


@register_stt
class AssemblyAISTTConfiguration(BaseSTTConfiguration):
    """Speech-to-text settings for the AssemblyAI provider.

    The ``provider`` field is pinned to ``ServiceProviders.ASSEMBLYAI`` so
    that a union discriminated on ``provider`` can select this model.
    ``model`` and ``language`` are plain strings: the module-level lists are
    attached only as schema examples, not enforced as allowed values.
    """

    # Fixed discriminator value identifying this configuration.
    provider: Literal[ServiceProviders.ASSEMBLYAI] = ServiceProviders.ASSEMBLYAI

    # AssemblyAI model name; defaults to the single listed example model.
    model: str = Field(
        json_schema_extra={"examples": ASSEMBLYAI_STT_MODELS},
        default="u3-rt-pro",
    )

    # Transcription language code; defaults to English.
    language: str = Field(
        json_schema_extra={"examples": ASSEMBLYAI_STT_LANGUAGES},
        default="en",
    )
STTConfig = Annotated[
Union[
DeepgramSTTConfiguration,
@ -839,6 +884,7 @@ STTConfig = Annotated[
SpeechmaticsSTTConfiguration,
SarvamSTTConfiguration,
SpeachesSTTConfiguration,
AssemblyAISTTConfiguration,
],
Field(discriminator="provider"),
]

View file

@ -5,6 +5,7 @@ from loguru import logger
from api.constants import MPS_API_URL
from api.services.configuration.registry import ServiceProviders
from pipecat.services.assemblyai.stt import AssemblyAISTTService, AssemblyAISTTSettings
from pipecat.services.aws.llm import AWSBedrockLLMService, AWSBedrockLLMSettings
from pipecat.services.azure.llm import AzureLLMService, AzureLLMSettings
from pipecat.services.cartesia.stt import CartesiaSTTService
@ -156,6 +157,17 @@ def create_stt_service(
),
sample_rate=audio_config.transport_in_sample_rate,
)
elif user_config.stt.provider == ServiceProviders.ASSEMBLYAI.value:
language = getattr(user_config.stt, "language", None)
pipecat_language = _to_language_enum(language, default=Language.EN)
settings_kwargs = {"model": user_config.stt.model, "language": pipecat_language}
if keyterms:
settings_kwargs["keyterms_prompt"] = keyterms
return AssemblyAISTTService(
api_key=user_config.stt.api_key,
settings=AssemblyAISTTSettings(**settings_kwargs),
sample_rate=audio_config.transport_in_sample_rate,
)
elif user_config.stt.provider == ServiceProviders.SPEECHMATICS.value:
from pipecat.services.speechmatics.stt import (
AdditionalVocabEntry,

View file

@ -26,7 +26,8 @@ def extract_template_variables(text: str) -> Set[str]:
if "." in var_name:
continue
# Skip variables with a fallback (they have a default value)
if filter_name == "fallback":
# Supports both {{var | default}} and legacy {{var | fallback:default}}
if filter_name is not None:
continue
# Skip system-injected variables
if var_name in _SYSTEM_VARIABLES:

View file

@ -187,12 +187,19 @@ def _render_string(template_str: str, context: Dict[str, Any]) -> str:
# Get value using nested path lookup
value = get_nested_value(context, variable_path)
# Apply filters
if filter_name == "fallback":
# Apply fallback: new syntax {{var | default}} or legacy {{var | fallback:default}}
if filter_name is not None:
if value is None or value == "":
value = (
filter_value if filter_value is not None else variable_path.title()
)
if filter_name == "fallback":
# Legacy syntax: {{var | fallback:default}}
value = (
filter_value
if filter_value is not None
else variable_path.title()
)
else:
# New syntax: {{var | default}}
value = filter_name
# Convert to string for substitution
if value is None:

View file

@ -45,6 +45,22 @@ whether they'd like to continue.
When the call starts, Dograh substitutes the values before sending the prompt to the LLM — so the agent speaks naturally as if it already knows the contact.
### Fallback values
If a variable might be missing or empty, use a pipe (`|`) to provide a default value:
```
Hello {{customer_name | there}}, we're calling about your {{plan | current}} plan.
```
When `customer_name` is missing or empty, the agent says "Hello there" instead of leaving a blank. The general syntax is:
```
{{variable_name | fallback_value}}
```
If the variable is present and non-empty, the fallback is ignored and the actual value is used. The legacy form `{{variable_name | fallback:fallback_value}}` is still supported.
### Default variables
Dograh provides built-in variables for the current time and weekday; they are available in any prompt without setting up `initial_context`.

View file

@ -234,7 +234,7 @@
}
},
"banner": {
"content": "🎉 **New: Pre-recorded Audio** — lower latency, reduced TTS costs, and natural-sounding conversations using your own voice recordings. [Learn more →](/voice-agent/pre-recorded-audio)",
"content": "🎉 **New: Gemini Live 3.1 Support** — Try the latest Google Gemini Live 3.1 on Dograh platform. [Learn more →](/configurations/inference-providers#gemini-3-1-live)",
"dismissible": true
},
"search": {