feat: add speaches models

This commit is contained in:
Abhishek Kumar 2026-03-26 22:04:45 +05:30
parent 123114fb94
commit ec46878848
6 changed files with 185 additions and 91 deletions

View file

@ -40,7 +40,7 @@ class UserConfigurationValidator:
ServiceProviders.SPEECHMATICS.value: self._check_speechmatics_api_key,
ServiceProviders.CAMB.value: self._check_camb_api_key,
ServiceProviders.AWS_BEDROCK.value: self._check_aws_bedrock_api_key,
ServiceProviders.SELF_HOSTED.value: self._check_self_hosted_api_key,
ServiceProviders.SPEACHES.value: self._check_speaches_api_key,
}
async def validate(self, configuration: UserConfiguration) -> APIKeyStatusResponse:
@ -75,10 +75,10 @@ class UserConfigurationValidator:
provider = service_config.provider
# Self-hosted doesn't require an API key
if provider == ServiceProviders.SELF_HOSTED.value:
# Speaches doesn't require an API key
if provider == ServiceProviders.SPEACHES.value:
try:
if not self._check_self_hosted_api_key(provider, service_config):
if not self._check_speaches_api_key(provider, service_config):
return [
{
"model": service_name,
@ -179,9 +179,9 @@ class UserConfigurationValidator:
# No verification is performed here: both arguments are ignored and the key
# is always reported as valid.
def _check_camb_api_key(self, model: str, api_key: str) -> bool:
return True
def _check_self_hosted_api_key(self, model: str, service_config) -> bool:
def _check_speaches_api_key(self, model: str, service_config) -> bool:
if not getattr(service_config, "base_url", None):
raise ValueError("base_url is required for self-hosted LLM")
raise ValueError("base_url is required for Speaches services")
return True
def _check_aws_bedrock_api_key(self, model: str, service_config) -> bool:

View file

@ -27,7 +27,7 @@ class ServiceProviders(str, Enum):
SPEECHMATICS = "speechmatics"
CAMB = "camb"
AWS_BEDROCK = "aws_bedrock"
SELF_HOSTED = "self_hosted"
SPEACHES = "speaches"
class BaseServiceConfiguration(BaseModel):
@ -41,7 +41,7 @@ class BaseServiceConfiguration(BaseModel):
ServiceProviders.AZURE,
ServiceProviders.DOGRAH,
ServiceProviders.AWS_BEDROCK,
ServiceProviders.SELF_HOSTED,
ServiceProviders.SPEACHES,
# ServiceProviders.SARVAM,
]
api_key: str | list[str]
@ -191,14 +191,18 @@ AWS_BEDROCK_MODELS = [
@register_llm
class OpenAILLMService(BaseLLMConfiguration):
provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
model: str = Field(default="gpt-4.1", json_schema_extra={"examples": OPENAI_MODELS})
model: str = Field(
default="gpt-4.1",
json_schema_extra={"examples": OPENAI_MODELS, "allow_custom_input": True},
)
@register_llm
class GoogleLLMService(BaseLLMConfiguration):
provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE
model: str = Field(
default="gemini-2.0-flash", json_schema_extra={"examples": GOOGLE_MODELS}
default="gemini-2.0-flash",
json_schema_extra={"examples": GOOGLE_MODELS, "allow_custom_input": True},
)
@ -206,7 +210,8 @@ class GoogleLLMService(BaseLLMConfiguration):
class GroqLLMService(BaseLLMConfiguration):
provider: Literal[ServiceProviders.GROQ] = ServiceProviders.GROQ
model: str = Field(
default="llama-3.3-70b-versatile", json_schema_extra={"examples": GROQ_MODELS}
default="llama-3.3-70b-versatile",
json_schema_extra={"examples": GROQ_MODELS, "allow_custom_input": True},
)
@ -214,7 +219,8 @@ class GroqLLMService(BaseLLMConfiguration):
class OpenRouterLLMConfiguration(BaseLLMConfiguration):
provider: Literal[ServiceProviders.OPENROUTER] = ServiceProviders.OPENROUTER
model: str = Field(
default="openai/gpt-4.1", json_schema_extra={"examples": OPENROUTER_MODELS}
default="openai/gpt-4.1",
json_schema_extra={"examples": OPENROUTER_MODELS, "allow_custom_input": True},
)
base_url: str = Field(default="https://openrouter.ai/api/v1")
@ -224,7 +230,8 @@ class OpenRouterLLMConfiguration(BaseLLMConfiguration):
class AzureLLMService(BaseLLMConfiguration):
provider: Literal[ServiceProviders.AZURE] = ServiceProviders.AZURE
model: str = Field(
default="gpt-4.1-mini", json_schema_extra={"examples": AZURE_MODELS}
default="gpt-4.1-mini",
json_schema_extra={"examples": AZURE_MODELS, "allow_custom_input": True},
)
endpoint: str
@ -234,7 +241,8 @@ class AzureLLMService(BaseLLMConfiguration):
class DograhLLMService(BaseLLMConfiguration):
provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
model: str = Field(
default="default", json_schema_extra={"examples": DOGRAH_LLM_MODELS}
default="default",
json_schema_extra={"examples": DOGRAH_LLM_MODELS, "allow_custom_input": True},
)
@ -243,7 +251,7 @@ class AWSBedrockLLMConfiguration(BaseLLMConfiguration):
provider: Literal[ServiceProviders.AWS_BEDROCK] = ServiceProviders.AWS_BEDROCK
model: str = Field(
default="us.amazon.nova-pro-v1:0",
json_schema_extra={"examples": AWS_BEDROCK_MODELS},
json_schema_extra={"examples": AWS_BEDROCK_MODELS, "allow_custom_input": True},
)
aws_access_key: str = Field(default="")
aws_secret_key: str = Field(default="")
@ -251,14 +259,18 @@ class AWSBedrockLLMConfiguration(BaseLLMConfiguration):
api_key: str | list[str] | None = Field(default=None)
SELF_HOSTED_LLM_MODELS = ["llama3", "mistral", "phi3", "qwen2", "gemma2", "deepseek-r1"]
SPEACHES_LLM_MODELS = ["llama3", "mistral", "phi3", "qwen2", "gemma2", "deepseek-r1"]
@register_llm
class SelfHostedLLMConfiguration(BaseLLMConfiguration):
provider: Literal[ServiceProviders.SELF_HOSTED] = ServiceProviders.SELF_HOSTED
class SpeachesLLMConfiguration(BaseLLMConfiguration):
provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
model: str = Field(
default="llama3", json_schema_extra={"examples": SELF_HOSTED_LLM_MODELS}
default="llama3",
json_schema_extra={
"examples": SPEACHES_LLM_MODELS,
"allow_custom_input": True,
},
)
base_url: str = Field(
default="http://localhost:11434/v1",
@ -276,7 +288,7 @@ LLMConfig = Annotated[
AzureLLMService,
DograhLLMService,
AWSBedrockLLMConfiguration,
SelfHostedLLMConfiguration,
SpeachesLLMConfiguration,
],
Field(discriminator="provider"),
]
@ -462,6 +474,34 @@ class CambTTSConfiguration(BaseTTSConfiguration):
language: str = Field(default="en-us", description="BCP-47 language code")
# Suggested model identifiers exposed to the UI as schema "examples".
# "kokoro" comes first so that the field default below is itself one of the
# suggestions; otherwise a fresh configuration would show a value that is not
# selectable in the suggestions dropdown.
SPEACHES_TTS_MODELS = ["kokoro", "hexgrad/Kokoro-82M"]


@register_tts
class SpeachesTTSConfiguration(BaseTTSConfiguration):
    """Text-to-speech settings for the Speaches provider.

    The service is reached through ``base_url``; ``api_key`` is optional and
    defaults to ``None``. ``model`` and ``voice`` are marked with
    ``allow_custom_input`` in their JSON schema so values outside the
    suggested examples are accepted.
    """

    provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
    # The default must stay a member of SPEACHES_TTS_MODELS.
    model: str = Field(
        default="kokoro",
        json_schema_extra={
            "examples": SPEACHES_TTS_MODELS,
            "allow_custom_input": True,
        },
    )
    # No examples are supplied for the voice; only the custom-input flag.
    voice: str = Field(
        default="af_heart",
        json_schema_extra={"allow_custom_input": True},
        description="Voice ID for the TTS engine",
    )
    base_url: str = Field(
        default="http://localhost:8000/v1",
        description="OpenAI-compatible TTS endpoint (Kokoro-FastAPI, etc.)",
    )
    # Bounded by the validator to the inclusive range 0.25..4.0.
    speed: float = Field(
        default=1.0, ge=0.25, le=4.0, description="Speech speed (0.25 to 4.0)"
    )
    # Overrides the base field so that a key may be omitted entirely.
    api_key: str | list[str] | None = Field(default=None)
TTSConfig = Annotated[
Union[
DeepgramTTSConfiguration,
@ -471,6 +511,7 @@ TTSConfig = Annotated[
DograhTTSService,
SarvamTTSConfiguration,
CambTTSConfiguration,
SpeachesTTSConfiguration,
],
Field(discriminator="provider"),
]
@ -674,6 +715,29 @@ class SpeechmaticsSTTConfiguration(BaseSTTConfiguration):
)
# Model identifiers suggested to the user; published as the "examples" of the
# ``model`` field below.
SPEACHES_STT_MODELS = [
    "Systran/faster-distil-whisper-small.en",
    "Systran/faster-whisper-large-v3",
]


@register_stt
class SpeachesSTTConfiguration(BaseSTTConfiguration):
    """Speech-to-text settings for the Speaches provider.

    ``model`` accepts any string (the schema flags it with
    ``allow_custom_input``), ``base_url`` points at the server to use, and
    ``api_key`` may be left unset.
    """

    provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
    model: str = Field(
        "Systran/faster-distil-whisper-small.en",
        json_schema_extra=dict(
            examples=SPEACHES_STT_MODELS,
            allow_custom_input=True,
        ),
    )
    base_url: str = Field(
        "http://localhost:8000/v1",
        description="OpenAI-compatible STT endpoint (Speaches, etc.)",
    )
    # A key is optional for this provider, so None is an allowed value.
    api_key: str | list[str] | None = Field(None)
STTConfig = Annotated[
Union[
DeepgramSTTConfiguration,
@ -682,6 +746,7 @@ STTConfig = Annotated[
DograhSTTService,
SpeechmaticsSTTConfiguration,
SarvamSTTConfiguration,
SpeachesSTTConfiguration,
],
Field(discriminator="provider"),
]

View file

@ -27,11 +27,16 @@ from pipecat.services.google.llm import GoogleLLMService, GoogleLLMSettings
from pipecat.services.groq.llm import GroqLLMService, GroqLLMSettings
from pipecat.services.openai.base_llm import OpenAILLMSettings
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.services.openai.stt import OpenAISTTService, OpenAISTTSettings
from pipecat.services.openai.stt import (
OpenAISTTService,
OpenAISTTSettings,
)
from pipecat.services.openai.tts import OpenAITTSService, OpenAITTSSettings
from pipecat.services.openrouter.llm import OpenRouterLLMService, OpenRouterLLMSettings
from pipecat.services.sarvam.stt import SarvamSTTService, SarvamSTTSettings
from pipecat.services.sarvam.tts import SarvamTTSService, SarvamTTSSettings
from pipecat.services.speaches.stt import SpeachesSTTService, SpeachesSTTSettings
from pipecat.services.speaches.tts import SpeachesTTSService, SpeachesTTSSettings
from pipecat.services.speechmatics.stt import (
SpeechmaticsSTTService,
SpeechmaticsSTTSettings,
@ -137,6 +142,20 @@ def create_stt_service(
),
sample_rate=audio_config.transport_in_sample_rate,
)
elif user_config.stt.provider == ServiceProviders.SPEACHES.value:
base_url = user_config.stt.base_url.replace("http://", "ws://").replace(
"https://", "wss://"
)
language = getattr(user_config.stt, "language", None) or "multi"
return SpeachesSTTService(
base_url=base_url,
api_key=user_config.stt.api_key or "none",
settings=SpeachesSTTSettings(
model=user_config.stt.model,
language=language,
),
sample_rate=audio_config.transport_in_sample_rate,
)
elif user_config.stt.provider == ServiceProviders.SPEECHMATICS.value:
from pipecat.services.speechmatics.stt import (
AdditionalVocabEntry,
@ -261,6 +280,18 @@ def create_tts_service(user_config, audio_config: "AudioConfig"):
# Set language directly as BCP-47 code (bypasses Language enum conversion)
tts._settings.language = language
return tts
elif user_config.tts.provider == ServiceProviders.SPEACHES.value:
return SpeachesTTSService(
base_url=user_config.tts.base_url,
api_key=user_config.tts.api_key or "none",
settings=SpeachesTTSSettings(
model=user_config.tts.model,
voice=user_config.tts.voice,
speed=user_config.tts.speed,
),
text_filters=[xml_function_tag_filter],
silence_time_s=1.0,
)
elif user_config.tts.provider == ServiceProviders.SARVAM.value:
# Map Sarvam language code to pipecat Language enum for TTS
language_mapping = {
@ -363,7 +394,7 @@ def create_llm_service_from_provider(
aws_region=aws_region,
settings=AWSBedrockLLMSettings(model=model),
)
elif provider == ServiceProviders.SELF_HOSTED.value:
elif provider == ServiceProviders.SPEACHES.value:
return OpenAILLMService(
base_url=base_url or "http://localhost:11434/v1",
api_key=api_key or "none",
@ -384,7 +415,7 @@ def create_llm_service(user_config):
kwargs["base_url"] = user_config.llm.base_url
elif provider == ServiceProviders.AZURE.value:
kwargs["endpoint"] = user_config.llm.endpoint
elif provider == ServiceProviders.SELF_HOSTED.value:
elif provider == ServiceProviders.SPEACHES.value:
kwargs["base_url"] = user_config.llm.base_url
elif provider == ServiceProviders.AWS_BEDROCK.value:
kwargs["aws_access_key"] = user_config.llm.aws_access_key

View file

@ -6,9 +6,12 @@ inline WebSocket media streaming.
import json
import random
from typing import Any, Dict, List, Optional, TYPE_CHECKING
from typing import TYPE_CHECKING, Any, Dict, List, Optional
import aiohttp
from fastapi import HTTPException
from loguru import logger
from api.enums import WorkflowRunMode
from api.services.telephony.base import (
CallInitiationResult,
@ -16,8 +19,6 @@ from api.services.telephony.base import (
TelephonyProvider,
)
from api.utils.common import get_backend_endpoints
from fastapi import HTTPException
from loguru import logger
if TYPE_CHECKING:
from fastapi import WebSocket

File diff suppressed because one or more lines are too long

View file

@ -24,6 +24,7 @@ interface SchemaProperty {
enum?: string[];
examples?: string[];
model_options?: Record<string, string[]>;
allow_custom_input?: boolean;
$ref?: string;
description?: string;
format?: string;
@ -80,8 +81,7 @@ export default function ServiceConfiguration() {
stt: [""],
embeddings: [""],
});
const [isManualModelInput, setIsManualModelInput] = useState(false);
const [hasCheckedManualMode, setHasCheckedManualMode] = useState(false);
const [isCustomInput, setIsCustomInput] = useState<Record<string, boolean>>({});
const {
register,
@ -165,39 +165,39 @@ export default function ServiceConfiguration() {
setServicePropertyValues("stt");
setServicePropertyValues("embeddings");
// Detect saved values that are not in suggested options (custom value)
const detectedCustomInput: Record<string, boolean> = {};
const allSchemas = response.data as Record<string, Record<string, ProviderSchema>>;
(["llm", "tts", "stt", "embeddings"] as ServiceSegment[]).forEach(service => {
const provider = selectedProviders[service];
const providerSchema = allSchemas[service]?.[provider];
if (!providerSchema) return;
Object.entries(providerSchema.properties).forEach(([field, schema]) => {
const actualSchema = (schema as SchemaProperty).$ref && providerSchema.$defs
? providerSchema.$defs[(schema as SchemaProperty).$ref!.split('/').pop() || '']
: schema as SchemaProperty;
if (!actualSchema?.allow_custom_input || !actualSchema?.examples) return;
const savedValue = userConfig?.[service]?.[field] as string | undefined;
if (savedValue && !actualSchema.examples.includes(savedValue)) {
detectedCustomInput[`${service}_${field}`] = true;
}
});
});
// IMPORTANT: Reset form values BEFORE changing providers
// Otherwise, Radix Select sees old values that don't match new provider's enum
// and calls onValueChange('') to clear "invalid" values
reset(defaultValues);
setApiKeys(loadedApiKeys);
setServiceProviders(selectedProviders);
setIsCustomInput(detectedCustomInput);
};
fetchConfigurations();
}, [reset, userConfig]);
// Check if the saved LLM model is not in the suggested options (custom model)
useEffect(() => {
if (hasCheckedManualMode) return;
const currentProvider = serviceProviders.llm;
const providerSchema = schemas?.llm?.[currentProvider];
if (!providerSchema) return;
const modelSchema = providerSchema.properties.model;
const actualModelSchema = modelSchema?.$ref && providerSchema.$defs
? providerSchema.$defs[modelSchema.$ref.split('/').pop() || '']
: modelSchema;
if (actualModelSchema?.examples && userConfig?.llm?.model) {
const savedModel = userConfig.llm.model as string;
const isInOptions = actualModelSchema.examples.includes(savedModel);
if (!isInOptions) {
setIsManualModelInput(true);
}
setHasCheckedManualMode(true);
}
}, [schemas, serviceProviders.llm, userConfig?.llm?.model, hasCheckedManualMode]);
// Reset voice when TTS model changes if the provider has model-dependent voice options
const ttsModel = watch("tts_model");
useEffect(() => {
@ -256,10 +256,14 @@ export default function ServiceConfiguration() {
setServiceProviders(prev => ({ ...prev, [service]: providerName }));
setApiKeys(prev => ({ ...prev, [service]: [""] }));
// Reset manual model input when LLM provider changes
if (service === "llm") {
setIsManualModelInput(false);
}
// Reset custom input toggles when provider changes
setIsCustomInput(prev => {
const next = { ...prev };
Object.keys(next).forEach(key => {
if (key.startsWith(`${service}_`)) delete next[key];
});
return next;
});
}
@ -459,15 +463,13 @@ export default function ServiceConfiguration() {
? providerSchema.$defs[schema.$ref.split('/').pop() || '']
: schema;
// Use VoiceSelector for voice field in TTS service (except Sarvam which uses predefined options)
if (service === "tts" && field === "voice") {
const currentProvider = serviceProviders.tts;
// Sarvam uses predefined voice options, not VoiceSelector
// VoiceSelector for TTS voice fields without predefined options or manual input flag
if (service === "tts" && field === "voice" && !actualSchema?.allow_custom_input) {
const hasVoiceOptions = actualSchema?.enum || actualSchema?.examples;
if (currentProvider !== "sarvam" && !hasVoiceOptions) {
if (!hasVoiceOptions) {
return (
<VoiceSelector
provider={currentProvider}
provider={serviceProviders.tts}
value={watch(`${service}_${field}`) as string || ""}
onChange={(voiceId) => {
setValue(`${service}_${field}`, voiceId, { shouldDirty: true });
@ -477,39 +479,36 @@ export default function ServiceConfiguration() {
}
}
// Handle LLM model field with manual input toggle (uses examples from schema)
if (service === "llm" && field === "model" && actualSchema?.examples) {
const currentValue = watch(`${service}_${field}`) as string || "";
const modelOptions = actualSchema.examples;
// Generic allow_custom_input handler for any field (model, voice with options, etc.)
if (actualSchema?.allow_custom_input && actualSchema?.examples) {
const fieldKey = `${service}_${field}`;
const currentValue = watch(fieldKey) as string || "";
const options = actualSchema.examples;
if (isManualModelInput) {
if (isCustomInput[fieldKey]) {
return (
<div className="space-y-2">
<Input
type="text"
placeholder="Enter model name"
placeholder={`Enter ${field}`}
value={currentValue}
onChange={(e) => {
setValue(`${service}_${field}`, e.target.value, { shouldDirty: true });
setValue(fieldKey, e.target.value, { shouldDirty: true });
}}
/>
<div className="flex items-center space-x-2">
<Checkbox
id="manual-model-input"
checked={isManualModelInput}
id={`custom-input-${fieldKey}`}
checked={true}
onCheckedChange={(checked) => {
setIsManualModelInput(checked as boolean);
if (!checked && modelOptions.length > 0) {
// Reset to first option when switching back
setValue(`${service}_${field}`, modelOptions[0], { shouldDirty: true });
setIsCustomInput(prev => ({ ...prev, [fieldKey]: checked as boolean }));
if (!checked && options.length > 0) {
setValue(fieldKey, options[0], { shouldDirty: true });
}
}}
/>
<Label
htmlFor="manual-model-input"
className="text-sm font-normal cursor-pointer"
>
Add Model Manually
<Label htmlFor={`custom-input-${fieldKey}`} className="text-sm font-normal cursor-pointer">
Enter Custom Value
</Label>
</div>
</div>
@ -522,14 +521,14 @@ export default function ServiceConfiguration() {
value={currentValue}
onValueChange={(value) => {
if (!value) return;
setValue(`${service}_${field}`, value, { shouldDirty: true });
setValue(fieldKey, value, { shouldDirty: true });
}}
>
<SelectTrigger className="w-full">
<SelectValue placeholder="Select model" />
<SelectValue placeholder={`Select ${field}`} />
</SelectTrigger>
<SelectContent>
{modelOptions.map((value: string) => (
{options.map((value: string) => (
<SelectItem key={value} value={value}>
{value}
</SelectItem>
@ -538,17 +537,14 @@ export default function ServiceConfiguration() {
</Select>
<div className="flex items-center space-x-2">
<Checkbox
id="manual-model-input-dropdown"
checked={isManualModelInput}
id={`custom-input-${fieldKey}-dropdown`}
checked={false}
onCheckedChange={(checked) => {
setIsManualModelInput(checked as boolean);
setIsCustomInput(prev => ({ ...prev, [fieldKey]: checked as boolean }));
}}
/>
<Label
htmlFor="manual-model-input-dropdown"
className="text-sm font-normal cursor-pointer"
>
Add Model Manually
<Label htmlFor={`custom-input-${fieldKey}-dropdown`} className="text-sm font-normal cursor-pointer">
Enter Custom Value
</Label>
</div>
</div>