feat: add speaches models

2026-06-22 08:38:13 +02:00 · 2026-03-26 22:04:45 +05:30 · 2026-03-26 22:04:45 +05:30 · ec46878848
commit ec46878848
parent 123114fb94
6 changed files with 185 additions and 91 deletions
--- a/api/services/configuration/check_validity.py
+++ b/api/services/configuration/check_validity.py
@ -40,7 +40,7 @@ class UserConfigurationValidator:
            ServiceProviders.SPEECHMATICS.value: self._check_speechmatics_api_key,
            ServiceProviders.CAMB.value: self._check_camb_api_key,
            ServiceProviders.AWS_BEDROCK.value: self._check_aws_bedrock_api_key,
-            ServiceProviders.SELF_HOSTED.value: self._check_self_hosted_api_key,
+            ServiceProviders.SPEACHES.value: self._check_speaches_api_key,
        }

    async def validate(self, configuration: UserConfiguration) -> APIKeyStatusResponse:
@ -75,10 +75,10 @@ class UserConfigurationValidator:

        provider = service_config.provider

-        # Self-hosted doesn't require an API key
-        if provider == ServiceProviders.SELF_HOSTED.value:
+        # Speaches doesn't require an API key; only base_url is validated
+        if provider == ServiceProviders.SPEACHES.value:
            try:
-                if not self._check_self_hosted_api_key(provider, service_config):
+                if not self._check_speaches_api_key(provider, service_config):
                    return [
                        {
                            "model": service_name,
@ -179,9 +179,9 @@ class UserConfigurationValidator:
    def _check_camb_api_key(self, model: str, api_key: str) -> bool:
        return True

-    def _check_self_hosted_api_key(self, model: str, service_config) -> bool:
+    def _check_speaches_api_key(self, model: str, service_config) -> bool:
        if not getattr(service_config, "base_url", None):
-            raise ValueError("base_url is required for self-hosted LLM")
+            raise ValueError("base_url is required for Speaches services")
        return True

    def _check_aws_bedrock_api_key(self, model: str, service_config) -> bool:
--- a/api/services/configuration/registry.py
+++ b/api/services/configuration/registry.py
@ -27,7 +27,7 @@ class ServiceProviders(str, Enum):
    SPEECHMATICS = "speechmatics"
    CAMB = "camb"
    AWS_BEDROCK = "aws_bedrock"
-    SELF_HOSTED = "self_hosted"
+    SPEACHES = "speaches"


 class BaseServiceConfiguration(BaseModel):
@ -41,7 +41,7 @@ class BaseServiceConfiguration(BaseModel):
        ServiceProviders.AZURE,
        ServiceProviders.DOGRAH,
        ServiceProviders.AWS_BEDROCK,
-        ServiceProviders.SELF_HOSTED,
+        ServiceProviders.SPEACHES,
        # ServiceProviders.SARVAM,
    ]
    api_key: str | list[str]
@ -191,14 +191,18 @@ AWS_BEDROCK_MODELS = [
@register_llm
 class OpenAILLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.OPENAI] = ServiceProviders.OPENAI
-    model: str = Field(default="gpt-4.1", json_schema_extra={"examples": OPENAI_MODELS})
+    model: str = Field(
+        default="gpt-4.1",
+        json_schema_extra={"examples": OPENAI_MODELS, "allow_custom_input": True},
+    )


@register_llm
 class GoogleLLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.GOOGLE] = ServiceProviders.GOOGLE
    model: str = Field(
-        default="gemini-2.0-flash", json_schema_extra={"examples": GOOGLE_MODELS}
+        default="gemini-2.0-flash",
+        json_schema_extra={"examples": GOOGLE_MODELS, "allow_custom_input": True},
    )


@ -206,7 +210,8 @@ class GoogleLLMService(BaseLLMConfiguration):
 class GroqLLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.GROQ] = ServiceProviders.GROQ
    model: str = Field(
-        default="llama-3.3-70b-versatile", json_schema_extra={"examples": GROQ_MODELS}
+        default="llama-3.3-70b-versatile",
+        json_schema_extra={"examples": GROQ_MODELS, "allow_custom_input": True},
    )


@ -214,7 +219,8 @@ class GroqLLMService(BaseLLMConfiguration):
 class OpenRouterLLMConfiguration(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.OPENROUTER] = ServiceProviders.OPENROUTER
    model: str = Field(
-        default="openai/gpt-4.1", json_schema_extra={"examples": OPENROUTER_MODELS}
+        default="openai/gpt-4.1",
+        json_schema_extra={"examples": OPENROUTER_MODELS, "allow_custom_input": True},
    )

    base_url: str = Field(default="https://openrouter.ai/api/v1")
@ -224,7 +230,8 @@ class OpenRouterLLMConfiguration(BaseLLMConfiguration):
 class AzureLLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.AZURE] = ServiceProviders.AZURE
    model: str = Field(
-        default="gpt-4.1-mini", json_schema_extra={"examples": AZURE_MODELS}
+        default="gpt-4.1-mini",
+        json_schema_extra={"examples": AZURE_MODELS, "allow_custom_input": True},
    )

    endpoint: str
@ -234,7 +241,8 @@ class AzureLLMService(BaseLLMConfiguration):
 class DograhLLMService(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.DOGRAH] = ServiceProviders.DOGRAH
    model: str = Field(
-        default="default", json_schema_extra={"examples": DOGRAH_LLM_MODELS}
+        default="default",
+        json_schema_extra={"examples": DOGRAH_LLM_MODELS, "allow_custom_input": True},
    )


@ -243,7 +251,7 @@ class AWSBedrockLLMConfiguration(BaseLLMConfiguration):
    provider: Literal[ServiceProviders.AWS_BEDROCK] = ServiceProviders.AWS_BEDROCK
    model: str = Field(
        default="us.amazon.nova-pro-v1:0",
-        json_schema_extra={"examples": AWS_BEDROCK_MODELS},
+        json_schema_extra={"examples": AWS_BEDROCK_MODELS, "allow_custom_input": True},
    )
    aws_access_key: str = Field(default="")
    aws_secret_key: str = Field(default="")
@ -251,14 +259,18 @@ class AWSBedrockLLMConfiguration(BaseLLMConfiguration):
    api_key: str | list[str] | None = Field(default=None)


-SELF_HOSTED_LLM_MODELS = ["llama3", "mistral", "phi3", "qwen2", "gemma2", "deepseek-r1"]
+SPEACHES_LLM_MODELS = ["llama3", "mistral", "phi3", "qwen2", "gemma2", "deepseek-r1"]


@register_llm
-class SelfHostedLLMConfiguration(BaseLLMConfiguration):
-    provider: Literal[ServiceProviders.SELF_HOSTED] = ServiceProviders.SELF_HOSTED
+class SpeachesLLMConfiguration(BaseLLMConfiguration):
+    provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
    model: str = Field(
-        default="llama3", json_schema_extra={"examples": SELF_HOSTED_LLM_MODELS}
+        default="llama3",
+        json_schema_extra={
+            "examples": SPEACHES_LLM_MODELS,
+            "allow_custom_input": True,
+        },
    )
    base_url: str = Field(
        default="http://localhost:11434/v1",
@ -276,7 +288,7 @@ LLMConfig = Annotated[
        AzureLLMService,
        DograhLLMService,
        AWSBedrockLLMConfiguration,
-        SelfHostedLLMConfiguration,
+        SpeachesLLMConfiguration,
    ],
    Field(discriminator="provider"),
 ]
@ -462,6 +474,34 @@ class CambTTSConfiguration(BaseTTSConfiguration):
    language: str = Field(default="en-us", description="BCP-47 language code")


+SPEACHES_TTS_MODELS = ["hexgrad/Kokoro-82M"]
+
+
+@register_tts
+class SpeachesTTSConfiguration(BaseTTSConfiguration):
+    provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
+    model: str = Field(
+        default="kokoro",
+        json_schema_extra={
+            "examples": SPEACHES_TTS_MODELS,
+            "allow_custom_input": True,
+        },
+    )
+    voice: str = Field(
+        default="af_heart",
+        json_schema_extra={"allow_custom_input": True},
+        description="Voice ID for the TTS engine",
+    )
+    base_url: str = Field(
+        default="http://localhost:8000/v1",
+        description="OpenAI-compatible TTS endpoint (Kokoro-FastAPI, etc.)",
+    )
+    speed: float = Field(
+        default=1.0, ge=0.25, le=4.0, description="Speech speed (0.25 to 4.0)"
+    )
+    api_key: str | list[str] | None = Field(default=None)
+
+
 TTSConfig = Annotated[
    Union[
        DeepgramTTSConfiguration,
@ -471,6 +511,7 @@ TTSConfig = Annotated[
        DograhTTSService,
        SarvamTTSConfiguration,
        CambTTSConfiguration,
+        SpeachesTTSConfiguration,
    ],
    Field(discriminator="provider"),
 ]
@ -674,6 +715,29 @@ class SpeechmaticsSTTConfiguration(BaseSTTConfiguration):
    )


+SPEACHES_STT_MODELS = [
+    "Systran/faster-distil-whisper-small.en",
+    "Systran/faster-whisper-large-v3",
+]
+
+
+@register_stt
+class SpeachesSTTConfiguration(BaseSTTConfiguration):
+    provider: Literal[ServiceProviders.SPEACHES] = ServiceProviders.SPEACHES
+    model: str = Field(
+        default="Systran/faster-distil-whisper-small.en",
+        json_schema_extra={
+            "examples": SPEACHES_STT_MODELS,
+            "allow_custom_input": True,
+        },
+    )
+    base_url: str = Field(
+        default="http://localhost:8000/v1",
+        description="OpenAI-compatible STT endpoint (Speaches, etc.)",
+    )
+    api_key: str | list[str] | None = Field(default=None)
+
+
 STTConfig = Annotated[
    Union[
        DeepgramSTTConfiguration,
@ -682,6 +746,7 @@ STTConfig = Annotated[
        DograhSTTService,
        SpeechmaticsSTTConfiguration,
        SarvamSTTConfiguration,
+        SpeachesSTTConfiguration,
    ],
    Field(discriminator="provider"),
 ]
--- a/api/services/pipecat/service_factory.py
+++ b/api/services/pipecat/service_factory.py
@ -27,11 +27,16 @@ from pipecat.services.google.llm import GoogleLLMService, GoogleLLMSettings
 from pipecat.services.groq.llm import GroqLLMService, GroqLLMSettings
 from pipecat.services.openai.base_llm import OpenAILLMSettings
 from pipecat.services.openai.llm import OpenAILLMService
-from pipecat.services.openai.stt import OpenAISTTService, OpenAISTTSettings
+from pipecat.services.openai.stt import (
+    OpenAISTTService,
+    OpenAISTTSettings,
+)
 from pipecat.services.openai.tts import OpenAITTSService, OpenAITTSSettings
 from pipecat.services.openrouter.llm import OpenRouterLLMService, OpenRouterLLMSettings
 from pipecat.services.sarvam.stt import SarvamSTTService, SarvamSTTSettings
 from pipecat.services.sarvam.tts import SarvamTTSService, SarvamTTSSettings
+from pipecat.services.speaches.stt import SpeachesSTTService, SpeachesSTTSettings
+from pipecat.services.speaches.tts import SpeachesTTSService, SpeachesTTSSettings
 from pipecat.services.speechmatics.stt import (
    SpeechmaticsSTTService,
    SpeechmaticsSTTSettings,
@ -137,6 +142,20 @@ def create_stt_service(
            ),
            sample_rate=audio_config.transport_in_sample_rate,
        )
+    elif user_config.stt.provider == ServiceProviders.SPEACHES.value:
+        base_url = user_config.stt.base_url.replace("http://", "ws://").replace(
+            "https://", "wss://"
+        )
+        language = getattr(user_config.stt, "language", None) or "multi"
+        return SpeachesSTTService(
+            base_url=base_url,
+            api_key=user_config.stt.api_key or "none",
+            settings=SpeachesSTTSettings(
+                model=user_config.stt.model,
+                language=language,
+            ),
+            sample_rate=audio_config.transport_in_sample_rate,
+        )
    elif user_config.stt.provider == ServiceProviders.SPEECHMATICS.value:
        from pipecat.services.speechmatics.stt import (
            AdditionalVocabEntry,
@ -261,6 +280,18 @@ def create_tts_service(user_config, audio_config: "AudioConfig"):
        # Set language directly as BCP-47 code (bypasses Language enum conversion)
        tts._settings.language = language
        return tts
+    elif user_config.tts.provider == ServiceProviders.SPEACHES.value:
+        return SpeachesTTSService(
+            base_url=user_config.tts.base_url,
+            api_key=user_config.tts.api_key or "none",
+            settings=SpeachesTTSSettings(
+                model=user_config.tts.model,
+                voice=user_config.tts.voice,
+                speed=user_config.tts.speed,
+            ),
+            text_filters=[xml_function_tag_filter],
+            silence_time_s=1.0,
+        )
    elif user_config.tts.provider == ServiceProviders.SARVAM.value:
        # Map Sarvam language code to pipecat Language enum for TTS
        language_mapping = {
@ -363,7 +394,7 @@ def create_llm_service_from_provider(
            aws_region=aws_region,
            settings=AWSBedrockLLMSettings(model=model),
        )
-    elif provider == ServiceProviders.SELF_HOSTED.value:
+    elif provider == ServiceProviders.SPEACHES.value:
        return OpenAILLMService(
            base_url=base_url or "http://localhost:11434/v1",
            api_key=api_key or "none",
@ -384,7 +415,7 @@ def create_llm_service(user_config):
        kwargs["base_url"] = user_config.llm.base_url
    elif provider == ServiceProviders.AZURE.value:
        kwargs["endpoint"] = user_config.llm.endpoint
-    elif provider == ServiceProviders.SELF_HOSTED.value:
+    elif provider == ServiceProviders.SPEACHES.value:
        kwargs["base_url"] = user_config.llm.base_url
    elif provider == ServiceProviders.AWS_BEDROCK.value:
        kwargs["aws_access_key"] = user_config.llm.aws_access_key
--- a/api/services/telephony/providers/telnyx_provider.py
+++ b/api/services/telephony/providers/telnyx_provider.py
@ -6,9 +6,12 @@ inline WebSocket media streaming.

 import json
 import random
-from typing import Any, Dict, List, Optional, TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, Dict, List, Optional

 import aiohttp
+from fastapi import HTTPException
+from loguru import logger
+
 from api.enums import WorkflowRunMode
 from api.services.telephony.base import (
    CallInitiationResult,
@ -16,8 +19,6 @@ from api.services.telephony.base import (
    TelephonyProvider,
 )
 from api.utils.common import get_backend_endpoints
-from fastapi import HTTPException
-from loguru import logger

 if TYPE_CHECKING:
    from fastapi import WebSocket
--- a/ui/src/client/sdk.gen.ts
+++ b/ui/src/client/sdk.gen.ts
--- a/ui/src/components/ServiceConfiguration.tsx
+++ b/ui/src/components/ServiceConfiguration.tsx
@ -24,6 +24,7 @@ interface SchemaProperty {
    enum?: string[];
    examples?: string[];
    model_options?: Record<string, string[]>;
+    allow_custom_input?: boolean;
    $ref?: string;
    description?: string;
    format?: string;
@ -80,8 +81,7 @@ export default function ServiceConfiguration() {
        stt: [""],
        embeddings: [""],
    });
-    const [isManualModelInput, setIsManualModelInput] = useState(false);
-    const [hasCheckedManualMode, setHasCheckedManualMode] = useState(false);
+    const [isCustomInput, setIsCustomInput] = useState<Record<string, boolean>>({});

    const {
        register,
@ -165,39 +165,39 @@ export default function ServiceConfiguration() {
            setServicePropertyValues("stt");
            setServicePropertyValues("embeddings");

+            // Detect saved values that are not in suggested options (custom value)
+            const detectedCustomInput: Record<string, boolean> = {};
+            const allSchemas = response.data as Record<string, Record<string, ProviderSchema>>;
+            (["llm", "tts", "stt", "embeddings"] as ServiceSegment[]).forEach(service => {
+                const provider = selectedProviders[service];
+                const providerSchema = allSchemas[service]?.[provider];
+                if (!providerSchema) return;
+
+                Object.entries(providerSchema.properties).forEach(([field, schema]) => {
+                    const actualSchema = (schema as SchemaProperty).$ref && providerSchema.$defs
+                        ? providerSchema.$defs[(schema as SchemaProperty).$ref!.split('/').pop() || '']
+                        : schema as SchemaProperty;
+
+                    if (!actualSchema?.allow_custom_input || !actualSchema?.examples) return;
+
+                    const savedValue = userConfig?.[service]?.[field] as string | undefined;
+                    if (savedValue && !actualSchema.examples.includes(savedValue)) {
+                        detectedCustomInput[`${service}_${field}`] = true;
+                    }
+                });
+            });
+
            // IMPORTANT: Reset form values BEFORE changing providers
            // Otherwise, Radix Select sees old values that don't match new provider's enum
            // and calls onValueChange('') to clear "invalid" values
            reset(defaultValues);
            setApiKeys(loadedApiKeys);
            setServiceProviders(selectedProviders);
+            setIsCustomInput(detectedCustomInput);
        };
        fetchConfigurations();
    }, [reset, userConfig]);

-    // Check if the saved LLM model is not in the suggested options (custom model)
-    useEffect(() => {
-        if (hasCheckedManualMode) return;
-
-        const currentProvider = serviceProviders.llm;
-        const providerSchema = schemas?.llm?.[currentProvider];
-        if (!providerSchema) return;
-
-        const modelSchema = providerSchema.properties.model;
-        const actualModelSchema = modelSchema?.$ref && providerSchema.$defs
-            ? providerSchema.$defs[modelSchema.$ref.split('/').pop() || '']
-            : modelSchema;
-
-        if (actualModelSchema?.examples && userConfig?.llm?.model) {
-            const savedModel = userConfig.llm.model as string;
-            const isInOptions = actualModelSchema.examples.includes(savedModel);
-            if (!isInOptions) {
-                setIsManualModelInput(true);
-            }
-            setHasCheckedManualMode(true);
-        }
-    }, [schemas, serviceProviders.llm, userConfig?.llm?.model, hasCheckedManualMode]);
-
    // Reset voice when TTS model changes if the provider has model-dependent voice options
    const ttsModel = watch("tts_model");
    useEffect(() => {
@ -256,10 +256,14 @@ export default function ServiceConfiguration() {
        setServiceProviders(prev => ({ ...prev, [service]: providerName }));
        setApiKeys(prev => ({ ...prev, [service]: [""] }));

-        // Reset manual model input when LLM provider changes
-        if (service === "llm") {
-            setIsManualModelInput(false);
-        }
+        // Reset only this service's custom input toggles when its provider changes
+        setIsCustomInput(prev => {
+            const next = { ...prev };
+            Object.keys(next).forEach(key => {
+                if (key.startsWith(`${service}_`)) delete next[key];
+            });
+            return next;
+        });
    }


@ -459,15 +463,13 @@ export default function ServiceConfiguration() {
            ? providerSchema.$defs[schema.$ref.split('/').pop() || '']
            : schema;

-        // Use VoiceSelector for voice field in TTS service (except Sarvam which uses predefined options)
-        if (service === "tts" && field === "voice") {
-            const currentProvider = serviceProviders.tts;
-            // Sarvam uses predefined voice options, not VoiceSelector
+        // Use VoiceSelector for TTS voice fields that have neither predefined options nor allow_custom_input set
+        if (service === "tts" && field === "voice" && !actualSchema?.allow_custom_input) {
            const hasVoiceOptions = actualSchema?.enum || actualSchema?.examples;
-            if (currentProvider !== "sarvam" && !hasVoiceOptions) {
+            if (!hasVoiceOptions) {
                return (
                    <VoiceSelector
-                        provider={currentProvider}
+                        provider={serviceProviders.tts}
                        value={watch(`${service}_${field}`) as string || ""}
                        onChange={(voiceId) => {
                            setValue(`${service}_${field}`, voiceId, { shouldDirty: true });
@ -477,39 +479,36 @@ export default function ServiceConfiguration() {
            }
        }

-        // Handle LLM model field with manual input toggle (uses examples from schema)
-        if (service === "llm" && field === "model" && actualSchema?.examples) {
-            const currentValue = watch(`${service}_${field}`) as string || "";
-            const modelOptions = actualSchema.examples;
+        // Generic allow_custom_input handler for any field (model, voice with options, etc.)
+        if (actualSchema?.allow_custom_input && actualSchema?.examples) {
+            const fieldKey = `${service}_${field}`;
+            const currentValue = watch(fieldKey) as string || "";
+            const options = actualSchema.examples;

-            if (isManualModelInput) {
+            if (isCustomInput[fieldKey]) {
                return (
                    <div className="space-y-2">
                        <Input
                            type="text"
-                            placeholder="Enter model name"
+                            placeholder={`Enter ${field}`}
                            value={currentValue}
                            onChange={(e) => {
-                                setValue(`${service}_${field}`, e.target.value, { shouldDirty: true });
+                                setValue(fieldKey, e.target.value, { shouldDirty: true });
                            }}
                        />
                        <div className="flex items-center space-x-2">
                            <Checkbox
-                                id="manual-model-input"
-                                checked={isManualModelInput}
+                                id={`custom-input-${fieldKey}`}
+                                checked={true}
                                onCheckedChange={(checked) => {
-                                    setIsManualModelInput(checked as boolean);
-                                    if (!checked && modelOptions.length > 0) {
-                                        // Reset to first option when switching back
-                                        setValue(`${service}_${field}`, modelOptions[0], { shouldDirty: true });
+                                    setIsCustomInput(prev => ({ ...prev, [fieldKey]: checked as boolean }));
+                                    if (!checked && options.length > 0) {
+                                        setValue(fieldKey, options[0], { shouldDirty: true });
                                    }
                                }}
                            />
-                            <Label
-                                htmlFor="manual-model-input"
-                                className="text-sm font-normal cursor-pointer"
-                            >
-                                Add Model Manually
+                            <Label htmlFor={`custom-input-${fieldKey}`} className="text-sm font-normal cursor-pointer">
+                                Enter Custom Value
                            </Label>
                        </div>
                    </div>
@ -522,14 +521,14 @@ export default function ServiceConfiguration() {
                        value={currentValue}
                        onValueChange={(value) => {
                            if (!value) return;
-                            setValue(`${service}_${field}`, value, { shouldDirty: true });
+                            setValue(fieldKey, value, { shouldDirty: true });
                        }}
                    >
                        <SelectTrigger className="w-full">
-                            <SelectValue placeholder="Select model" />
+                            <SelectValue placeholder={`Select ${field}`} />
                        </SelectTrigger>
                        <SelectContent>
-                            {modelOptions.map((value: string) => (
+                            {options.map((value: string) => (
                                <SelectItem key={value} value={value}>
                                    {value}
                                </SelectItem>
@ -538,17 +537,14 @@ export default function ServiceConfiguration() {
                    </Select>
                    <div className="flex items-center space-x-2">
                        <Checkbox
-                            id="manual-model-input-dropdown"
-                            checked={isManualModelInput}
+                            id={`custom-input-${fieldKey}-dropdown`}
+                            checked={false}
                            onCheckedChange={(checked) => {
-                                setIsManualModelInput(checked as boolean);
+                                setIsCustomInput(prev => ({ ...prev, [fieldKey]: checked as boolean }));
                            }}
                        />
-                        <Label
-                            htmlFor="manual-model-input-dropdown"
-                            className="text-sm font-normal cursor-pointer"
-                        >
-                            Add Model Manually
+                        <Label htmlFor={`custom-input-${fieldKey}-dropdown`} className="text-sm font-normal cursor-pointer">
+                            Enter Custom Value
                        </Label>
                    </div>
                </div>