diff --git a/api/Dockerfile b/api/Dockerfile
index e1a7463b..e3244125 100644
--- a/api/Dockerfile
+++ b/api/Dockerfile
@@ -37,7 +37,7 @@ RUN --mount=type=bind,source=api/requirements.txt,target=/tmp/req.txt \
 # sys.prefix/nltk_data, so it travels with the venv on COPY.
 RUN --mount=type=bind,source=pipecat,target=/tmp/pipecat,rw \
     --mount=type=cache,target=/root/.cache/uv \
-    uv pip install '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp]' \
+    uv pip install '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp,inworld]' \
     && uv pip uninstall opencv-python \
     && uv pip install opencv-python-headless \
     && python -c "import nltk; nltk.download('punkt_tab', download_dir='/opt/venv/nltk_data', quiet=True)"
diff --git a/api/services/configuration/check_validity.py b/api/services/configuration/check_validity.py
index e8f5bfa7..5fa8f2b5 100644
--- a/api/services/configuration/check_validity.py
+++ b/api/services/configuration/check_validity.py
@@ -1,5 +1,6 @@
 from typing import Optional, TypedDict
 
+import httpx
 import openai
 from deepgram import DeepgramClient
 from groq import Groq
@@ -38,6 +39,7 @@ class UserConfigurationValidator:
             ServiceProviders.DEEPGRAM.value: self._check_deepgram_api_key,
             ServiceProviders.GROQ.value: self._check_groq_api_key,
             ServiceProviders.OPENROUTER.value: self._check_openrouter_api_key,
+            ServiceProviders.INWORLD.value: self._check_inworld_api_key,
             ServiceProviders.ELEVENLABS.value: self._validate_elevenlabs_api_key,
             ServiceProviders.GOOGLE.value: self._check_google_api_key,
             ServiceProviders.AZURE.value: self._check_azure_api_key,
@@ -343,6 +345,38 @@ class UserConfigurationValidator:
     def _check_openrouter_api_key(self, model: str, api_key: str) -> bool:
         return True
 
+    def _check_inworld_api_key(self, model: str, api_key: str) -> bool:
+        """Validate an Inworld API key by requesting a single voice.
+
+        Raises:
+            ValueError: If the key is rejected (401/403), the API returns
+                another error status, or the request cannot be completed.
+        """
+        try:
+            response = httpx.get(
+                "https://api.inworld.ai/voices/v1/voices",
+                headers={"Authorization": f"Basic {api_key}"},
+                params={"pageSize": 1},
+                timeout=10.0,
+            )
+            response.raise_for_status()
+            return True
+        except httpx.HTTPStatusError as exc:
+            if exc.response.status_code in (401, 403):
+                raise ValueError(
+                    "Invalid Inworld API key. The key was rejected by the Inworld API. "
+                    "Please verify that your API key is correct, active, and has voice read access."
+                ) from exc
+            raise ValueError(
+                "The Inworld API returned an error while validating the API key. "
+                "Please try again later."
+            ) from exc
+        except httpx.RequestError as exc:
+            raise ValueError(
+                "Could not connect to the Inworld API. Please check your network connection "
+                "and try again."
+            ) from exc
+
     def _check_grok_realtime_api_key(self, model: str, api_key: str) -> bool:
         return True
 
diff --git a/api/services/configuration/registry.py b/api/services/configuration/registry.py
index 9fa9ee3b..7791e7f6 100644
--- a/api/services/configuration/registry.py
+++ b/api/services/configuration/registry.py
@@ -56,6 +56,7 @@ class ServiceProviders(str, Enum):
     DEEPGRAM = "deepgram"
     GROQ = "groq"
     OPENROUTER = "openrouter"
+    INWORLD = "inworld"
     CARTESIA = "cartesia"
     # NEUPHONIC = "neuphonic"
     ELEVENLABS = "elevenlabs"
@@ -87,6 +88,7 @@ class BaseServiceConfiguration(BaseModel):
         ServiceProviders.DEEPGRAM,
         ServiceProviders.GROQ,
         ServiceProviders.OPENROUTER,
+        ServiceProviders.INWORLD,
         ServiceProviders.ELEVENLABS,
         ServiceProviders.GOOGLE,
         ServiceProviders.AZURE,
@@ -240,6 +242,14 @@ GOOGLE_VERTEX_REALTIME_PROVIDER_MODEL_CONFIG = provider_model_config(
 DEEPGRAM_PROVIDER_MODEL_CONFIG = provider_model_config("Deepgram")
 ELEVENLABS_PROVIDER_MODEL_CONFIG = provider_model_config("ElevenLabs")
 CARTESIA_PROVIDER_MODEL_CONFIG = provider_model_config("Cartesia")
+INWORLD_PROVIDER_MODEL_CONFIG = provider_model_config(
+    "Inworld",
+    description=(
+        "Inworld AI streaming text-to-speech with built-in and cloned voices. "
+        "Defaults to the Ashley system voice on inworld-tts-2."
+    ),
+    provider_docs_url="https://docs.inworld.ai/tts/tts",
+)
 SARVAM_PROVIDER_MODEL_CONFIG = provider_model_config("Sarvam")
 CAMB_PROVIDER_MODEL_CONFIG = provider_model_config("Camb.ai")
 RIME_PROVIDER_MODEL_CONFIG = provider_model_config("Rime")
@@ -912,6 +922,9 @@ class DograhTTSService(BaseTTSConfiguration):
 
 
 CARTESIA_TTS_MODELS = ["sonic-3"]
+INWORLD_TTS_MODELS = ["inworld-tts-2"]
+INWORLD_TTS_VOICES = ["Ashley"]
+INWORLD_TTS_LANGUAGES = ["en-US"]
 
 
 @register_tts
@@ -936,6 +949,46 @@ class CartesiaTTSConfiguration(BaseTTSConfiguration):
     )
 
 
+@register_tts
+class InworldTTSConfiguration(BaseTTSConfiguration):
+    model_config = INWORLD_PROVIDER_MODEL_CONFIG
+    provider: Literal[ServiceProviders.INWORLD] = ServiceProviders.INWORLD
+    model: str = Field(
+        default="inworld-tts-2",
+        description="Inworld TTS model.",
+        json_schema_extra={"examples": INWORLD_TTS_MODELS, "allow_custom_input": True},
+    )
+    voice: str = Field(
+        default="Ashley",
+        description=(
+            "Inworld voice ID. Use Ashley for the default warm English voice, "
+            "or a workspace voice ID for a cloned/custom voice."
+        ),
+        json_schema_extra={"examples": INWORLD_TTS_VOICES, "allow_custom_input": True},
+    )
+    language: str = Field(
+        default="en-US",
+        description="BCP-47 language code for synthesis.",
+        json_schema_extra={
+            "examples": INWORLD_TTS_LANGUAGES,
+            "allow_custom_input": True,
+        },
+    )
+    speed: float = Field(
+        default=1.0,
+        ge=0.25,
+        le=4.0,
+        description="Speech speed multiplier.",
+    )
+    delivery_mode: Literal["STABLE", "BALANCED", "CREATIVE"] = Field(
+        default="BALANCED",
+        description=(
+            "Controls stability versus expressiveness for inworld-tts-2 "
+            "(STABLE, BALANCED, or CREATIVE)."
+        ),
+    )
+
+
 @register_tts
 class SarvamTTSConfiguration(BaseTTSConfiguration):
     model_config = SARVAM_PROVIDER_MODEL_CONFIG
@@ -1127,6 +1180,7 @@ TTSConfig = Annotated[
         OpenAITTSService,
         ElevenlabsTTSConfiguration,
         CartesiaTTSConfiguration,
+        InworldTTSConfiguration,
         DograhTTSService,
         SarvamTTSConfiguration,
         CambTTSConfiguration,
diff --git a/api/services/pipecat/inworld_tts.py b/api/services/pipecat/inworld_tts.py
new file mode 100644
index 00000000..42558ea4
--- /dev/null
+++ b/api/services/pipecat/inworld_tts.py
@@ -0,0 +1,27 @@
+"""Inworld TTS wrapper that closes its aiohttp session in cleanup().
+
+Pipecat's InworldHttpTTSService leaves session disposal to the caller. Our
+factory creates a fresh session per service instance, so we own its close here
+to avoid leaking sockets/FDs on shutdown.
+"""
+
+import aiohttp
+
+from pipecat.services.inworld.tts import InworldHttpTTSService
+
+
+class InworldOwnedSessionTTSService(InworldHttpTTSService):
+    """InworldHttpTTSService variant that owns its aiohttp session lifecycle."""
+
+    def __init__(self, *args, aiohttp_session: aiohttp.ClientSession, **kwargs):
+        super().__init__(*args, aiohttp_session=aiohttp_session, **kwargs)
+        self._owned_session = aiohttp_session
+
+    async def cleanup(self):
+        """Run the base cleanup, then close the owned aiohttp session."""
+        try:
+            await super().cleanup()
+        finally:
+            # Close even when the base cleanup raises so the session never leaks.
+            if not self._owned_session.closed:
+                await self._owned_session.close()
diff --git a/api/services/pipecat/service_factory.py b/api/services/pipecat/service_factory.py
index 8ed96e40..462a350f 100644
--- a/api/services/pipecat/service_factory.py
+++ b/api/services/pipecat/service_factory.py
@@ -7,6 +7,7 @@ from loguru import logger
 
 from api.constants import MPS_API_URL
 from api.services.configuration.registry import ServiceProviders
+from api.services.pipecat.inworld_tts import InworldOwnedSessionTTSService
 from api.services.pipecat.minimax_tts import MiniMaxOwnedSessionTTSService
 from api.utils.url_security import validate_user_configured_service_url
 from pipecat.services.assemblyai.stt import AssemblyAISTTService, AssemblyAISTTSettings
@@ -38,6 +39,7 @@ from pipecat.services.google.vertex.llm import (
     GoogleVertexLLMService,
     GoogleVertexLLMSettings,
 )
 from pipecat.services.groq.llm import GroqLLMService, GroqLLMSettings
+from pipecat.services.inworld.tts import InworldTTSSettings
 from pipecat.services.minimax.llm import MiniMaxLLMService
 from pipecat.services.minimax.tts import MiniMaxTTSSettings
@@ -398,6 +400,29 @@ def create_tts_service(user_config, audio_config: "AudioConfig"):
             skip_aggregator_types=["recording_router", "recording"],
             silence_time_s=1.0,
         )
+    elif user_config.tts.provider == ServiceProviders.INWORLD.value:
+        voice = getattr(user_config.tts, "voice", None) or "Ashley"
+        model = getattr(user_config.tts, "model", None) or "inworld-tts-2"
+        speed = getattr(user_config.tts, "speed", None)
+        language = getattr(user_config.tts, "language", None) or "en-US"
+        delivery_mode = getattr(user_config.tts, "delivery_mode", None) or "BALANCED"
+        # The wrapper owns this session and closes it in its cleanup().
+        session = aiohttp.ClientSession()
+        return InworldOwnedSessionTTSService(
+            api_key=user_config.tts.api_key,
+            aiohttp_session=session,
+            streaming=True,
+            settings=InworldTTSSettings(
+                voice=voice,
+                model=model,
+                language=language,
+                speaking_rate=speed,
+                delivery_mode=delivery_mode,
+            ),
+            text_filters=[xml_function_tag_filter],
+            skip_aggregator_types=["recording_router", "recording"],
+            silence_time_s=1.0,
+        )
     elif user_config.tts.provider == ServiceProviders.DOGRAH.value:
         # Convert HTTP URL to WebSocket URL for TTS
         base_url = MPS_API_URL.replace("http://", "ws://").replace("https://", "wss://")
diff --git a/scripts/setup_pipecat.sh b/scripts/setup_pipecat.sh
index 04821b0d..6da6844a 100755
--- a/scripts/setup_pipecat.sh
+++ b/scripts/setup_pipecat.sh
@@ -20,6 +20,6 @@ pip install -r api/requirements.txt
 
 # Install pipecat from submodule last so it overrides any pipecat-ai pulled in by dependencies
 echo "Installing pipecat dependencies..."
-pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb]
+pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,inworld]
 
-echo "Setup complete! Pipecat is now available as a git submodule."
\ No newline at end of file
+echo "Setup complete! Pipecat is now available as a git submodule."
diff --git a/scripts/setup_requirements.sh b/scripts/setup_requirements.sh
index f744a2f8..8074661f 100755
--- a/scripts/setup_requirements.sh
+++ b/scripts/setup_requirements.sh
@@ -80,7 +80,7 @@ fi
 
 # Install pipecat in editable mode with all extras
 echo "Installing pipecat dependencies..."
-uv pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp]
+uv pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp,inworld]
 
 if [ "$DEV_MODE" -eq 1 ]; then
     echo "Installing pipecat dev dependencies..."