From fc37d5058fe452b20768a43271cf196bef082249 Mon Sep 17 00:00:00 2001 From: Manasseh <112127696+manasseh-zw@users.noreply.github.com> Date: Fri, 19 Jun 2026 09:53:27 +0200 Subject: [PATCH] feat: add Inworld TTS provider support (#420) * Add Inworld TTS provider integration * chore: move from HTTP Service to Websocket Service --------- Co-authored-by: Abhishek Kumar --- api/Dockerfile | 2 +- api/services/configuration/check_validity.py | 28 ++++++++++ api/services/configuration/registry.py | 54 +++++++++++++++++++ api/services/pipecat/service_factory.py | 20 +++++++ api/tests/test_inworld_tts_service_factory.py | 54 +++++++++++++++++++ scripts/setup_pipecat.sh | 4 +- scripts/setup_requirements.sh | 2 +- 7 files changed, 160 insertions(+), 4 deletions(-) create mode 100644 api/tests/test_inworld_tts_service_factory.py diff --git a/api/Dockerfile b/api/Dockerfile index e1a7463b..e3244125 100644 --- a/api/Dockerfile +++ b/api/Dockerfile @@ -37,7 +37,7 @@ RUN --mount=type=bind,source=api/requirements.txt,target=/tmp/req.txt \ # sys.prefix/nltk_data, so it travels with the venv on COPY. RUN --mount=type=bind,source=pipecat,target=/tmp/pipecat,rw \ --mount=type=cache,target=/root/.cache/uv \ - uv pip install '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp]' \ + uv pip install '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp,inworld]' \ && uv pip uninstall opencv-python \ && uv pip install opencv-python-headless \ && python -c "import nltk; nltk.download('punkt_tab', download_dir='/opt/venv/nltk_data', quiet=True)" diff --git a/api/services/configuration/check_validity.py b/api/services/configuration/check_validity.py index 7bd36752..3e97709c 100644 --- a/api/services/configuration/check_validity.py +++ b/api/services/configuration/check_validity.py @@ -1,5 +1,6 @@ from typing import Optional, TypedDict +import httpx import openai from deepgram import DeepgramClient from groq import Groq @@ -38,6 +39,7 @@ class UserConfigurationValidator: ServiceProviders.DEEPGRAM.value: self._check_deepgram_api_key, ServiceProviders.GROQ.value: self._check_groq_api_key, ServiceProviders.OPENROUTER.value: self._check_openrouter_api_key, + ServiceProviders.INWORLD.value: self._check_inworld_api_key, ServiceProviders.ELEVENLABS.value: self._validate_elevenlabs_api_key, ServiceProviders.GOOGLE.value: self._check_google_api_key, ServiceProviders.AZURE.value: self._check_azure_api_key, @@ -345,6 +347,32 @@ class UserConfigurationValidator: def _check_openrouter_api_key(self, model: str, api_key: str) -> bool: return True + def _check_inworld_api_key(self, model: str, api_key: str) -> bool: + try: + response = httpx.get( + "https://api.inworld.ai/voices/v1/voices", + headers={"Authorization": f"Basic {api_key}"}, + params={"pageSize": 1}, + timeout=10.0, + ) + response.raise_for_status() + return True + except httpx.HTTPStatusError as exc: + if exc.response.status_code in (401, 403): + raise ValueError( + "Invalid Inworld API key. The key was rejected by the Inworld API. " + "Please verify that your API key is correct, active, and has voice read access." + ) from exc + raise ValueError( + "The Inworld API returned an error while validating the API key. " + "Please try again later." + ) from exc + except httpx.RequestError as exc: + raise ValueError( + "Could not connect to the Inworld API. Please check your network connection " + "and try again." + ) from exc + def _check_grok_realtime_api_key(self, model: str, api_key: str) -> bool: return True diff --git a/api/services/configuration/registry.py b/api/services/configuration/registry.py index 99172cc1..7e269080 100644 --- a/api/services/configuration/registry.py +++ b/api/services/configuration/registry.py @@ -61,6 +61,7 @@ class ServiceProviders(str, Enum): DEEPGRAM = "deepgram" GROQ = "groq" OPENROUTER = "openrouter" + INWORLD = "inworld" CARTESIA = "cartesia" # NEUPHONIC = "neuphonic" ELEVENLABS = "elevenlabs" @@ -94,6 +95,7 @@ class BaseServiceConfiguration(BaseModel): ServiceProviders.DEEPGRAM, ServiceProviders.GROQ, ServiceProviders.OPENROUTER, + ServiceProviders.INWORLD, ServiceProviders.ELEVENLABS, ServiceProviders.GOOGLE, ServiceProviders.AZURE, @@ -249,6 +251,14 @@ GOOGLE_VERTEX_REALTIME_PROVIDER_MODEL_CONFIG = provider_model_config( DEEPGRAM_PROVIDER_MODEL_CONFIG = provider_model_config("Deepgram") ELEVENLABS_PROVIDER_MODEL_CONFIG = provider_model_config("ElevenLabs") CARTESIA_PROVIDER_MODEL_CONFIG = provider_model_config("Cartesia") +INWORLD_PROVIDER_MODEL_CONFIG = provider_model_config( + "Inworld", + description=( + "Inworld AI streaming text-to-speech with built-in and cloned voices. " + "Defaults to the Ashley system voice on inworld-tts-2." + ), + provider_docs_url="https://docs.inworld.ai/tts/tts", +) SARVAM_PROVIDER_MODEL_CONFIG = provider_model_config("Sarvam") CAMB_PROVIDER_MODEL_CONFIG = provider_model_config("Camb.ai") RIME_PROVIDER_MODEL_CONFIG = provider_model_config("Rime") @@ -957,6 +967,9 @@ class DograhTTSService(BaseTTSConfiguration): CARTESIA_TTS_MODELS = ["sonic-3.5", "sonic-3"] +INWORLD_TTS_MODELS = ["inworld-tts-2"] +INWORLD_TTS_VOICES = ["Ashley"] +INWORLD_TTS_LANGUAGES = ["en-US"] @register_tts @@ -986,6 +999,46 @@ class CartesiaTTSConfiguration(BaseTTSConfiguration): ) +@register_tts +class InworldTTSConfiguration(BaseTTSConfiguration): + model_config = INWORLD_PROVIDER_MODEL_CONFIG + provider: Literal[ServiceProviders.INWORLD] = ServiceProviders.INWORLD + model: str = Field( + default="inworld-tts-2", + description="Inworld TTS model.", + json_schema_extra={"examples": INWORLD_TTS_MODELS, "allow_custom_input": True}, + ) + voice: str = Field( + default="Ashley", + description=( + "Inworld voice ID. Use Ashley for the default warm English voice, " + "or a workspace voice ID for a cloned/custom voice." + ), + json_schema_extra={"examples": INWORLD_TTS_VOICES, "allow_custom_input": True}, + ) + language: str = Field( + default="en-US", + description="BCP-47 language code for synthesis.", + json_schema_extra={ + "examples": INWORLD_TTS_LANGUAGES, + "allow_custom_input": True, + }, + ) + speed: float = Field( + default=1.0, + ge=0.25, + le=4.0, + description="Speech speed multiplier.", + ) + delivery_mode: Literal["STABLE", "BALANCED", "CREATIVE"] = Field( + default="BALANCED", + description=( + "Controls stability versus expressiveness for inworld-tts-2 " + "(STABLE, BALANCED, or CREATIVE)." + ), + ) + + @register_tts class SarvamTTSConfiguration(BaseTTSConfiguration): model_config = SARVAM_PROVIDER_MODEL_CONFIG @@ -1228,6 +1281,7 @@ TTSConfig = Annotated[ OpenAITTSService, ElevenlabsTTSConfiguration, CartesiaTTSConfiguration, + InworldTTSConfiguration, DograhTTSService, SarvamTTSConfiguration, CambTTSConfiguration, diff --git a/api/services/pipecat/service_factory.py b/api/services/pipecat/service_factory.py index 8a2dc5a7..b7f64295 100644 --- a/api/services/pipecat/service_factory.py +++ b/api/services/pipecat/service_factory.py @@ -48,6 +48,7 @@ from pipecat.services.huggingface.stt import ( HuggingFaceSTTService, HuggingFaceSTTSettings, ) +from pipecat.services.inworld.tts import InworldTTSService, InworldTTSSettings from pipecat.services.minimax.llm import MiniMaxLLMService from pipecat.services.minimax.tts import MiniMaxTTSSettings from pipecat.services.openai._constants import OPENAI_SAMPLE_RATE @@ -469,6 +470,25 @@ def create_tts_service( skip_aggregator_types=["recording_router", "recording"], silence_time_s=1.0, ) + elif user_config.tts.provider == ServiceProviders.INWORLD.value: + voice = getattr(user_config.tts, "voice", None) or "Ashley" + model = getattr(user_config.tts, "model", None) or "inworld-tts-2" + speed = getattr(user_config.tts, "speed", None) + language = getattr(user_config.tts, "language", None) or "en-US" + delivery_mode = getattr(user_config.tts, "delivery_mode", None) or "BALANCED" + return InworldTTSService( + api_key=user_config.tts.api_key, + settings=InworldTTSSettings( + voice=voice, + model=model, + language=language, + speaking_rate=speed, + delivery_mode=delivery_mode, + ), + text_filters=[xml_function_tag_filter], + skip_aggregator_types=["recording_router", "recording"], + silence_time_s=1.0, + ) elif user_config.tts.provider == ServiceProviders.DOGRAH.value: # Convert HTTP URL to WebSocket URL for TTS base_url = MPS_API_URL.replace("http://", "ws://").replace("https://", "wss://") diff --git a/api/tests/test_inworld_tts_service_factory.py b/api/tests/test_inworld_tts_service_factory.py new file mode 100644 index 00000000..f2d063d7 --- /dev/null +++ b/api/tests/test_inworld_tts_service_factory.py @@ -0,0 +1,54 @@ +from types import SimpleNamespace +from unittest.mock import patch + +from api.services.configuration.registry import ( + InworldTTSConfiguration, + ServiceProviders, +) +from api.services.pipecat.service_factory import create_tts_service + + +def test_inworld_tts_configuration_defaults(): + config = InworldTTSConfiguration(api_key="test-key") + + assert config.provider == ServiceProviders.INWORLD + assert config.model == "inworld-tts-2" + assert config.voice == "Ashley" + assert config.language == "en-US" + assert config.delivery_mode == "BALANCED" + + +def test_create_inworld_tts_service_uses_websocket_service_without_http_session(): + user_config = SimpleNamespace( + tts=SimpleNamespace( + provider=ServiceProviders.INWORLD.value, + api_key="test-key", + model="inworld-tts-2", + voice="Ashley", + speed=1.1, + language="en-US", + delivery_mode="CREATIVE", + ) + ) + audio_config = SimpleNamespace( + transport_out_sample_rate=24000, + transport_in_sample_rate=16000, + ) + + with ( + patch("api.services.pipecat.service_factory.aiohttp.ClientSession") as session, + patch("api.services.pipecat.service_factory.InworldTTSService") as mock_service, + ): + create_tts_service(user_config, audio_config) + + session.assert_not_called() + assert mock_service.call_count == 1 + kwargs = mock_service.call_args.kwargs + assert kwargs["api_key"] == "test-key" + assert "aiohttp_session" not in kwargs + assert "streaming" not in kwargs + assert kwargs["settings"].model == "inworld-tts-2" + assert kwargs["settings"].voice == "Ashley" + assert kwargs["settings"].language == "en-US" + assert kwargs["settings"].speaking_rate == 1.1 + assert kwargs["settings"].delivery_mode == "CREATIVE" diff --git a/scripts/setup_pipecat.sh b/scripts/setup_pipecat.sh index 04821b0d..6da6844a 100755 --- a/scripts/setup_pipecat.sh +++ b/scripts/setup_pipecat.sh @@ -20,6 +20,6 @@ pip install -r api/requirements.txt # Install pipecat from submodule last so it overrides any pipecat-ai pulled in by dependencies echo "Installing pipecat dependencies..." -pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb] +pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,inworld] -echo "Setup complete! Pipecat is now available as a git submodule." \ No newline at end of file +echo "Setup complete! Pipecat is now available as a git submodule." diff --git a/scripts/setup_requirements.sh b/scripts/setup_requirements.sh index f744a2f8..8074661f 100755 --- a/scripts/setup_requirements.sh +++ b/scripts/setup_requirements.sh @@ -80,7 +80,7 @@ fi # Install pipecat in editable mode with all extras echo "Installing pipecat dependencies..." -uv pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp] +uv pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp,inworld] if [ "$DEV_MODE" -eq 1 ]; then echo "Installing pipecat dev dependencies..."