From fc37d5058fe452b20768a43271cf196bef082249 Mon Sep 17 00:00:00 2001
From: Manasseh <112127696+manasseh-zw@users.noreply.github.com>
Date: Fri, 19 Jun 2026 09:53:27 +0200
Subject: [PATCH] feat: add Inworld TTS provider support (#420)

* Add Inworld TTS provider integration

* chore: move from HTTP Service to WebSocket Service

---------

Co-authored-by: Abhishek Kumar <abhishek@a6k.me>
---
 api/Dockerfile                                |  2 +-
 api/services/configuration/check_validity.py  | 28 ++++++++++
 api/services/configuration/registry.py        | 54 +++++++++++++++++++
 api/services/pipecat/service_factory.py       | 20 +++++++
 api/tests/test_inworld_tts_service_factory.py | 54 +++++++++++++++++++
 scripts/setup_pipecat.sh                      |  4 +-
 scripts/setup_requirements.sh                 |  2 +-
 7 files changed, 160 insertions(+), 4 deletions(-)
 create mode 100644 api/tests/test_inworld_tts_service_factory.py

diff --git a/api/Dockerfile b/api/Dockerfile
index e1a7463b..e3244125 100644
--- a/api/Dockerfile
+++ b/api/Dockerfile
@@ -37,7 +37,7 @@ RUN --mount=type=bind,source=api/requirements.txt,target=/tmp/req.txt \
 #      sys.prefix/nltk_data, so it travels with the venv on COPY.
 RUN --mount=type=bind,source=pipecat,target=/tmp/pipecat,rw \
     --mount=type=cache,target=/root/.cache/uv \
-    uv pip install '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp]' \
+    uv pip install '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp,inworld]' \
  && uv pip uninstall opencv-python \
  && uv pip install opencv-python-headless \
  && python -c "import nltk; nltk.download('punkt_tab', download_dir='/opt/venv/nltk_data', quiet=True)"
diff --git a/api/services/configuration/check_validity.py b/api/services/configuration/check_validity.py
index 7bd36752..3e97709c 100644
--- a/api/services/configuration/check_validity.py
+++ b/api/services/configuration/check_validity.py
@@ -1,5 +1,6 @@
 from typing import Optional, TypedDict
 
+import httpx
 import openai
 from deepgram import DeepgramClient
 from groq import Groq
@@ -38,6 +39,7 @@ class UserConfigurationValidator:
             ServiceProviders.DEEPGRAM.value: self._check_deepgram_api_key,
             ServiceProviders.GROQ.value: self._check_groq_api_key,
             ServiceProviders.OPENROUTER.value: self._check_openrouter_api_key,
+            ServiceProviders.INWORLD.value: self._check_inworld_api_key,
             ServiceProviders.ELEVENLABS.value: self._validate_elevenlabs_api_key,
             ServiceProviders.GOOGLE.value: self._check_google_api_key,
             ServiceProviders.AZURE.value: self._check_azure_api_key,
@@ -345,6 +347,32 @@ class UserConfigurationValidator:
     def _check_openrouter_api_key(self, model: str, api_key: str) -> bool:
         return True
 
+    def _check_inworld_api_key(self, model: str, api_key: str) -> bool:
+        """Check the key with a GET to Inworld's voices endpoint; ValueError on failure."""
+        try:
+            httpx.get(
+                "https://api.inworld.ai/voices/v1/voices",
+                headers={"Authorization": f"Basic {api_key}"},
+                params={"pageSize": 1},
+                timeout=10.0,
+            ).raise_for_status()
+        except httpx.HTTPStatusError as status_error:
+            if status_error.response.status_code not in (401, 403):
+                raise ValueError(
+                    "The Inworld API returned an error while validating the API key. "
+                    "Please try again later."
+                ) from status_error
+            raise ValueError(
+                "Invalid Inworld API key. The key was rejected by the Inworld API. "
+                "Please verify that your API key is correct, active, and has voice read access."
+            ) from status_error
+        except httpx.RequestError as request_error:
+            raise ValueError(
+                "Could not connect to the Inworld API. Please check your network connection "
+                "and try again."
+            ) from request_error
+        return True
+
     def _check_grok_realtime_api_key(self, model: str, api_key: str) -> bool:
         return True
 
diff --git a/api/services/configuration/registry.py b/api/services/configuration/registry.py
index 99172cc1..7e269080 100644
--- a/api/services/configuration/registry.py
+++ b/api/services/configuration/registry.py
@@ -61,6 +61,7 @@ class ServiceProviders(str, Enum):
     DEEPGRAM = "deepgram"
     GROQ = "groq"
     OPENROUTER = "openrouter"
+    INWORLD = "inworld"
     CARTESIA = "cartesia"
     # NEUPHONIC = "neuphonic"
     ELEVENLABS = "elevenlabs"
@@ -94,6 +95,7 @@ class BaseServiceConfiguration(BaseModel):
         ServiceProviders.DEEPGRAM,
         ServiceProviders.GROQ,
         ServiceProviders.OPENROUTER,
+        ServiceProviders.INWORLD,
         ServiceProviders.ELEVENLABS,
         ServiceProviders.GOOGLE,
         ServiceProviders.AZURE,
@@ -249,6 +251,14 @@ GOOGLE_VERTEX_REALTIME_PROVIDER_MODEL_CONFIG = provider_model_config(
 DEEPGRAM_PROVIDER_MODEL_CONFIG = provider_model_config("Deepgram")
 ELEVENLABS_PROVIDER_MODEL_CONFIG = provider_model_config("ElevenLabs")
 CARTESIA_PROVIDER_MODEL_CONFIG = provider_model_config("Cartesia")
+INWORLD_PROVIDER_MODEL_CONFIG = provider_model_config(
+    "Inworld",
+    description=(
+        "Inworld AI streaming text-to-speech with built-in and cloned voices. "
+        "Defaults to the Ashley system voice on inworld-tts-2."
+    ),
+    provider_docs_url="https://docs.inworld.ai/tts/tts",
+)
 SARVAM_PROVIDER_MODEL_CONFIG = provider_model_config("Sarvam")
 CAMB_PROVIDER_MODEL_CONFIG = provider_model_config("Camb.ai")
 RIME_PROVIDER_MODEL_CONFIG = provider_model_config("Rime")
@@ -957,6 +967,9 @@ class DograhTTSService(BaseTTSConfiguration):
 
 
 CARTESIA_TTS_MODELS = ["sonic-3.5", "sonic-3"]
+INWORLD_TTS_MODELS = ["inworld-tts-2"]
+INWORLD_TTS_VOICES = ["Ashley"]
+INWORLD_TTS_LANGUAGES = ["en-US"]
 
 
 @register_tts
@@ -986,6 +999,46 @@ class CartesiaTTSConfiguration(BaseTTSConfiguration):
     )
 
 
+# User-configurable options for the Inworld text-to-speech provider. Each field
+# carries its own description; INWORLD_PROVIDER_MODEL_CONFIG holds the provider blurb.
+@register_tts
+class InworldTTSConfiguration(BaseTTSConfiguration):
+    model_config = INWORLD_PROVIDER_MODEL_CONFIG
+    provider: Literal[ServiceProviders.INWORLD] = ServiceProviders.INWORLD
+    model: str = Field(
+        default="inworld-tts-2",
+        description="Inworld TTS model.",
+        json_schema_extra={"examples": INWORLD_TTS_MODELS, "allow_custom_input": True},
+    )
+    voice: str = Field(
+        default="Ashley",
+        description=(
+            "Inworld voice ID. Use Ashley for the default warm English voice, "
+            "or a workspace voice ID for a cloned/custom voice."
+        ),
+        json_schema_extra={"examples": INWORLD_TTS_VOICES, "allow_custom_input": True},
+    )
+    language: str = Field(
+        default="en-US",
+        description="BCP-47 language code for synthesis.",
+        json_schema_extra={
+            "examples": INWORLD_TTS_LANGUAGES,
+            "allow_custom_input": True,
+        },
+    )
+    # Bounds follow Inworld's documented speakingRate range of [0.5, 1.5].
+    speed: float = Field(
+        default=1.0, ge=0.5, le=1.5, description="Speech speed multiplier."
+    )
+    delivery_mode: Literal["STABLE", "BALANCED", "CREATIVE"] = Field(
+        default="BALANCED",
+        description=(
+            "Controls stability versus expressiveness for inworld-tts-2 "
+            "(STABLE, BALANCED, or CREATIVE)."
+        ),
+    )
+
+
 @register_tts
 class SarvamTTSConfiguration(BaseTTSConfiguration):
     model_config = SARVAM_PROVIDER_MODEL_CONFIG
@@ -1228,6 +1281,7 @@ TTSConfig = Annotated[
         OpenAITTSService,
         ElevenlabsTTSConfiguration,
         CartesiaTTSConfiguration,
+        InworldTTSConfiguration,
         DograhTTSService,
         SarvamTTSConfiguration,
         CambTTSConfiguration,
diff --git a/api/services/pipecat/service_factory.py b/api/services/pipecat/service_factory.py
index 8a2dc5a7..b7f64295 100644
--- a/api/services/pipecat/service_factory.py
+++ b/api/services/pipecat/service_factory.py
@@ -48,6 +48,7 @@ from pipecat.services.huggingface.stt import (
     HuggingFaceSTTService,
     HuggingFaceSTTSettings,
 )
+from pipecat.services.inworld.tts import InworldTTSService, InworldTTSSettings
 from pipecat.services.minimax.llm import MiniMaxLLMService
 from pipecat.services.minimax.tts import MiniMaxTTSSettings
 from pipecat.services.openai._constants import OPENAI_SAMPLE_RATE
@@ -469,6 +470,25 @@ def create_tts_service(
             skip_aggregator_types=["recording_router", "recording"],
             silence_time_s=1.0,
         )
+    elif user_config.tts.provider == ServiceProviders.INWORLD.value:
+        voice = getattr(user_config.tts, "voice", None) or "Ashley"
+        model = getattr(user_config.tts, "model", None) or "inworld-tts-2"
+        speed = getattr(user_config.tts, "speed", None)
+        language = getattr(user_config.tts, "language", None) or "en-US"
+        delivery_mode = getattr(user_config.tts, "delivery_mode", None) or "BALANCED"
+        return InworldTTSService(
+            api_key=user_config.tts.api_key,
+            settings=InworldTTSSettings(
+                voice=voice,
+                model=model,
+                language=language,
+                speaking_rate=speed,
+                delivery_mode=delivery_mode,
+            ),
+            text_filters=[xml_function_tag_filter],
+            skip_aggregator_types=["recording_router", "recording"],
+            silence_time_s=1.0,
+        )
     elif user_config.tts.provider == ServiceProviders.DOGRAH.value:
         # Convert HTTP URL to WebSocket URL for TTS
         base_url = MPS_API_URL.replace("http://", "ws://").replace("https://", "wss://")
diff --git a/api/tests/test_inworld_tts_service_factory.py b/api/tests/test_inworld_tts_service_factory.py
new file mode 100644
index 00000000..f2d063d7
--- /dev/null
+++ b/api/tests/test_inworld_tts_service_factory.py
@@ -0,0 +1,54 @@
+from types import SimpleNamespace
+from unittest.mock import patch
+
+from api.services.configuration.registry import (
+    InworldTTSConfiguration,
+    ServiceProviders,
+)
+from api.services.pipecat.service_factory import create_tts_service
+
+
+def test_inworld_tts_configuration_defaults():
+    """A config created with only an API key exposes the expected field defaults."""
+    config = InworldTTSConfiguration(api_key="test-key")
+
+    assert config.provider == ServiceProviders.INWORLD
+    assert (config.model, config.voice) == ("inworld-tts-2", "Ashley")
+    assert (config.language, config.speed) == ("en-US", 1.0)
+    assert config.delivery_mode == "BALANCED"
+
+
+def test_create_inworld_tts_service_uses_websocket_service_without_http_session():
+    """Factory forwards Inworld settings and never opens an aiohttp session."""
+    tts_config = SimpleNamespace(
+        provider=ServiceProviders.INWORLD.value,
+        api_key="test-key",
+        model="inworld-tts-2",
+        voice="Ashley",
+        speed=1.1,
+        language="en-US",
+        delivery_mode="CREATIVE",
+    )
+    user_config = SimpleNamespace(tts=tts_config)
+    audio_config = SimpleNamespace(
+        transport_out_sample_rate=24000,
+        transport_in_sample_rate=16000,
+    )
+    with (
+        patch("api.services.pipecat.service_factory.aiohttp.ClientSession") as http_session,
+        patch("api.services.pipecat.service_factory.InworldTTSService") as service_cls,
+    ):
+        create_tts_service(user_config, audio_config)
+
+    http_session.assert_not_called()
+    service_cls.assert_called_once()
+    call_kwargs = service_cls.call_args.kwargs
+    assert call_kwargs["api_key"] == "test-key"
+    assert "aiohttp_session" not in call_kwargs
+    assert "streaming" not in call_kwargs
+    settings = call_kwargs["settings"]
+    assert settings.model == "inworld-tts-2"
+    assert settings.voice == "Ashley"
+    assert settings.language == "en-US"
+    assert settings.speaking_rate == 1.1
+    assert settings.delivery_mode == "CREATIVE"
diff --git a/scripts/setup_pipecat.sh b/scripts/setup_pipecat.sh
index 04821b0d..6da6844a 100755
--- a/scripts/setup_pipecat.sh
+++ b/scripts/setup_pipecat.sh
@@ -20,6 +20,6 @@ pip install -r api/requirements.txt
 
 # Install pipecat from submodule last so it overrides any pipecat-ai pulled in by dependencies
 echo "Installing pipecat dependencies..."
-pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb]
+pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,inworld]
 
-echo "Setup complete! Pipecat is now available as a git submodule."
\ No newline at end of file
+echo "Setup complete! Pipecat is now available as a git submodule."
diff --git a/scripts/setup_requirements.sh b/scripts/setup_requirements.sh
index f744a2f8..8074661f 100755
--- a/scripts/setup_requirements.sh
+++ b/scripts/setup_requirements.sh
@@ -80,7 +80,7 @@ fi
 
 # Install pipecat in editable mode with all extras
 echo "Installing pipecat dependencies..."
-uv pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp]
+uv pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp,inworld]
 
 if [ "$DEV_MODE" -eq 1 ]; then
     echo "Installing pipecat dev dependencies..."