Add Inworld TTS provider integration

This commit is contained in:
manasseh-zw 2026-06-06 21:16:11 +02:00
parent 49e68b49d5
commit 952316c1ed
7 changed files with 133 additions and 4 deletions

View file

@ -37,7 +37,7 @@ RUN --mount=type=bind,source=api/requirements.txt,target=/tmp/req.txt \
# sys.prefix/nltk_data, so it travels with the venv on COPY.
RUN --mount=type=bind,source=pipecat,target=/tmp/pipecat,rw \
--mount=type=cache,target=/root/.cache/uv \
uv pip install '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp]' \
uv pip install '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp,inworld]' \
&& uv pip uninstall opencv-python \
&& uv pip install opencv-python-headless \
&& python -c "import nltk; nltk.download('punkt_tab', download_dir='/opt/venv/nltk_data', quiet=True)"

View file

@ -1,5 +1,6 @@
from typing import Optional, TypedDict
import httpx
import openai
from deepgram import DeepgramClient
from groq import Groq
@ -38,6 +39,7 @@ class UserConfigurationValidator:
ServiceProviders.DEEPGRAM.value: self._check_deepgram_api_key,
ServiceProviders.GROQ.value: self._check_groq_api_key,
ServiceProviders.OPENROUTER.value: self._check_openrouter_api_key,
ServiceProviders.INWORLD.value: self._check_inworld_api_key,
ServiceProviders.ELEVENLABS.value: self._validate_elevenlabs_api_key,
ServiceProviders.GOOGLE.value: self._check_google_api_key,
ServiceProviders.AZURE.value: self._check_azure_api_key,
@ -343,6 +345,32 @@ class UserConfigurationValidator:
# OpenRouter key check: returns True unconditionally, so no request is made
# and neither `model` nor `api_key` is inspected.
def _check_openrouter_api_key(self, model: str, api_key: str) -> bool:
return True
def _check_inworld_api_key(self, model: str, api_key: str) -> bool:
    """Validate an Inworld API key by requesting a single voice.

    Issues one GET against the Inworld voices endpoint, sending the key as
    the ``Basic`` authorization credential. ``model`` is not used by this
    check.

    Returns:
        True when the endpoint answers without an HTTP error status.

    Raises:
        ValueError: if the key is rejected (401/403), if the API answers
            with any other error status, or if the request itself fails
            (connection problems, timeouts, ...).
    """
    try:
        # pageSize=1 keeps the probe cheap; only the status code matters.
        httpx.get(
            "https://api.inworld.ai/voices/v1/voices",
            headers={"Authorization": f"Basic {api_key}"},
            params={"pageSize": 1},
            timeout=10.0,
        ).raise_for_status()
    except httpx.HTTPStatusError as status_error:
        key_rejected = status_error.response.status_code in (401, 403)
        if not key_rejected:
            # Any other error status is reported as an API-side failure.
            raise ValueError(
                "The Inworld API returned an error while validating the API key. "
                "Please try again later."
            ) from status_error
        raise ValueError(
            "Invalid Inworld API key. The key was rejected by the Inworld API. "
            "Please verify that your API key is correct, active, and has voice read access."
        ) from status_error
    except httpx.RequestError as request_error:
        raise ValueError(
            "Could not connect to the Inworld API. Please check your network connection "
            "and try again."
        ) from request_error
    return True
# Grok realtime key check: returns True unconditionally, so no request is
# made and neither `model` nor `api_key` is inspected.
def _check_grok_realtime_api_key(self, model: str, api_key: str) -> bool:
return True

View file

@ -56,6 +56,7 @@ class ServiceProviders(str, Enum):
DEEPGRAM = "deepgram"
GROQ = "groq"
OPENROUTER = "openrouter"
INWORLD = "inworld"
CARTESIA = "cartesia"
# NEUPHONIC = "neuphonic"
ELEVENLABS = "elevenlabs"
@ -87,6 +88,7 @@ class BaseServiceConfiguration(BaseModel):
ServiceProviders.DEEPGRAM,
ServiceProviders.GROQ,
ServiceProviders.OPENROUTER,
ServiceProviders.INWORLD,
ServiceProviders.ELEVENLABS,
ServiceProviders.GOOGLE,
ServiceProviders.AZURE,
@ -240,6 +242,14 @@ GOOGLE_VERTEX_REALTIME_PROVIDER_MODEL_CONFIG = provider_model_config(
DEEPGRAM_PROVIDER_MODEL_CONFIG = provider_model_config("Deepgram")
ELEVENLABS_PROVIDER_MODEL_CONFIG = provider_model_config("ElevenLabs")
CARTESIA_PROVIDER_MODEL_CONFIG = provider_model_config("Cartesia")
# Provider metadata for Inworld: display name, a short description, and a link
# to the provider docs. Assigned as `model_config` on InworldTTSConfiguration.
INWORLD_PROVIDER_MODEL_CONFIG = provider_model_config(
"Inworld",
description=(
"Inworld AI streaming text-to-speech with built-in and cloned voices. "
"Defaults to the Ashley system voice on inworld-tts-2."
),
provider_docs_url="https://docs.inworld.ai/tts/tts",
)
SARVAM_PROVIDER_MODEL_CONFIG = provider_model_config("Sarvam")
CAMB_PROVIDER_MODEL_CONFIG = provider_model_config("Camb.ai")
RIME_PROVIDER_MODEL_CONFIG = provider_model_config("Rime")
@ -912,6 +922,9 @@ class DograhTTSService(BaseTTSConfiguration):
CARTESIA_TTS_MODELS = ["sonic-3"]
# Suggested values surfaced as schema `examples` on InworldTTSConfiguration's
# model, voice and language fields. Those fields also set
# `allow_custom_input`, so these lists are suggestions rather than an
# exhaustive whitelist.
INWORLD_TTS_MODELS = ["inworld-tts-2"]
INWORLD_TTS_VOICES = ["Ashley"]
INWORLD_TTS_LANGUAGES = ["en-US"]
@register_tts
@ -936,6 +949,46 @@ class CartesiaTTSConfiguration(BaseTTSConfiguration):
)
@register_tts
class InworldTTSConfiguration(BaseTTSConfiguration):
    # User-facing settings for the Inworld text-to-speech provider.
    # Documented with comments rather than a class docstring so the
    # generated schema description is left untouched.
    model_config = INWORLD_PROVIDER_MODEL_CONFIG

    # Fixed to the Inworld provider value.
    provider: Literal[ServiceProviders.INWORLD] = ServiceProviders.INWORLD

    # Model identifier; free-form input is allowed beyond the examples.
    model: str = Field(
        description="Inworld TTS model.",
        default="inworld-tts-2",
        json_schema_extra={"examples": INWORLD_TTS_MODELS, "allow_custom_input": True},
    )

    # Voice identifier; free-form input is allowed beyond the examples.
    voice: str = Field(
        description=(
            "Inworld voice ID. Use Ashley for the default warm English voice, "
            "or a workspace voice ID for a cloned/custom voice."
        ),
        default="Ashley",
        json_schema_extra={"examples": INWORLD_TTS_VOICES, "allow_custom_input": True},
    )

    # Synthesis language; free-form input is allowed beyond the examples.
    language: str = Field(
        description="BCP-47 language code for synthesis.",
        default="en-US",
        json_schema_extra={"examples": INWORLD_TTS_LANGUAGES, "allow_custom_input": True},
    )

    # Speed multiplier, validated to the inclusive range [0.25, 4.0].
    # NOTE(review): confirm these bounds match the range the Inworld API
    # accepts for its speaking-rate setting.
    speed: float = Field(
        description="Speech speed multiplier.",
        default=1.0,
        ge=0.25,
        le=4.0,
    )

    # One of three fixed delivery modes; any other value fails validation.
    delivery_mode: Literal["STABLE", "BALANCED", "CREATIVE"] = Field(
        description=(
            "Controls stability versus expressiveness for inworld-tts-2 "
            "(STABLE, BALANCED, or CREATIVE)."
        ),
        default="BALANCED",
    )
@register_tts
class SarvamTTSConfiguration(BaseTTSConfiguration):
model_config = SARVAM_PROVIDER_MODEL_CONFIG
@ -1127,6 +1180,7 @@ TTSConfig = Annotated[
OpenAITTSService,
ElevenlabsTTSConfiguration,
CartesiaTTSConfiguration,
InworldTTSConfiguration,
DograhTTSService,
SarvamTTSConfiguration,
CambTTSConfiguration,

View file

@ -0,0 +1,23 @@
"""Inworld TTS wrapper that closes its aiohttp session in cleanup().
Pipecat's InworldHttpTTSService leaves session disposal to the caller. Our
factory creates a fresh session per service instance, so we own its close here
to avoid leaking sockets/FDs on shutdown.
"""
import aiohttp
from pipecat.services.inworld.tts import InworldHttpTTSService
class InworldOwnedSessionTTSService(InworldHttpTTSService):
    """InworldHttpTTSService variant that owns its aiohttp session lifecycle.

    The session handed to the constructor is forwarded to the parent service
    and also remembered here so that ``cleanup()`` can close it.
    """

    def __init__(self, *args, aiohttp_session: aiohttp.ClientSession, **kwargs):
        """Create the service and take ownership of ``aiohttp_session``.

        Args:
            *args: Positional arguments forwarded to InworldHttpTTSService.
            aiohttp_session: Session used for requests; closed by ``cleanup()``.
            **kwargs: Keyword arguments forwarded to InworldHttpTTSService.
        """
        super().__init__(*args, aiohttp_session=aiohttp_session, **kwargs)
        # Keep our own reference so cleanup() does not depend on how the
        # parent class stores the session internally.
        self._owned_session = aiohttp_session

    async def cleanup(self):
        """Run the parent cleanup, then close the owned session.

        The session is closed in a ``finally`` clause so that it is released
        even when the parent cleanup raises; the parent's exception still
        propagates to the caller.
        """
        try:
            await super().cleanup()
        finally:
            # Skip the close when the session was already closed elsewhere.
            if not self._owned_session.closed:
                await self._owned_session.close()

View file

@ -7,6 +7,7 @@ from loguru import logger
from api.constants import MPS_API_URL
from api.services.configuration.registry import ServiceProviders
from api.services.pipecat.inworld_tts import InworldOwnedSessionTTSService
from api.services.pipecat.minimax_tts import MiniMaxOwnedSessionTTSService
from api.utils.url_security import validate_user_configured_service_url
from pipecat.services.assemblyai.stt import AssemblyAISTTService, AssemblyAISTTSettings
@ -38,6 +39,7 @@ from pipecat.services.google.vertex.llm import (
GoogleVertexLLMService,
GoogleVertexLLMSettings,
)
from pipecat.services.inworld.tts import InworldTTSSettings
from pipecat.services.groq.llm import GroqLLMService, GroqLLMSettings
from pipecat.services.minimax.llm import MiniMaxLLMService
from pipecat.services.minimax.tts import MiniMaxTTSSettings
@ -398,6 +400,28 @@ def create_tts_service(user_config, audio_config: "AudioConfig"):
skip_aggregator_types=["recording_router", "recording"],
silence_time_s=1.0,
)
elif user_config.tts.provider == ServiceProviders.INWORLD.value:
voice = getattr(user_config.tts, "voice", None) or "Ashley"
model = getattr(user_config.tts, "model", None) or "inworld-tts-2"
speed = getattr(user_config.tts, "speed", None)
language = getattr(user_config.tts, "language", None) or "en-US"
delivery_mode = getattr(user_config.tts, "delivery_mode", None) or "BALANCED"
session = aiohttp.ClientSession()
return InworldOwnedSessionTTSService(
api_key=user_config.tts.api_key,
aiohttp_session=session,
streaming=True,
settings=InworldTTSSettings(
voice=voice,
model=model,
language=language,
speaking_rate=speed,
delivery_mode=delivery_mode,
),
text_filters=[xml_function_tag_filter],
skip_aggregator_types=["recording_router", "recording"],
silence_time_s=1.0,
)
elif user_config.tts.provider == ServiceProviders.DOGRAH.value:
# Convert HTTP URL to WebSocket URL for TTS
base_url = MPS_API_URL.replace("http://", "ws://").replace("https://", "wss://")

View file

@ -20,6 +20,6 @@ pip install -r api/requirements.txt
# Install pipecat from submodule last so it overrides any pipecat-ai pulled in by dependencies
echo "Installing pipecat dependencies..."
pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb]
pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,inworld]
echo "Setup complete! Pipecat is now available as a git submodule."
echo "Setup complete! Pipecat is now available as a git submodule."

View file

@ -80,7 +80,7 @@ fi
# Install pipecat in editable mode with all extras
echo "Installing pipecat dependencies..."
uv pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp]
uv pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp,inworld]
if [ "$DEV_MODE" -eq 1 ]; then
echo "Installing pipecat dev dependencies..."