mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-22 08:38:13 +02:00
feat: add Inworld TTS provider support (#420)
* Add Inworld TTS provider integration * chore: move from HTTP Service to Websocket Service --------- Co-authored-by: Abhishek Kumar <abhishek@a6k.me>
This commit is contained in:
parent
00a0de8a62
commit
fc37d5058f
7 changed files with 160 additions and 4 deletions
|
|
@ -37,7 +37,7 @@ RUN --mount=type=bind,source=api/requirements.txt,target=/tmp/req.txt \
|
|||
# sys.prefix/nltk_data, so it travels with the venv on COPY.
|
||||
RUN --mount=type=bind,source=pipecat,target=/tmp/pipecat,rw \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp]' \
|
||||
uv pip install '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp,inworld]' \
|
||||
&& uv pip uninstall opencv-python \
|
||||
&& uv pip install opencv-python-headless \
|
||||
&& python -c "import nltk; nltk.download('punkt_tab', download_dir='/opt/venv/nltk_data', quiet=True)"
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
from typing import Optional, TypedDict
|
||||
|
||||
import httpx
|
||||
import openai
|
||||
from deepgram import DeepgramClient
|
||||
from groq import Groq
|
||||
|
|
@ -38,6 +39,7 @@ class UserConfigurationValidator:
|
|||
ServiceProviders.DEEPGRAM.value: self._check_deepgram_api_key,
|
||||
ServiceProviders.GROQ.value: self._check_groq_api_key,
|
||||
ServiceProviders.OPENROUTER.value: self._check_openrouter_api_key,
|
||||
ServiceProviders.INWORLD.value: self._check_inworld_api_key,
|
||||
ServiceProviders.ELEVENLABS.value: self._validate_elevenlabs_api_key,
|
||||
ServiceProviders.GOOGLE.value: self._check_google_api_key,
|
||||
ServiceProviders.AZURE.value: self._check_azure_api_key,
|
||||
|
|
@ -345,6 +347,32 @@ class UserConfigurationValidator:
|
|||
def _check_openrouter_api_key(self, model: str, api_key: str) -> bool:
|
||||
return True
|
||||
|
||||
def _check_inworld_api_key(self, model: str, api_key: str) -> bool:
    """Validate an Inworld API key by requesting one entry of the voice list.

    Args:
        model: Accepted for signature parity with the other key checkers;
            not used by this check.
        api_key: Credential sent verbatim in a ``Basic`` Authorization header.

    Returns:
        True when the Inworld API answers with a success status.

    Raises:
        ValueError: If the key is rejected (401/403), if the API answers with
            any other error status, or if the request cannot be completed.
    """
    auth_headers = {"Authorization": f"Basic {api_key}"}

    try:
        reply = httpx.get(
            "https://api.inworld.ai/voices/v1/voices",
            headers=auth_headers,
            # One voice is enough to prove the key is accepted.
            params={"pageSize": 1},
            timeout=10.0,
        )
        reply.raise_for_status()
    except httpx.HTTPStatusError as exc:
        key_rejected = exc.response.status_code in (401, 403)
        if not key_rejected:
            # Any other error status is reported as a provider-side failure.
            raise ValueError(
                "The Inworld API returned an error while validating the API key. "
                "Please try again later."
            ) from exc
        raise ValueError(
            "Invalid Inworld API key. The key was rejected by the Inworld API. "
            "Please verify that your API key is correct, active, and has voice read access."
        ) from exc
    except httpx.RequestError as exc:
        # Transport-level failure: no HTTP response was received.
        raise ValueError(
            "Could not connect to the Inworld API. Please check your network connection "
            "and try again."
        ) from exc
    else:
        return True
|
||||
|
||||
def _check_grok_realtime_api_key(self, model: str, api_key: str) -> bool:
|
||||
return True
|
||||
|
||||
|
|
|
|||
|
|
@ -61,6 +61,7 @@ class ServiceProviders(str, Enum):
|
|||
DEEPGRAM = "deepgram"
|
||||
GROQ = "groq"
|
||||
OPENROUTER = "openrouter"
|
||||
INWORLD = "inworld"
|
||||
CARTESIA = "cartesia"
|
||||
# NEUPHONIC = "neuphonic"
|
||||
ELEVENLABS = "elevenlabs"
|
||||
|
|
@ -94,6 +95,7 @@ class BaseServiceConfiguration(BaseModel):
|
|||
ServiceProviders.DEEPGRAM,
|
||||
ServiceProviders.GROQ,
|
||||
ServiceProviders.OPENROUTER,
|
||||
ServiceProviders.INWORLD,
|
||||
ServiceProviders.ELEVENLABS,
|
||||
ServiceProviders.GOOGLE,
|
||||
ServiceProviders.AZURE,
|
||||
|
|
@ -249,6 +251,14 @@ GOOGLE_VERTEX_REALTIME_PROVIDER_MODEL_CONFIG = provider_model_config(
|
|||
DEEPGRAM_PROVIDER_MODEL_CONFIG = provider_model_config("Deepgram")
|
||||
ELEVENLABS_PROVIDER_MODEL_CONFIG = provider_model_config("ElevenLabs")
|
||||
CARTESIA_PROVIDER_MODEL_CONFIG = provider_model_config("Cartesia")
|
||||
INWORLD_PROVIDER_MODEL_CONFIG = provider_model_config(
|
||||
"Inworld",
|
||||
description=(
|
||||
"Inworld AI streaming text-to-speech with built-in and cloned voices. "
|
||||
"Defaults to the Ashley system voice on inworld-tts-2."
|
||||
),
|
||||
provider_docs_url="https://docs.inworld.ai/tts/tts",
|
||||
)
|
||||
SARVAM_PROVIDER_MODEL_CONFIG = provider_model_config("Sarvam")
|
||||
CAMB_PROVIDER_MODEL_CONFIG = provider_model_config("Camb.ai")
|
||||
RIME_PROVIDER_MODEL_CONFIG = provider_model_config("Rime")
|
||||
|
|
@ -957,6 +967,9 @@ class DograhTTSService(BaseTTSConfiguration):
|
|||
|
||||
|
||||
CARTESIA_TTS_MODELS = ["sonic-3.5", "sonic-3"]
|
||||
INWORLD_TTS_MODELS = ["inworld-tts-2"]
|
||||
INWORLD_TTS_VOICES = ["Ashley"]
|
||||
INWORLD_TTS_LANGUAGES = ["en-US"]
|
||||
|
||||
|
||||
@register_tts
|
||||
|
|
@ -986,6 +999,46 @@ class CartesiaTTSConfiguration(BaseTTSConfiguration):
|
|||
)
|
||||
|
||||
|
||||
@register_tts
class InworldTTSConfiguration(BaseTTSConfiguration):
    # User-facing configuration for the Inworld text-to-speech provider.
    # (Kept as comments rather than a docstring so the generated model
    # schema is not given an extra description.)
    model_config = INWORLD_PROVIDER_MODEL_CONFIG

    # Discriminator value selecting this configuration.
    provider: Literal[ServiceProviders.INWORLD] = ServiceProviders.INWORLD

    # Model identifier; free-form input is allowed besides the listed examples.
    model: str = Field(
        "inworld-tts-2",
        description="Inworld TTS model.",
        json_schema_extra={
            "examples": INWORLD_TTS_MODELS,
            "allow_custom_input": True,
        },
    )

    # Voice identifier; free-form input is allowed besides the listed examples.
    voice: str = Field(
        "Ashley",
        description=(
            "Inworld voice ID. "
            "Use Ashley for the default warm English voice, "
            "or a workspace voice ID for a cloned/custom voice."
        ),
        json_schema_extra={
            "examples": INWORLD_TTS_VOICES,
            "allow_custom_input": True,
        },
    )

    # Synthesis language; free-form input is allowed besides the listed examples.
    language: str = Field(
        "en-US",
        description="BCP-47 language code for synthesis.",
        json_schema_extra={
            "examples": INWORLD_TTS_LANGUAGES,
            "allow_custom_input": True,
        },
    )

    # Speed multiplier, validated to the inclusive range 0.25-4.0.
    # NOTE(review): confirm these bounds match what the Inworld API accepts.
    speed: float = Field(
        1.0,
        ge=0.25,
        le=4.0,
        description="Speech speed multiplier.",
    )

    # Restricted to the three literal modes listed in the annotation.
    delivery_mode: Literal["STABLE", "BALANCED", "CREATIVE"] = Field(
        "BALANCED",
        description=(
            "Controls stability versus expressiveness "
            "for inworld-tts-2 (STABLE, BALANCED, or CREATIVE)."
        ),
    )
|
||||
|
||||
|
||||
@register_tts
|
||||
class SarvamTTSConfiguration(BaseTTSConfiguration):
|
||||
model_config = SARVAM_PROVIDER_MODEL_CONFIG
|
||||
|
|
@ -1228,6 +1281,7 @@ TTSConfig = Annotated[
|
|||
OpenAITTSService,
|
||||
ElevenlabsTTSConfiguration,
|
||||
CartesiaTTSConfiguration,
|
||||
InworldTTSConfiguration,
|
||||
DograhTTSService,
|
||||
SarvamTTSConfiguration,
|
||||
CambTTSConfiguration,
|
||||
|
|
|
|||
|
|
@ -48,6 +48,7 @@ from pipecat.services.huggingface.stt import (
|
|||
HuggingFaceSTTService,
|
||||
HuggingFaceSTTSettings,
|
||||
)
|
||||
from pipecat.services.inworld.tts import InworldTTSService, InworldTTSSettings
|
||||
from pipecat.services.minimax.llm import MiniMaxLLMService
|
||||
from pipecat.services.minimax.tts import MiniMaxTTSSettings
|
||||
from pipecat.services.openai._constants import OPENAI_SAMPLE_RATE
|
||||
|
|
@ -469,6 +470,25 @@ def create_tts_service(
|
|||
skip_aggregator_types=["recording_router", "recording"],
|
||||
silence_time_s=1.0,
|
||||
)
|
||||
elif user_config.tts.provider == ServiceProviders.INWORLD.value:
|
||||
voice = getattr(user_config.tts, "voice", None) or "Ashley"
|
||||
model = getattr(user_config.tts, "model", None) or "inworld-tts-2"
|
||||
speed = getattr(user_config.tts, "speed", None)
|
||||
language = getattr(user_config.tts, "language", None) or "en-US"
|
||||
delivery_mode = getattr(user_config.tts, "delivery_mode", None) or "BALANCED"
|
||||
return InworldTTSService(
|
||||
api_key=user_config.tts.api_key,
|
||||
settings=InworldTTSSettings(
|
||||
voice=voice,
|
||||
model=model,
|
||||
language=language,
|
||||
speaking_rate=speed,
|
||||
delivery_mode=delivery_mode,
|
||||
),
|
||||
text_filters=[xml_function_tag_filter],
|
||||
skip_aggregator_types=["recording_router", "recording"],
|
||||
silence_time_s=1.0,
|
||||
)
|
||||
elif user_config.tts.provider == ServiceProviders.DOGRAH.value:
|
||||
# Convert HTTP URL to WebSocket URL for TTS
|
||||
base_url = MPS_API_URL.replace("http://", "ws://").replace("https://", "wss://")
|
||||
|
|
|
|||
54
api/tests/test_inworld_tts_service_factory.py
Normal file
54
api/tests/test_inworld_tts_service_factory.py
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
from types import SimpleNamespace
|
||||
from unittest.mock import patch
|
||||
|
||||
from api.services.configuration.registry import (
|
||||
InworldTTSConfiguration,
|
||||
ServiceProviders,
|
||||
)
|
||||
from api.services.pipecat.service_factory import create_tts_service
|
||||
|
||||
|
||||
def test_inworld_tts_configuration_defaults():
    """A configuration built from only an API key exposes the expected defaults."""
    config = InworldTTSConfiguration(api_key="test-key")

    expected_defaults = {
        "provider": ServiceProviders.INWORLD,
        "model": "inworld-tts-2",
        "voice": "Ashley",
        "language": "en-US",
        "delivery_mode": "BALANCED",
    }
    for field_name, expected in expected_defaults.items():
        assert getattr(config, field_name) == expected
|
||||
|
||||
|
||||
def test_create_inworld_tts_service_uses_websocket_service_without_http_session():
    """The factory builds InworldTTSService from user settings without an aiohttp session."""
    tts_settings = SimpleNamespace(
        provider=ServiceProviders.INWORLD.value,
        api_key="test-key",
        model="inworld-tts-2",
        voice="Ashley",
        speed=1.1,
        language="en-US",
        delivery_mode="CREATIVE",
    )
    user_config = SimpleNamespace(tts=tts_settings)
    audio_config = SimpleNamespace(
        transport_out_sample_rate=24000,
        transport_in_sample_rate=16000,
    )

    with (
        patch("api.services.pipecat.service_factory.aiohttp.ClientSession") as session,
        patch("api.services.pipecat.service_factory.InworldTTSService") as mock_service,
    ):
        create_tts_service(user_config, audio_config)

    # No HTTP session may be created for the Inworld provider.
    session.assert_not_called()
    assert mock_service.call_count == 1

    kwargs = mock_service.call_args.kwargs
    assert kwargs["api_key"] == "test-key"
    # HTTP-service-only arguments must not be passed.
    assert "aiohttp_session" not in kwargs
    assert "streaming" not in kwargs

    # The user's speed is forwarded under the settings name `speaking_rate`.
    built_settings = kwargs["settings"]
    assert built_settings.model == "inworld-tts-2"
    assert built_settings.voice == "Ashley"
    assert built_settings.language == "en-US"
    assert built_settings.speaking_rate == 1.1
    assert built_settings.delivery_mode == "CREATIVE"
|
||||
|
|
@ -20,6 +20,6 @@ pip install -r api/requirements.txt
|
|||
|
||||
# Install pipecat from submodule last so it overrides any pipecat-ai pulled in by dependencies
|
||||
echo "Installing pipecat dependencies..."
|
||||
pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb]
|
||||
pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,inworld]
|
||||
|
||||
echo "Setup complete! Pipecat is now available as a git submodule."
|
||||
echo "Setup complete! Pipecat is now available as a git submodule."
|
||||
|
|
|
|||
|
|
@ -80,7 +80,7 @@ fi
|
|||
|
||||
# Install pipecat in editable mode with all extras
|
||||
echo "Installing pipecat dependencies..."
|
||||
uv pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp]
|
||||
uv pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp,inworld]
|
||||
|
||||
if [ "$DEV_MODE" -eq 1 ]; then
|
||||
echo "Installing pipecat dev dependencies..."
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue