feat(podcasts): add text-to-speech adapters

This commit is contained in:
CREDO23 2026-06-10 18:44:03 +02:00
parent ee24925747
commit 75287020e1
9 changed files with 328 additions and 0 deletions

View file

@ -0,0 +1,22 @@
"""Text-to-speech: a per-segment synthesis port with provider adapters.
Callers depend on :class:`TextToSpeech` and obtain the configured provider from
:func:`get_text_to_speech`; the concrete Kokoro/LiteLLM adapters stay private.
"""
from __future__ import annotations
from .audio import SynthesizedAudio
from .errors import TextToSpeechError
from .factory import get_text_to_speech
from .port import TextToSpeech
from .request import SynthesisRequest, VoiceRef
# Names re-exported as the package's public surface: the port, its request and
# result value types, the error type, and the provider factory. The concrete
# adapter classes are deliberately absent from this list.
__all__ = [
    "SynthesisRequest",
    "SynthesizedAudio",
    "TextToSpeech",
    "TextToSpeechError",
    "VoiceRef",
    "get_text_to_speech",
]

View file

@ -0,0 +1,3 @@
"""Per-provider TextToSpeech implementations."""
from __future__ import annotations

View file

@ -0,0 +1,111 @@
"""Local Kokoro adapter: on-box synthesis, no network or per-segment cost.
Kokoro selects its language model by a single-letter ``lang_code``, so this
adapter maps the brief's BCP-47 tag to that code and caches one pipeline per
code (pipeline construction loads weights and is expensive). Pipelines run in a
thread pool because Kokoro is synchronous; the renderer caps how many segments
synthesise at once.
"""
from __future__ import annotations
import asyncio
import io
from typing import TYPE_CHECKING
from ..audio import SynthesizedAudio
from ..errors import TextToSpeechError
from ..port import TextToSpeech
from ..request import SynthesisRequest
if TYPE_CHECKING:
from kokoro import KPipeline
# Kokoro emits 24 kHz mono PCM regardless of voice. The value is used both when
# encoding the WAV bytes and as the ``sample_rate`` reported on the result.
_SAMPLE_RATE = 24000

# BCP-47 primary subtag -> Kokoro language code. English defaults to American;
# the en-GB region override below switches it to British. A primary subtag that
# is missing from this table makes ``_lang_code`` raise ``TextToSpeechError``.
_LANG_CODE_BY_PRIMARY: dict[str, str] = {
    "en": "a",
    "es": "e",
    "fr": "f",
    "hi": "h",
    "it": "i",
    "ja": "j",
    "pt": "p",
    "zh": "z",
}
class KokoroTextToSpeech(TextToSpeech):
    """Synthesises segments with locally hosted Kokoro pipelines.

    One ``KPipeline`` is built lazily per Kokoro language code and kept on the
    instance, so later segments in the same language reuse it. Inference itself
    runs on the event loop's default executor so the loop stays responsive
    while a segment is being voiced.
    """

    def __init__(self) -> None:
        # Kokoro language code -> constructed pipeline, filled on first use.
        self._pipelines: dict[str, KPipeline] = {}

    @property
    def container(self) -> str:
        """File extension of the audio this adapter emits (always ``"wav"``)."""
        return "wav"

    async def synthesize(self, request: SynthesisRequest) -> SynthesizedAudio:
        """Voice ``request.text`` with Kokoro and return it as WAV bytes.

        ``request.voice`` must be a string voice name; ``request.language``
        selects the pipeline and ``request.speed`` is forwarded to it.

        Raises :class:`TextToSpeechError` when the voice is not a string, the
        language has no Kokoro model, the pipeline cannot be loaded, the
        pipeline call fails, or it yields no audio at all.
        """
        if not isinstance(request.voice, str):
            raise TextToSpeechError(
                "Kokoro voices are named by string, not a mapping"
            )
        pipeline = self._pipeline_for(request.language)

        def _run_pipeline() -> list:
            # The pipeline call hands back a generator and the synthesis work
            # happens while it is iterated, so it must be drained here, on the
            # worker thread. Iterating it after ``run_in_executor`` returns
            # would run the inference on the event loop instead.
            return [
                audio
                for _gs, _ps, audio in pipeline(
                    request.text,
                    voice=request.voice,
                    speed=request.speed,
                    split_pattern=r"\n+",
                )
            ]

        # We are inside a coroutine, so a running loop is guaranteed to exist.
        loop = asyncio.get_running_loop()
        try:
            segments = await loop.run_in_executor(None, _run_pipeline)
        except Exception as exc:  # noqa: BLE001 - normalise provider errors
            raise TextToSpeechError(f"Kokoro synthesis failed: {exc}") from exc
        if not segments:
            raise TextToSpeechError("Kokoro produced no audio for the text")
        return SynthesizedAudio(
            data=_encode_wav(segments, _SAMPLE_RATE),
            container="wav",
            sample_rate=_SAMPLE_RATE,
        )

    def _pipeline_for(self, language: str) -> KPipeline:
        """Return the cached pipeline for ``language``, building it on first use.

        Raises :class:`TextToSpeechError` when the language maps to no Kokoro
        code, or when Kokoro cannot be imported or the pipeline fails to load.
        """
        lang_code = _lang_code(language)
        pipeline = self._pipelines.get(lang_code)
        if pipeline is None:
            try:
                # Imported lazily so the module can be imported without Kokoro
                # installed; only an actual synthesis request needs it.
                from kokoro import KPipeline

                pipeline = KPipeline(lang_code=lang_code)
            except Exception as exc:  # noqa: BLE001 - normalise provider errors
                # Keep the port's promise that every provider fault surfaces
                # as TextToSpeechError rather than a raw import/load error.
                raise TextToSpeechError(
                    f"Kokoro pipeline for {lang_code!r} failed to load: {exc}"
                ) from exc
            self._pipelines[lang_code] = pipeline
        return pipeline
def _lang_code(language: str) -> str:
normalised = language.strip().lower()
if normalised.startswith("en-gb") or normalised == "en-uk":
return "b"
primary = normalised.partition("-")[0]
code = _LANG_CODE_BY_PRIMARY.get(primary)
if code is None:
raise TextToSpeechError(f"Kokoro has no language model for {language!r}")
return code
def _encode_wav(segments: list, sample_rate: int) -> bytes:
    """Join the synthesised chunks and return them as one in-memory WAV file.

    A single chunk is written as-is; several chunks are concatenated first.
    ``segments`` must not be empty (the caller checks this before calling).
    """
    # Imported here so the heavy audio dependencies load only when needed.
    import numpy as np
    import soundfile as sf

    if len(segments) == 1:
        samples = segments[0]
    else:
        samples = np.concatenate(segments)

    with io.BytesIO() as sink:
        sf.write(sink, samples, sample_rate, format="WAV")
        return sink.getvalue()

View file

@ -0,0 +1,69 @@
"""LiteLLM adapter: hosted TTS (OpenAI, Azure, Vertex AI) via one ``aspeech`` call.
LiteLLM normalises every hosted provider behind the same ``aspeech`` surface,
so a single adapter covers them all. The provider is encoded in the model
string (e.g. ``openai/tts-1``, ``vertex_ai/...``) and the voice reference is
whatever that provider expects, which the catalog already supplies.
"""
from __future__ import annotations
from ..audio import SynthesizedAudio
from ..errors import TextToSpeechError
from ..port import TextToSpeech
from ..request import SynthesisRequest
# Hosted providers return MP3-encoded bytes from ``aspeech``.
# NOTE(review): no explicit response format is requested by the adapter, so
# this relies on every configured provider defaulting to MP3 - confirm.
_CONTAINER = "mp3"

# Matches the legacy podcaster timeouts; long single segments still finish well
# under this, and retries cover transient upstream failures. Both values are
# passed straight to ``aspeech`` as ``timeout`` and ``max_retries``.
_TIMEOUT_SECONDS = 600
_MAX_RETRIES = 2
class LiteLlmTextToSpeech(TextToSpeech):
    """Hosted TTS adapter that delegates every segment to LiteLLM's ``aspeech``.

    The model string chooses the upstream provider. ``api_base`` and
    ``api_key`` are optional overrides that are only sent when set.
    """

    def __init__(
        self,
        *,
        model: str,
        api_base: str | None = None,
        api_key: str | None = None,
    ) -> None:
        self._model = model
        self._api_base = api_base
        self._api_key = api_key

    @property
    def container(self) -> str:
        """File extension of the audio this adapter reports."""
        return _CONTAINER

    async def synthesize(self, request: SynthesisRequest) -> SynthesizedAudio:
        """Send one segment to the hosted model and wrap the returned bytes.

        Only ``request.text`` and ``request.voice`` are forwarded;
        ``request.speed`` and ``request.language`` are not part of the call.

        Raises :class:`TextToSpeechError` when the upstream call fails or the
        response carries no audio content.
        """
        from litellm import aspeech

        # Unset (falsy) overrides are left out of the call entirely.
        overrides = {"api_base": self._api_base, "api_key": self._api_key}
        call_args = {
            "model": self._model,
            "voice": request.voice,
            "input": request.text,
            "max_retries": _MAX_RETRIES,
            "timeout": _TIMEOUT_SECONDS,
            **{name: value for name, value in overrides.items() if value},
        }

        try:
            response = await aspeech(**call_args)
        except Exception as exc:  # noqa: BLE001 - normalise provider errors
            raise TextToSpeechError(
                f"{self._model} synthesis failed: {exc}"
            ) from exc

        audio = getattr(response, "content", None)
        if audio:
            return SynthesizedAudio(data=audio, container=_CONTAINER)
        raise TextToSpeechError(f"{self._model} returned no audio")

View file

@ -0,0 +1,19 @@
"""The bytes a TTS provider returns for one segment."""
from __future__ import annotations
from dataclasses import dataclass
@dataclass(frozen=True, slots=True)
class SynthesizedAudio:
"""Encoded audio for a single segment, ready to cache and concatenate.
``container`` is the file extension the bytes are encoded as (``"wav"`` or
``"mp3"``); the renderer uses it to name the on-disk segment so FFmpeg can
demux the right format during merge.
"""
data: bytes
container: str
sample_rate: int | None = None

View file

@ -0,0 +1,13 @@
"""Failures raised by the TTS layer."""
from __future__ import annotations
class TextToSpeechError(RuntimeError):
    """Single error type for any segment the TTS layer could not synthesise.

    It covers configuration faults, such as an unusable voice reference, and
    provider faults, such as a failed upstream call or an empty response. The
    renderer can therefore fail a segment by catching this one type instead of
    provider-specific exceptions.
    """

View file

@ -0,0 +1,38 @@
"""Resolve the configured :class:`TextToSpeech` as a process-wide singleton."""
from __future__ import annotations
from functools import lru_cache
from .port import TextToSpeech
# Sentinel model string that selects the local Kokoro pipeline; anything else is
# treated as a LiteLLM-hosted model (``openai/...``, ``vertex_ai/...``, etc.).
# The factory compares ``TTS_SERVICE`` against it with exact string equality.
KOKORO_SERVICE: str = "local/kokoro"
@lru_cache(maxsize=1)
def get_text_to_speech() -> TextToSpeech:
    """Return the process-wide provider chosen by the ``TTS_SERVICE`` setting.

    The instance is memoised so the Kokoro adapter's loaded pipelines are
    shared by every segment and request. Adapter modules are imported only in
    the branch that needs them.

    Raises ``ValueError`` when ``TTS_SERVICE`` is empty or unset. A raised
    error is not memoised, so the next call reads the setting again.
    """
    from app.config import config as settings

    selected = settings.TTS_SERVICE
    if not selected:
        raise ValueError("TTS_SERVICE is not configured")

    if selected != KOKORO_SERVICE:
        # Any non-sentinel value is taken to be a LiteLLM model string.
        from .adapters.litellm import LiteLlmTextToSpeech

        return LiteLlmTextToSpeech(
            model=selected,
            api_base=settings.TTS_SERVICE_API_BASE,
            api_key=settings.TTS_SERVICE_API_KEY,
        )

    from .adapters.kokoro import KokoroTextToSpeech

    return KokoroTextToSpeech()

View file

@ -0,0 +1,31 @@
"""The TTS contract: turn one segment of text into encoded audio."""
from __future__ import annotations
from abc import ABC, abstractmethod
from .audio import SynthesizedAudio
from .request import SynthesisRequest
class TextToSpeech(ABC):
    """Port implemented once per provider to voice a single segment.

    Working per segment rather than per episode keeps every call separately
    cacheable and lets the renderer limit concurrency and retry one segment
    without touching the others. Joining segments into a finished file is the
    renderer's responsibility, not the provider's.
    """

    @property
    @abstractmethod
    def container(self) -> str:
        """Container/file extension of the audio produced (e.g. ``"mp3"``)."""

    @abstractmethod
    async def synthesize(self, request: SynthesisRequest) -> SynthesizedAudio:
        """Turn ``request.text`` into encoded audio and return it.

        Implementations raise
        :class:`~app.podcasts.tts.errors.TextToSpeechError` for every provider
        or configuration failure.
        """

View file

@ -0,0 +1,22 @@
"""What the renderer hands a TTS provider to voice a single segment."""
from __future__ import annotations
from collections.abc import Mapping
from dataclasses import dataclass
from typing import Any
# A provider-native voice reference. OpenAI/Azure/Kokoro name a voice with a
# string; Vertex passes a mapping (``languageCode`` + ``name``). The catalog
# stores whichever shape the provider expects and we pass it through untouched.
# Adapters that only understand one shape validate it themselves; the Kokoro
# adapter, for example, rejects the mapping form.
VoiceRef = str | Mapping[str, Any]
@dataclass(frozen=True, slots=True)
class SynthesisRequest:
"""One unit of speech to synthesise: the smallest cacheable render step."""
text: str
voice: VoiceRef
language: str
speed: float = 1.0