feat(podcasts): add text-to-speech adapters

2026-06-12 20:45:20 +02:00 · 2026-06-10 18:44:03 +02:00 · 2026-06-10 18:44:03 +02:00 · 75287020e1
commit 75287020e1
parent ee24925747
9 changed files with 328 additions and 0 deletions
--- a/surfsense_backend/app/podcasts/tts/__init__.py
+++ b/surfsense_backend/app/podcasts/tts/__init__.py
@ -0,0 +1,22 @@
+"""Text-to-speech: a per-segment synthesis port with provider adapters.
+
+Callers depend on :class:`TextToSpeech` and obtain the configured provider from
+:func:`get_text_to_speech`; the concrete Kokoro/LiteLLM adapters stay private.
+"""
+
+from __future__ import annotations
+
+from .audio import SynthesizedAudio
+from .errors import TextToSpeechError
+from .factory import get_text_to_speech
+from .port import TextToSpeech
+from .request import SynthesisRequest, VoiceRef
+
+# Public surface of the package: the port, its request/response value types,
+# the error type, and the factory. The adapter classes are not re-exported.
+__all__ = [
+    "SynthesisRequest",
+    "SynthesizedAudio",
+    "TextToSpeech",
+    "TextToSpeechError",
+    "VoiceRef",
+    "get_text_to_speech",
+]
--- a/surfsense_backend/app/podcasts/tts/adapters/__init__.py
+++ b/surfsense_backend/app/podcasts/tts/adapters/__init__.py
@ -0,0 +1,3 @@
+"""Per-provider TextToSpeech implementations."""
+
+from __future__ import annotations
--- a/surfsense_backend/app/podcasts/tts/adapters/kokoro.py
+++ b/surfsense_backend/app/podcasts/tts/adapters/kokoro.py
@ -0,0 +1,111 @@
+"""Local Kokoro adapter: on-box synthesis, no network or per-segment cost.
+
+Kokoro selects its language model by a single-letter ``lang_code``, so this
+adapter maps the brief's BCP-47 tag to that code and caches one pipeline per
+code (pipeline construction loads weights and is expensive). Pipelines run in a
+thread pool because Kokoro is synchronous; the renderer caps how many segments
+synthesise at once.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import io
+from typing import TYPE_CHECKING
+
+from ..audio import SynthesizedAudio
+from ..errors import TextToSpeechError
+from ..port import TextToSpeech
+from ..request import SynthesisRequest
+
+if TYPE_CHECKING:
+    from kokoro import KPipeline
+
+# Kokoro emits 24 kHz mono PCM regardless of voice.
+_SAMPLE_RATE = 24000
+
+# BCP-47 primary subtag -> Kokoro language code. English defaults to American;
+# the en-GB / en-UK override in ``_lang_code`` switches it to British ("b").
+# A tag whose primary subtag is not listed here is rejected by ``_lang_code``
+# with ``TextToSpeechError``.
+_LANG_CODE_BY_PRIMARY = {
+    "en": "a",  # English (American by default)
+    "es": "e",  # Spanish
+    "fr": "f",  # French
+    "hi": "h",  # Hindi
+    "it": "i",  # Italian
+    "ja": "j",  # Japanese
+    "pt": "p",  # Portuguese
+    "zh": "z",  # Chinese
+}
+
+
+class KokoroTextToSpeech(TextToSpeech):
+    """Synthesises segments with locally hosted Kokoro pipelines.
+
+    One ``KPipeline`` is built lazily per Kokoro language code and reused for
+    every later request that maps to the same code.
+    """
+
+    def __init__(self) -> None:
+        # Kokoro language code -> pipeline already built for it.
+        self._pipelines: dict[str, KPipeline] = {}
+
+    @property
+    def container(self) -> str:
+        """Container of the audio this adapter emits (always ``"wav"``)."""
+        return "wav"
+
+    async def synthesize(self, request: SynthesisRequest) -> SynthesizedAudio:
+        """Voice ``request.text`` with Kokoro and return it as WAV bytes.
+
+        The synthesis itself runs on the default executor so the event loop
+        stays responsive while the model works.
+
+        Raises:
+            TextToSpeechError: if the voice is not a string, the language has
+                no Kokoro model, the pipeline cannot be built, synthesis
+                fails, or the pipeline yields no audio.
+        """
+        if not isinstance(request.voice, str):
+            raise TextToSpeechError(
+                "Kokoro voices are named by string, not a mapping"
+            )
+
+        try:
+            pipeline = self._pipeline_for(request.language)
+        except TextToSpeechError:
+            # Already in the shape the port promises (unsupported language).
+            raise
+        except Exception as exc:  # noqa: BLE001 - normalise provider errors
+            # E.g. the kokoro package is missing or its weights fail to load.
+            raise TextToSpeechError(
+                f"Kokoro pipeline could not be loaded: {exc}"
+            ) from exc
+
+        def _render() -> list:
+            # ``pipeline(...)`` returns a generator, so the model only runs
+            # while it is iterated. Drain it here, on the worker thread;
+            # iterating it after ``run_in_executor`` returns would do the
+            # blocking work on the event loop instead.
+            return [
+                audio
+                for _gs, _ps, audio in pipeline(
+                    request.text,
+                    voice=request.voice,
+                    speed=request.speed,
+                    split_pattern=r"\n+",
+                )
+            ]
+
+        # We are inside a coroutine, so a running loop is guaranteed.
+        loop = asyncio.get_running_loop()
+        try:
+            segments = await loop.run_in_executor(None, _render)
+        except Exception as exc:  # noqa: BLE001 - normalise provider errors
+            raise TextToSpeechError(f"Kokoro synthesis failed: {exc}") from exc
+
+        if not segments:
+            raise TextToSpeechError("Kokoro produced no audio for the text")
+
+        return SynthesizedAudio(
+            data=_encode_wav(segments, _SAMPLE_RATE),
+            container="wav",
+            sample_rate=_SAMPLE_RATE,
+        )
+
+    def _pipeline_for(self, language: str) -> KPipeline:
+        """Return the cached pipeline for ``language``, building it on first use.
+
+        Construction happens synchronously in the calling thread. ``kokoro``
+        is imported here rather than at module import time.
+
+        Raises:
+            TextToSpeechError: if ``language`` maps to no Kokoro language code.
+        """
+        lang_code = _lang_code(language)
+        pipeline = self._pipelines.get(lang_code)
+        if pipeline is None:
+            from kokoro import KPipeline
+
+            pipeline = KPipeline(lang_code=lang_code)
+            self._pipelines[lang_code] = pipeline
+        return pipeline
+
+
+def _lang_code(language: str) -> str:
+    """Map a language tag to Kokoro's single-letter language code.
+
+    Matching is case-insensitive, ignores surrounding whitespace, and accepts
+    ``_`` as well as ``-`` between subtags, so ``"en_GB"`` resolves like
+    ``"en-GB"``. ``en-GB`` (with any further subtags) and ``en-UK`` select
+    British English (``"b"``); every other tag is resolved by its primary
+    subtag through ``_LANG_CODE_BY_PRIMARY``.
+
+    Raises:
+        TextToSpeechError: if the primary subtag has no Kokoro language code.
+    """
+    # Locale-style tags use "_" where BCP-47 uses "-"; fold them together so
+    # both the region override and the primary-subtag split see one separator.
+    normalised = language.strip().lower().replace("_", "-")
+    if normalised.startswith("en-gb") or normalised == "en-uk":
+        return "b"
+    primary = normalised.partition("-")[0]
+    code = _LANG_CODE_BY_PRIMARY.get(primary)
+    if code is None:
+        # Report the caller's original spelling, not the normalised one.
+        raise TextToSpeechError(f"Kokoro has no language model for {language!r}")
+    return code
+
+
+def _encode_wav(segments: list, sample_rate: int) -> bytes:
+    """Join the synthesised chunks and serialise them as one in-memory WAV.
+
+    A single chunk is written as-is; several chunks are concatenated first.
+    ``numpy`` and ``soundfile`` are imported on demand inside the function.
+    """
+    import numpy as np
+    import soundfile as sf
+
+    if len(segments) == 1:
+        samples = segments[0]
+    else:
+        samples = np.concatenate(segments)
+
+    sink = io.BytesIO()
+    sf.write(sink, samples, sample_rate, format="WAV")
+    return sink.getvalue()
--- a/surfsense_backend/app/podcasts/tts/adapters/litellm.py
+++ b/surfsense_backend/app/podcasts/tts/adapters/litellm.py
@ -0,0 +1,69 @@
+"""LiteLLM adapter: hosted TTS (OpenAI, Azure, Vertex AI) via one ``aspeech`` call.
+
+LiteLLM normalises every hosted provider behind the same ``aspeech`` surface,
+so a single adapter covers them all. The provider is encoded in the model
+string (e.g. ``openai/tts-1``, ``vertex_ai/...``) and the voice reference is
+whatever that provider expects, which the catalog already supplies.
+"""
+
+from __future__ import annotations
+
+from ..audio import SynthesizedAudio
+from ..errors import TextToSpeechError
+from ..port import TextToSpeech
+from ..request import SynthesisRequest
+
+# Hosted providers return MP3-encoded bytes from ``aspeech``.
+# NOTE(review): the adapter passes no ``response_format`` to ``aspeech``, so
+# this relies on each provider's default encoding being MP3 - confirm for
+# non-OpenAI providers.
+_CONTAINER = "mp3"
+
+# Matches the legacy podcaster timeouts; long single segments still finish well
+# under this, and retries cover transient upstream failures.
+# Passed to ``aspeech`` as ``timeout`` (seconds) and ``max_retries``.
+_TIMEOUT_SECONDS = 600
+_MAX_RETRIES = 2
+
+
+class LiteLlmTextToSpeech(TextToSpeech):
+    """Synthesises segments through any LiteLLM-supported hosted TTS model.
+
+    Args:
+        model: LiteLLM model string; it is passed to ``aspeech`` unchanged.
+        api_base: Optional endpoint override; left out of the call when falsy.
+        api_key: Optional credential; left out of the call when falsy.
+    """
+
+    def __init__(
+        self,
+        *,
+        model: str,
+        api_base: str | None = None,
+        api_key: str | None = None,
+    ) -> None:
+        self._model = model
+        self._api_base = api_base
+        self._api_key = api_key
+
+    @property
+    def container(self) -> str:
+        """Container of the audio this adapter emits (``_CONTAINER``)."""
+        return _CONTAINER
+
+    async def synthesize(self, request: SynthesisRequest) -> SynthesizedAudio:
+        """Voice ``request.text`` through ``litellm.aspeech``.
+
+        The voice reference is passed through untouched. ``request.speed`` is
+        forwarded only when it differs from the default, so default-speed
+        requests are sent exactly as before.
+
+        Raises:
+            TextToSpeechError: if the call fails or the response carries no
+                audio bytes.
+        """
+        from litellm import aspeech
+
+        kwargs: dict[str, object] = {
+            "model": self._model,
+            "voice": request.voice,
+            "input": request.text,
+            "max_retries": _MAX_RETRIES,
+            "timeout": _TIMEOUT_SECONDS,
+        }
+        if request.speed != 1.0:
+            # Previously dropped silently, so a non-default speed was honoured
+            # by the Kokoro adapter but ignored by hosted providers.
+            kwargs["speed"] = request.speed
+        if self._api_base:
+            kwargs["api_base"] = self._api_base
+        if self._api_key:
+            kwargs["api_key"] = self._api_key
+
+        try:
+            response = await aspeech(**kwargs)
+        except Exception as exc:  # noqa: BLE001 - normalise provider errors
+            raise TextToSpeechError(
+                f"{self._model} synthesis failed: {exc}"
+            ) from exc
+
+        # A missing or empty ``content`` attribute both count as "no audio".
+        data = getattr(response, "content", None)
+        if not data:
+            raise TextToSpeechError(f"{self._model} returned no audio")
+
+        return SynthesizedAudio(data=data, container=_CONTAINER)
--- a/surfsense_backend/app/podcasts/tts/audio.py
+++ b/surfsense_backend/app/podcasts/tts/audio.py
@ -0,0 +1,19 @@
+"""The bytes a TTS provider returns for one segment."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True, slots=True)
+class SynthesizedAudio:
+    """Encoded audio for a single segment, ready to cache and concatenate.
+
+    ``container`` is the file extension the bytes are encoded as (``"wav"`` or
+    ``"mp3"``); the renderer uses it to name the on-disk segment so FFmpeg can
+    demux the right format during merge.
+
+    Instances are immutable (``frozen=True``).
+    """
+
+    # Encoded audio bytes exactly as produced by the adapter.
+    data: bytes
+    # File extension / container of ``data``.
+    container: str
+    # Sample rate in Hz when the adapter reports one; the Kokoro adapter sets
+    # it, the LiteLLM adapter leaves it as ``None``.
+    sample_rate: int | None = None
--- a/surfsense_backend/app/podcasts/tts/errors.py
+++ b/surfsense_backend/app/podcasts/tts/errors.py
@ -0,0 +1,13 @@
+"""Failures raised by the TTS layer."""
+
+from __future__ import annotations
+
+
+class TextToSpeechError(RuntimeError):
+    """A provider failed to synthesise a segment.
+
+    Raised for both configuration faults (an unusable voice reference) and
+    provider faults (the upstream call errored or returned no audio), so the
+    renderer can fail the segment without unwrapping provider-specific
+    exceptions.
+
+    When an adapter wraps a provider exception it chains it with
+    ``raise ... from``, so the original error stays available as
+    ``__cause__``.
+    """
--- a/surfsense_backend/app/podcasts/tts/factory.py
+++ b/surfsense_backend/app/podcasts/tts/factory.py
@ -0,0 +1,38 @@
+"""Resolve the configured :class:`TextToSpeech` as a process-wide singleton."""
+
+from __future__ import annotations
+
+from functools import lru_cache
+
+from .port import TextToSpeech
+
+# Sentinel model string that selects the local Kokoro pipeline; anything else is
+# treated as a LiteLLM-hosted model (``openai/...``, ``vertex_ai/...``, etc.).
+KOKORO_SERVICE = "local/kokoro"
+
+
+@lru_cache(maxsize=1)
+def get_text_to_speech() -> TextToSpeech:
+    """Return the process-wide provider chosen by ``TTS_SERVICE``.
+
+    The result is memoised so the Kokoro adapter's loaded pipelines are shared
+    by every caller instead of being rebuilt. Configuration and adapter
+    modules are imported lazily, only for the branch that is taken.
+
+    Raises:
+        ValueError: if ``TTS_SERVICE`` is empty or unset.
+    """
+    from app.config import config as settings
+
+    selected = settings.TTS_SERVICE
+    if not selected:
+        raise ValueError("TTS_SERVICE is not configured")
+
+    if selected != KOKORO_SERVICE:
+        # Anything but the Kokoro sentinel is handed to LiteLLM as a model name.
+        from .adapters.litellm import LiteLlmTextToSpeech
+
+        return LiteLlmTextToSpeech(
+            model=selected,
+            api_base=settings.TTS_SERVICE_API_BASE,
+            api_key=settings.TTS_SERVICE_API_KEY,
+        )
+
+    from .adapters.kokoro import KokoroTextToSpeech
+
+    return KokoroTextToSpeech()
--- a/surfsense_backend/app/podcasts/tts/port.py
+++ b/surfsense_backend/app/podcasts/tts/port.py
@ -0,0 +1,31 @@
+"""The TTS contract: turn one segment of text into encoded audio."""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+from .audio import SynthesizedAudio
+from .request import SynthesisRequest
+
+
+class TextToSpeech(ABC):
+    """Synthesises a single segment; one implementation per provider.
+
+    The contract is intentionally per-segment rather than per-episode: it keeps
+    each call independently cacheable and lets the renderer cap concurrency and
+    retry segments in isolation. Stitching segments into one file is the
+    renderer's job, not the provider's.
+    """
+
+    @property
+    @abstractmethod
+    def container(self) -> str:
+        """File extension/container this provider emits (e.g. ``"mp3"``).
+
+        The adapters in this package return ``"wav"`` (Kokoro) and ``"mp3"``
+        (LiteLLM).
+        """
+
+    @abstractmethod
+    async def synthesize(self, request: SynthesisRequest) -> SynthesizedAudio:
+        """Voice ``request.text`` and return its encoded audio.
+
+        Args:
+            request: The text, voice reference, language and speed for one
+                segment.
+
+        Returns:
+            The encoded bytes together with their container.
+
+        Raises :class:`~app.podcasts.tts.errors.TextToSpeechError` on any
+        provider or configuration failure.
+        """
--- a/surfsense_backend/app/podcasts/tts/request.py
+++ b/surfsense_backend/app/podcasts/tts/request.py
@ -0,0 +1,22 @@
+"""What the renderer hands a TTS provider to voice a single segment."""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from dataclasses import dataclass
+from typing import Any
+
+# A provider-native voice reference. OpenAI/Azure/Kokoro name a voice with a
+# string; Vertex passes a mapping (``languageCode`` + ``name``). The catalog
+# stores whichever shape the provider expects and we pass it through untouched.
+VoiceRef = str | Mapping[str, Any]
+
+
+@dataclass(frozen=True, slots=True)
+class SynthesisRequest:
+    """One unit of speech to synthesise: the smallest cacheable render step.
+
+    Instances are immutable (``frozen=True``).
+    """
+
+    # The text to voice.
+    text: str
+    # Provider-native voice reference: a string or a mapping (see ``VoiceRef``).
+    voice: VoiceRef
+    # Language tag for the segment; the Kokoro adapter uses it to pick its
+    # pipeline.
+    language: str
+    # Speed multiplier; the Kokoro adapter forwards it as ``speed``. Defaults
+    # to 1.0.
+    speed: float = 1.0