diff --git a/surfsense_backend/app/podcasts/tts/__init__.py b/surfsense_backend/app/podcasts/tts/__init__.py
new file mode 100644
index 000000000..16379dc2b
--- /dev/null
+++ b/surfsense_backend/app/podcasts/tts/__init__.py
@@ -0,0 +1,22 @@
+"""Text-to-speech: a per-segment synthesis port with provider adapters.
+
+Callers depend on :class:`TextToSpeech` and obtain the configured provider from
+:func:`get_text_to_speech`; the concrete Kokoro/LiteLLM adapters stay private.
+"""
+
+from __future__ import annotations
+
+from .audio import SynthesizedAudio
+from .errors import TextToSpeechError
+from .factory import get_text_to_speech
+from .port import TextToSpeech
+from .request import SynthesisRequest, VoiceRef
+
+__all__ = [
+    "SynthesisRequest",
+    "SynthesizedAudio",
+    "TextToSpeech",
+    "TextToSpeechError",
+    "VoiceRef",
+    "get_text_to_speech",
+]
diff --git a/surfsense_backend/app/podcasts/tts/adapters/__init__.py b/surfsense_backend/app/podcasts/tts/adapters/__init__.py
new file mode 100644
index 000000000..24d517e55
--- /dev/null
+++ b/surfsense_backend/app/podcasts/tts/adapters/__init__.py
@@ -0,0 +1,3 @@
+"""Per-provider TextToSpeech implementations."""
+
+from __future__ import annotations
diff --git a/surfsense_backend/app/podcasts/tts/adapters/kokoro.py b/surfsense_backend/app/podcasts/tts/adapters/kokoro.py
new file mode 100644
index 000000000..031b48e86
--- /dev/null
+++ b/surfsense_backend/app/podcasts/tts/adapters/kokoro.py
@@ -0,0 +1,140 @@
+"""Local Kokoro adapter: on-box synthesis, no network or per-segment cost.
+
+Kokoro selects its language model by a single-letter ``lang_code``, so this
+adapter maps the brief's BCP-47 tag to that code and caches one pipeline per
+code (pipeline construction loads weights and is expensive). Pipelines run in a
+thread pool because Kokoro is synchronous; the renderer caps how many segments
+synthesise at once.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import io
+from typing import TYPE_CHECKING
+
+from ..audio import SynthesizedAudio
+from ..errors import TextToSpeechError
+from ..port import TextToSpeech
+from ..request import SynthesisRequest
+
+if TYPE_CHECKING:
+    from kokoro import KPipeline
+
+# Kokoro emits 24 kHz mono PCM regardless of voice.
+_SAMPLE_RATE = 24000
+
+# BCP-47 primary subtag -> Kokoro language code. English defaults to American;
+# the en-GB region override below switches it to British.
+_LANG_CODE_BY_PRIMARY = {
+    "en": "a",
+    "es": "e",
+    "fr": "f",
+    "hi": "h",
+    "it": "i",
+    "ja": "j",
+    "pt": "p",
+    "zh": "z",
+}
+
+
+class KokoroTextToSpeech(TextToSpeech):
+    """Synthesises segments with locally hosted Kokoro pipelines.
+
+    One pipeline is built lazily per Kokoro language code and kept on the
+    instance so later segments in the same language reuse it.
+    """
+
+    def __init__(self) -> None:
+        # Kokoro language code -> pipeline built for it.
+        self._pipelines: dict[str, KPipeline] = {}
+
+    @property
+    def container(self) -> str:
+        """Container of the audio this adapter returns (always ``"wav"``)."""
+        return "wav"
+
+    async def synthesize(self, request: SynthesisRequest) -> SynthesizedAudio:
+        """Voice ``request.text`` with Kokoro and return it as WAV bytes.
+
+        Raises:
+            TextToSpeechError: if the voice is not a string, the language has
+                no Kokoro model, the pipeline raises, or it yields no audio.
+        """
+        voice = request.voice
+        if not isinstance(voice, str):
+            raise TextToSpeechError(
+                "Kokoro voices are named by string, not a mapping"
+            )
+
+        pipeline = self._pipeline_for(request.language)
+
+        def render() -> list:
+            # Calling the pipeline only builds a lazy generator; inference
+            # happens while it is iterated. Drain it here so that work runs
+            # on the worker thread instead of blocking the event loop.
+            chunks = pipeline(
+                request.text,
+                voice=voice,
+                speed=request.speed,
+                split_pattern=r"\n+",
+            )
+            return [audio for _gs, _ps, audio in chunks]
+
+        loop = asyncio.get_running_loop()
+        try:
+            segments = await loop.run_in_executor(None, render)
+        except Exception as exc:  # noqa: BLE001 - normalise provider errors
+            raise TextToSpeechError(f"Kokoro synthesis failed: {exc}") from exc
+
+        if not segments:
+            raise TextToSpeechError("Kokoro produced no audio for the text")
+
+        return SynthesizedAudio(
+            data=_encode_wav(segments, _SAMPLE_RATE),
+            container="wav",
+            sample_rate=_SAMPLE_RATE,
+        )
+
+    def _pipeline_for(self, language: str) -> KPipeline:
+        """Return the cached pipeline for ``language``, building it on first use."""
+        lang_code = _lang_code(language)
+        pipeline = self._pipelines.get(lang_code)
+        if pipeline is None:
+            # Imported lazily so the module loads without Kokoro installed.
+            from kokoro import KPipeline
+
+            pipeline = KPipeline(lang_code=lang_code)
+            self._pipelines[lang_code] = pipeline
+        return pipeline
+
+
+def _lang_code(language: str) -> str:
+    """Map a BCP-47 language tag to Kokoro's single-letter language code.
+
+    Matching is case-insensitive and accepts ``_`` as a subtag separator
+    (``en_GB``). British English maps to ``"b"``; every other tag is resolved
+    by its primary subtag.
+
+    Raises:
+        TextToSpeechError: if the primary subtag has no Kokoro language code.
+    """
+    normalised = language.strip().lower().replace("_", "-")
+    if normalised.startswith("en-gb") or normalised == "en-uk":
+        return "b"
+    primary = normalised.partition("-")[0]
+    code = _LANG_CODE_BY_PRIMARY.get(primary)
+    if code is None:
+        raise TextToSpeechError(f"Kokoro has no language model for {language!r}")
+    return code
+
+
+def _encode_wav(segments: list, sample_rate: int) -> bytes:
+    """Join the audio chunks in order and encode them as one WAV file."""
+    import numpy as np
+    import soundfile as sf
+
+    waveform = segments[0] if len(segments) == 1 else np.concatenate(segments)
+    buffer = io.BytesIO()
+    sf.write(buffer, waveform, sample_rate, format="WAV")
+    return buffer.getvalue()
diff --git a/surfsense_backend/app/podcasts/tts/adapters/litellm.py b/surfsense_backend/app/podcasts/tts/adapters/litellm.py
new file mode 100644
index 000000000..55f49bd1e
--- /dev/null
+++ b/surfsense_backend/app/podcasts/tts/adapters/litellm.py
@@ -0,0 +1,69 @@
+"""LiteLLM adapter: hosted TTS (OpenAI, Azure, Vertex AI) via one ``aspeech`` call.
+
+LiteLLM normalises every hosted provider behind the same ``aspeech`` surface,
+so a single adapter covers them all. The provider is encoded in the model
+string (e.g. ``openai/tts-1``, ``vertex_ai/...``) and the voice reference is
+whatever that provider expects, which the catalog already supplies.
+"""
+
+from __future__ import annotations
+
+from ..audio import SynthesizedAudio
+from ..errors import TextToSpeechError
+from ..port import TextToSpeech
+from ..request import SynthesisRequest
+
+# Hosted providers return MP3-encoded bytes from ``aspeech``.
+_CONTAINER = "mp3"
+
+# Matches the legacy podcaster timeouts; long single segments still finish well
+# under this, and retries cover transient upstream failures.
+_TIMEOUT_SECONDS = 600
+_MAX_RETRIES = 2
+
+
+class LiteLlmTextToSpeech(TextToSpeech):
+    """Voices segments through whichever hosted model LiteLLM routes to."""
+
+    def __init__(
+        self,
+        *,
+        model: str,
+        api_base: str | None = None,
+        api_key: str | None = None,
+    ) -> None:
+        self._model = model
+        self._api_base = api_base
+        self._api_key = api_key
+
+    @property
+    def container(self) -> str:
+        return _CONTAINER
+
+    async def synthesize(self, request: SynthesisRequest) -> SynthesizedAudio:
+        from litellm import aspeech
+
+        call_args = {
+            "model": self._model,
+            "voice": request.voice,
+            "input": request.text,
+            "max_retries": _MAX_RETRIES,
+            "timeout": _TIMEOUT_SECONDS,
+        }
+        if self._api_base:
+            call_args["api_base"] = self._api_base
+        if self._api_key:
+            call_args["api_key"] = self._api_key
+
+        try:
+            response = await aspeech(**call_args)
+        except Exception as exc:  # noqa: BLE001 - normalise provider errors
+            message = f"{self._model} synthesis failed: {exc}"
+            raise TextToSpeechError(message) from exc
+
+        # A missing or empty ``content`` is treated as a failed synthesis.
+        payload = getattr(response, "content", None)
+        if payload:
+            return SynthesizedAudio(data=payload, container=_CONTAINER)
+
+        raise TextToSpeechError(f"{self._model} returned no audio")
diff --git a/surfsense_backend/app/podcasts/tts/audio.py b/surfsense_backend/app/podcasts/tts/audio.py
new file mode 100644
index 000000000..f3c79dd5a
--- /dev/null
+++ b/surfsense_backend/app/podcasts/tts/audio.py
@@ -0,0 +1,21 @@
+"""The bytes a TTS provider returns for one segment."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True, slots=True)
+class SynthesizedAudio:
+    """One segment's encoded audio, as returned by a provider.
+
+    The renderer names the on-disk segment after ``container`` (``"wav"`` or
+    ``"mp3"``) so FFmpeg can demux the right format during merge.
+    """
+
+    # Encoded audio bytes for the segment.
+    data: bytes
+    # File extension of the encoding held in ``data``.
+    container: str
+    # Sample rate when the adapter reports one; ``None`` otherwise.
+    sample_rate: int | None = None
diff --git a/surfsense_backend/app/podcasts/tts/errors.py b/surfsense_backend/app/podcasts/tts/errors.py
new file mode 100644
index 000000000..8e7ec3f2b
--- /dev/null
+++ b/surfsense_backend/app/podcasts/tts/errors.py
@@ -0,0 +1,13 @@
+"""Failures raised by the TTS layer."""
+
+from __future__ import annotations
+
+
+class TextToSpeechError(RuntimeError):
+    """Raised when a segment cannot be synthesised.
+
+    One type covers configuration faults (an unusable voice reference) as well
+    as provider faults (the upstream call errored or returned no audio), which
+    lets the renderer fail the segment without unwrapping provider-specific
+    exceptions.
+    """
diff --git a/surfsense_backend/app/podcasts/tts/factory.py b/surfsense_backend/app/podcasts/tts/factory.py
new file mode 100644
index 000000000..7b4a48adf
--- /dev/null
+++ b/surfsense_backend/app/podcasts/tts/factory.py
@@ -0,0 +1,42 @@
+"""Resolve the configured :class:`TextToSpeech` as a process-wide singleton."""
+
+from __future__ import annotations
+
+from functools import lru_cache
+
+from .port import TextToSpeech
+
+# Sentinel model string that selects the local Kokoro pipeline; anything else is
+# treated as a LiteLLM-hosted model (``openai/...``, ``vertex_ai/...``, etc.).
+KOKORO_SERVICE = "local/kokoro"
+
+
+@lru_cache(maxsize=1)
+def get_text_to_speech() -> TextToSpeech:
+    """Return the adapter named by ``TTS_SERVICE``, building it on first call.
+
+    The result is memoised so every caller shares one adapter; the Kokoro
+    adapter in particular holds loaded pipelines that should not be rebuilt.
+    Adapter modules are imported only on the branch that needs them.
+
+    Raises:
+        ValueError: if ``TTS_SERVICE`` is falsy.
+    """
+    from app.config import config as settings
+
+    selected = settings.TTS_SERVICE
+    if not selected:
+        raise ValueError("TTS_SERVICE is not configured")
+
+    if selected != KOKORO_SERVICE:
+        from .adapters.litellm import LiteLlmTextToSpeech
+
+        return LiteLlmTextToSpeech(
+            model=selected,
+            api_base=settings.TTS_SERVICE_API_BASE,
+            api_key=settings.TTS_SERVICE_API_KEY,
+        )
+
+    from .adapters.kokoro import KokoroTextToSpeech
+
+    return KokoroTextToSpeech()
diff --git a/surfsense_backend/app/podcasts/tts/port.py b/surfsense_backend/app/podcasts/tts/port.py
new file mode 100644
index 000000000..604708260
--- /dev/null
+++ b/surfsense_backend/app/podcasts/tts/port.py
@@ -0,0 +1,32 @@
+"""The TTS contract: turn one segment of text into encoded audio."""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+from .audio import SynthesizedAudio
+from .request import SynthesisRequest
+
+
+class TextToSpeech(ABC):
+    """Port implemented once per provider; each call voices one segment.
+
+    Working per segment rather than per episode keeps every call
+    independently cacheable and lets the renderer cap concurrency and retry
+    segments in isolation. Stitching segments into one file is the renderer's
+    job, not the provider's.
+    """
+
+    @property
+    @abstractmethod
+    def container(self) -> str:
+        """File extension/container this provider emits (e.g. ``"mp3"``)."""
+
+    @abstractmethod
+    async def synthesize(self, request: SynthesisRequest) -> SynthesizedAudio:
+        """Synthesise ``request.text`` and return the encoded audio.
+
+        Implementations raise
+        :class:`~app.podcasts.tts.errors.TextToSpeechError` on any provider or
+        configuration failure.
+        """
diff --git a/surfsense_backend/app/podcasts/tts/request.py b/surfsense_backend/app/podcasts/tts/request.py
new file mode 100644
index 000000000..2cb5f6ec4
--- /dev/null
+++ b/surfsense_backend/app/podcasts/tts/request.py
@@ -0,0 +1,26 @@
+"""What the renderer hands a TTS provider to voice a single segment."""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from dataclasses import dataclass
+from typing import Any
+
+# A provider-native voice reference. OpenAI/Azure/Kokoro name a voice with a
+# string; Vertex passes a mapping (``languageCode`` + ``name``). The catalog
+# stores whichever shape the provider expects and we pass it through untouched.
+VoiceRef = str | Mapping[str, Any]
+
+
+@dataclass(frozen=True, slots=True)
+class SynthesisRequest:
+    """A single segment to voice: the smallest cacheable render step."""
+
+    # Text to be spoken.
+    text: str
+    # Provider-native voice reference, passed through as given.
+    voice: VoiceRef
+    # Language tag for the segment.
+    language: str
+    # Requested speed; defaults to 1.0.
+    speed: float = 1.0