mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-12 20:45:20 +02:00
feat(podcasts): add text-to-speech adapters
This commit is contained in:
parent
ee24925747
commit
75287020e1
9 changed files with 328 additions and 0 deletions
22
surfsense_backend/app/podcasts/tts/__init__.py
Normal file
22
surfsense_backend/app/podcasts/tts/__init__.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
"""Text-to-speech: a per-segment synthesis port with provider adapters.
|
||||
|
||||
Callers depend on :class:`TextToSpeech` and obtain the configured provider from
|
||||
:func:`get_text_to_speech`; the concrete Kokoro/LiteLLM adapters stay private.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .audio import SynthesizedAudio
|
||||
from .errors import TextToSpeechError
|
||||
from .factory import get_text_to_speech
|
||||
from .port import TextToSpeech
|
||||
from .request import SynthesisRequest, VoiceRef
|
||||
|
||||
__all__ = [
|
||||
"SynthesisRequest",
|
||||
"SynthesizedAudio",
|
||||
"TextToSpeech",
|
||||
"TextToSpeechError",
|
||||
"VoiceRef",
|
||||
"get_text_to_speech",
|
||||
]
|
||||
3
surfsense_backend/app/podcasts/tts/adapters/__init__.py
Normal file
3
surfsense_backend/app/podcasts/tts/adapters/__init__.py
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
"""Per-provider TextToSpeech implementations."""
|
||||
|
||||
from __future__ import annotations
|
||||
111
surfsense_backend/app/podcasts/tts/adapters/kokoro.py
Normal file
111
surfsense_backend/app/podcasts/tts/adapters/kokoro.py
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
"""Local Kokoro adapter: on-box synthesis, no network or per-segment cost.
|
||||
|
||||
Kokoro selects its language model by a single-letter ``lang_code``, so this
|
||||
adapter maps the brief's BCP-47 tag to that code and caches one pipeline per
|
||||
code (pipeline construction loads weights and is expensive). Pipelines run in a
|
||||
thread pool because Kokoro is synchronous; the renderer caps how many segments
|
||||
synthesise at once.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from ..audio import SynthesizedAudio
|
||||
from ..errors import TextToSpeechError
|
||||
from ..port import TextToSpeech
|
||||
from ..request import SynthesisRequest
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from kokoro import KPipeline
|
||||
|
||||
# Kokoro emits 24 kHz mono PCM regardless of voice.
_SAMPLE_RATE = 24000

# BCP-47 primary subtag -> Kokoro language code. English defaults to American;
# the en-GB region override below switches it to British.
_LANG_CODE_BY_PRIMARY = {
    "en": "a",
    "es": "e",
    "fr": "f",
    "hi": "h",
    "it": "i",
    "ja": "j",
    "pt": "p",
    "zh": "z",
}
|
||||
|
||||
|
||||
class KokoroTextToSpeech(TextToSpeech):
    """Synthesises segments with locally hosted Kokoro pipelines.

    One ``KPipeline`` is built lazily per Kokoro language code and kept on the
    adapter so later segments in the same language reuse it.
    """

    def __init__(self) -> None:
        # Kokoro language code -> pipeline already constructed for it.
        self._pipelines: dict[str, KPipeline] = {}

    @property
    def container(self) -> str:
        """Container of the audio this adapter returns (always ``"wav"``)."""
        return "wav"

    async def synthesize(self, request: SynthesisRequest) -> SynthesizedAudio:
        """Voice ``request.text`` with Kokoro and return it as WAV bytes.

        Raises:
            TextToSpeechError: if the voice is not a string, the language has
                no Kokoro model, the pipeline cannot be loaded, synthesis
                fails, or Kokoro yields no audio.
        """
        if not isinstance(request.voice, str):
            raise TextToSpeechError(
                "Kokoro voices are named by string, not a mapping"
            )

        pipeline = self._pipeline_for(request.language)

        def _render() -> list:
            # Calling the pipeline only returns a generator; the synthesis work
            # happens while it is iterated. Drain it here so that work runs on
            # the worker thread instead of blocking the event loop.
            chunks = pipeline(
                request.text,
                voice=request.voice,
                speed=request.speed,
                split_pattern=r"\n+",
            )
            return [audio for _gs, _ps, audio in chunks]

        try:
            loop = asyncio.get_running_loop()
            segments = await loop.run_in_executor(None, _render)
        except Exception as exc:  # noqa: BLE001 - normalise provider errors
            raise TextToSpeechError(f"Kokoro synthesis failed: {exc}") from exc

        if not segments:
            raise TextToSpeechError("Kokoro produced no audio for the text")

        return SynthesizedAudio(
            data=_encode_wav(segments, _SAMPLE_RATE),
            container="wav",
            sample_rate=_SAMPLE_RATE,
        )

    def _pipeline_for(self, language: str) -> KPipeline:
        """Return the cached pipeline for ``language``, building it on first use.

        Raises:
            TextToSpeechError: if the language maps to no Kokoro code, or the
                ``kokoro`` package / pipeline cannot be loaded.
        """
        lang_code = _lang_code(language)
        pipeline = self._pipelines.get(lang_code)
        if pipeline is None:
            try:
                # Imported lazily so the dependency is only needed when this
                # adapter is actually used.
                from kokoro import KPipeline

                pipeline = KPipeline(lang_code=lang_code)
            except Exception as exc:  # noqa: BLE001 - normalise provider errors
                raise TextToSpeechError(
                    f"Kokoro pipeline for {language!r} could not be loaded: {exc}"
                ) from exc
            self._pipelines[lang_code] = pipeline
        return pipeline
|
||||
|
||||
|
||||
def _lang_code(language: str) -> str:
    """Map a language tag to Kokoro's single-letter language code.

    Matching is case-insensitive, ignores surrounding whitespace and accepts
    both BCP-47 hyphens (``en-GB``) and locale-style underscores (``en_GB``).
    British English (``en-GB...`` or ``en-UK``) maps to ``"b"``; every other
    tag is resolved by its primary subtag.

    Raises:
        TextToSpeechError: if the primary subtag has no Kokoro language code.
    """
    # Locale identifiers such as "en_GB" are common; treat "_" like "-".
    normalised = language.strip().lower().replace("_", "-")
    if normalised.startswith("en-gb") or normalised == "en-uk":
        return "b"
    primary = normalised.partition("-")[0]
    code = _LANG_CODE_BY_PRIMARY.get(primary)
    if code is None:
        raise TextToSpeechError(f"Kokoro has no language model for {language!r}")
    return code
|
||||
|
||||
|
||||
def _encode_wav(segments: list, sample_rate: int) -> bytes:
    """Join the synthesised chunks and encode them as one in-memory WAV file.

    ``segments`` must be non-empty; the caller rejects an empty result before
    encoding. Returns the complete WAV file contents as bytes.
    """
    # Imported lazily so these packages are only needed when Kokoro is used.
    import numpy as np
    import soundfile as sf

    # A lone chunk is written as-is; several are joined end to end first.
    waveform = segments[0] if len(segments) == 1 else np.concatenate(segments)
    buffer = io.BytesIO()
    sf.write(buffer, waveform, sample_rate, format="WAV")
    return buffer.getvalue()
|
||||
69
surfsense_backend/app/podcasts/tts/adapters/litellm.py
Normal file
69
surfsense_backend/app/podcasts/tts/adapters/litellm.py
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
"""LiteLLM adapter: hosted TTS (OpenAI, Azure, Vertex AI) via one ``aspeech`` call.
|
||||
|
||||
LiteLLM normalises every hosted provider behind the same ``aspeech`` surface,
|
||||
so a single adapter covers them all. The provider is encoded in the model
|
||||
string (e.g. ``openai/tts-1``, ``vertex_ai/...``) and the voice reference is
|
||||
whatever that provider expects, which the catalog already supplies.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from ..audio import SynthesizedAudio
|
||||
from ..errors import TextToSpeechError
|
||||
from ..port import TextToSpeech
|
||||
from ..request import SynthesisRequest
|
||||
|
||||
# Hosted providers return MP3-encoded bytes from ``aspeech``.
_CONTAINER = "mp3"

# Matches the legacy podcaster timeouts; long single segments still finish well
# under this, and retries cover transient upstream failures.
_TIMEOUT_SECONDS = 600
_MAX_RETRIES = 2
|
||||
|
||||
|
||||
class LiteLlmTextToSpeech(TextToSpeech):
    """Synthesises segments through any LiteLLM-supported hosted TTS model."""

    def __init__(
        self,
        *,
        model: str,
        api_base: str | None = None,
        api_key: str | None = None,
    ) -> None:
        """Store the LiteLLM model string and optional endpoint credentials.

        ``api_base`` and ``api_key`` are forwarded to ``aspeech`` only when set.
        """
        self._model = model
        self._api_base = api_base
        self._api_key = api_key

    @property
    def container(self) -> str:
        """Container of the audio this adapter returns (``"mp3"``)."""
        return _CONTAINER

    async def synthesize(self, request: SynthesisRequest) -> SynthesizedAudio:
        """Voice ``request.text`` with one ``aspeech`` call and return its bytes.

        The voice reference is passed through untouched.

        Raises:
            TextToSpeechError: if the ``aspeech`` call raises, or its response
                carries no ``content``.
        """
        # Imported lazily so litellm is only loaded when this adapter is used.
        from litellm import aspeech

        kwargs = {
            "model": self._model,
            "voice": request.voice,
            "input": request.text,
            "max_retries": _MAX_RETRIES,
            "timeout": _TIMEOUT_SECONDS,
        }
        # Only forward overrides that were actually configured.
        if self._api_base:
            kwargs["api_base"] = self._api_base
        if self._api_key:
            kwargs["api_key"] = self._api_key

        try:
            response = await aspeech(**kwargs)
        except Exception as exc:  # noqa: BLE001 - normalise provider errors
            raise TextToSpeechError(
                f"{self._model} synthesis failed: {exc}"
            ) from exc

        # A missing or empty ``content`` attribute is treated as a failure.
        data = getattr(response, "content", None)
        if not data:
            raise TextToSpeechError(f"{self._model} returned no audio")

        return SynthesizedAudio(data=data, container=_CONTAINER)
|
||||
19
surfsense_backend/app/podcasts/tts/audio.py
Normal file
19
surfsense_backend/app/podcasts/tts/audio.py
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
"""The bytes a TTS provider returns for one segment."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class SynthesizedAudio:
|
||||
"""Encoded audio for a single segment, ready to cache and concatenate.
|
||||
|
||||
``container`` is the file extension the bytes are encoded as (``"wav"`` or
|
||||
``"mp3"``); the renderer uses it to name the on-disk segment so FFmpeg can
|
||||
demux the right format during merge.
|
||||
"""
|
||||
|
||||
data: bytes
|
||||
container: str
|
||||
sample_rate: int | None = None
|
||||
13
surfsense_backend/app/podcasts/tts/errors.py
Normal file
13
surfsense_backend/app/podcasts/tts/errors.py
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
"""Failures raised by the TTS layer."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
class TextToSpeechError(RuntimeError):
    """A provider failed to synthesise a segment.

    Raised for both configuration faults (an unusable voice reference) and
    provider faults (the upstream call errored or returned no audio), so the
    renderer can fail the segment without unwrapping provider-specific
    exceptions.
    """
|
||||
38
surfsense_backend/app/podcasts/tts/factory.py
Normal file
38
surfsense_backend/app/podcasts/tts/factory.py
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
"""Resolve the configured :class:`TextToSpeech` as a process-wide singleton."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import lru_cache
|
||||
|
||||
from .port import TextToSpeech
|
||||
|
||||
# Sentinel model string that selects the local Kokoro pipeline; anything else is
# treated as a LiteLLM-hosted model (``openai/...``, ``vertex_ai/...``, etc.).
KOKORO_SERVICE = "local/kokoro"
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_text_to_speech() -> TextToSpeech:
    """Build the provider selected by ``TTS_SERVICE`` (adapters lazy-imported).

    Cached because the Kokoro adapter holds loaded pipelines that must be reused
    across segments and requests rather than rebuilt per call.

    Raises:
        ValueError: if ``TTS_SERVICE`` is unset or empty.
    """
    # Imported here, not at module import time, so the config is read lazily.
    from app.config import config as app_config

    service = app_config.TTS_SERVICE
    if not service:
        raise ValueError("TTS_SERVICE is not configured")

    if service == KOKORO_SERVICE:
        from .adapters.kokoro import KokoroTextToSpeech

        return KokoroTextToSpeech()

    # Any other value is handed to LiteLLM as the model string.
    from .adapters.litellm import LiteLlmTextToSpeech

    return LiteLlmTextToSpeech(
        model=service,
        api_base=app_config.TTS_SERVICE_API_BASE,
        api_key=app_config.TTS_SERVICE_API_KEY,
    )
|
||||
31
surfsense_backend/app/podcasts/tts/port.py
Normal file
31
surfsense_backend/app/podcasts/tts/port.py
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
"""The TTS contract: turn one segment of text into encoded audio."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from .audio import SynthesizedAudio
|
||||
from .request import SynthesisRequest
|
||||
|
||||
|
||||
class TextToSpeech(ABC):
    """Synthesises a single segment; one implementation per provider.

    The contract is intentionally per-segment rather than per-episode: it keeps
    each call independently cacheable and lets the renderer cap concurrency and
    retry segments in isolation. Stitching segments into one file is the
    renderer's job, not the provider's.
    """

    @property
    @abstractmethod
    def container(self) -> str:
        """File extension/container this provider emits (e.g. ``"mp3"``)."""

    @abstractmethod
    async def synthesize(self, request: SynthesisRequest) -> SynthesizedAudio:
        """Voice ``request.text`` and return its encoded audio.

        Raises :class:`~app.podcasts.tts.errors.TextToSpeechError` on any
        provider or configuration failure.
        """
|
||||
22
surfsense_backend/app/podcasts/tts/request.py
Normal file
22
surfsense_backend/app/podcasts/tts/request.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
"""What the renderer hands a TTS provider to voice a single segment."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Mapping
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
# A provider-native voice reference. OpenAI/Azure/Kokoro name a voice with a
|
||||
# string; Vertex passes a mapping (``languageCode`` + ``name``). The catalog
|
||||
# stores whichever shape the provider expects and we pass it through untouched.
|
||||
VoiceRef = str | Mapping[str, Any]
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class SynthesisRequest:
|
||||
"""One unit of speech to synthesise: the smallest cacheable render step."""
|
||||
|
||||
text: str
|
||||
voice: VoiceRef
|
||||
language: str
|
||||
speed: float = 1.0
|
||||
Loading…
Add table
Add a link
Reference in a new issue