# Mirror of https://github.com/MODSetter/SurfSense.git
# (synced 2026-06-12 20:45:20 +02:00; 157 lines, 5.7 KiB, Python).
"""Render an approved transcript into a single podcast audio file.
|
|
|
|
The renderer is the only place that turns dialogue into sound. It maps each
|
|
turn to its speaker's voice, synthesises segments concurrently (capped, served
|
|
from the segment cache when possible, and coalesced so identical lines render
|
|
once), then merges them in order. It takes a settled spec + transcript and
|
|
returns bytes; persistence and lifecycle transitions belong to the service.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
from app.podcasts.schemas import PodcastSpec, Transcript, TranscriptTurn
|
|
from app.podcasts.tts import SynthesisRequest, TextToSpeech, TextToSpeechError
|
|
from app.podcasts.voices import VoiceCatalog
|
|
|
|
from .cache import SegmentCache
|
|
from .errors import RenderError
|
|
from .merge import concat_to_mp3
|
|
|
|
# Bounds how many segments synthesise at once. Protects hosted-provider rate
# limits and avoids thrashing the local Kokoro pipeline; the renderer is I/O- or
# model-bound per segment, so a small pool already saturates throughput.
DEFAULT_MAX_CONCURRENCY = 4

# File name of the merged episode written inside the render working directory.
_MERGED_FILENAME = "podcast.mp3"
|
|
|
|
|
|
@dataclass(frozen=True, slots=True)
|
|
class RenderedPodcast:
|
|
"""The finished episode: encoded bytes plus their container."""
|
|
|
|
data: bytes
|
|
container: str
|
|
|
|
|
|
class PodcastRenderer:
    """Synthesises and merges a transcript using one TTS provider.

    Args:
        tts: Provider used to synthesise every segment.
        catalog: Lookup from a speaker's ``voice_id`` to a voice entry.
        max_concurrency: Upper bound on segments synthesised at once within a
            single render; must be at least 1.

    Raises:
        ValueError: If ``max_concurrency`` is smaller than 1.
    """

    def __init__(
        self,
        *,
        tts: TextToSpeech,
        catalog: VoiceCatalog,
        max_concurrency: int = DEFAULT_MAX_CONCURRENCY,
    ) -> None:
        # A semaphore with zero permits never admits a segment, so a render
        # would wait forever; reject the misconfiguration up front instead.
        if max_concurrency < 1:
            raise ValueError(
                f"max_concurrency must be at least 1, got {max_concurrency!r}"
            )
        self._tts = tts
        self._catalog = catalog
        self._max_concurrency = max_concurrency

    async def render(
        self,
        *,
        spec: PodcastSpec,
        transcript: Transcript,
        workdir: Path,
    ) -> RenderedPodcast:
        """Produce the merged MP3 for ``transcript`` under ``spec``.

        ``workdir`` holds the segment cache and merge output; reusing the same
        directory across renders is what makes voice edits cheap.

        Raises:
            RenderError: If a turn references an unknown speaker slot or voice,
                or if synthesising a segment fails.
        """
        cache = SegmentCache(workdir / "segments")
        requests = [self._request_for(spec, turn) for turn in transcript.turns]

        # Concurrency primitives are created per render so each call is bound to
        # the event loop running it (Celery tasks may use a fresh loop).
        synthesizer = _SegmentSynthesizer(self._tts, cache, self._max_concurrency)
        tasks = [
            asyncio.create_task(synthesizer.segment(request))
            for request in requests
        ]
        try:
            # gather() keeps the results in request order, which is the order
            # the segments are merged in.
            segment_paths = await asyncio.gather(*tasks)
        except BaseException:
            # gather() propagates the first failure but leaves the sibling
            # tasks running. Cancel and drain them so a failed or cancelled
            # render stops calling the TTS provider and leaves no orphaned
            # tasks behind on the loop.
            for task in tasks:
                task.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)
            raise

        output_path = workdir / _MERGED_FILENAME
        await concat_to_mp3(list(segment_paths), output_path)
        return RenderedPodcast(data=output_path.read_bytes(), container="mp3")

    def _request_for(
        self, spec: PodcastSpec, turn: TranscriptTurn
    ) -> SynthesisRequest:
        """Build the synthesis request for one transcript turn.

        Resolves the turn's speaker slot through ``spec`` and that speaker's
        voice through the catalog.

        Raises:
            RenderError: If the speaker slot or the voice id cannot be resolved
                (the lookups signal this with ``KeyError``).
        """
        try:
            speaker = spec.speaker_for(turn.speaker)
        except KeyError as exc:
            raise RenderError(
                f"transcript references unknown speaker slot {turn.speaker}"
            ) from exc
        try:
            voice = self._catalog.get(speaker.voice_id)
        except KeyError as exc:
            raise RenderError(f"unknown voice {speaker.voice_id!r}") from exc
        return SynthesisRequest(
            text=turn.text, voice=voice.native_ref, language=spec.language
        )
|
|
|
|
|
|
class _SegmentSynthesizer:
    """Per-render synthesis coordinator: caps concurrency and dedupes work.

    Beyond the on-disk cache (which serves cross-render reuse), this coalesces
    identical segments that race within one render so the same line is voiced
    once even when several turns request it simultaneously.
    """

    def __init__(
        self, tts: TextToSpeech, cache: SegmentCache, max_concurrency: int
    ) -> None:
        self._tts = tts
        self._cache = cache
        # Container used for cache lookups; taken from the provider once.
        self._container = tts.container
        self._semaphore = asyncio.Semaphore(max_concurrency)
        # Cache key -> future shared by every caller waiting on that segment.
        self._inflight: dict[str, asyncio.Future[Path]] = {}
        self._inflight_lock = asyncio.Lock()

    async def segment(self, request: SynthesisRequest) -> Path:
        """Return the audio file for ``request``, synthesising it at most once.

        A cache hit is returned directly. Otherwise the first caller for a key
        becomes the owner and runs the synthesis; later callers for the same
        key wait on the owner's outcome. A failure in the owner is re-raised
        in every waiter.
        """
        key = self._cache.key(request)
        cached = self._cache.get(key, self._container)
        if cached is not None:
            return cached

        async with self._inflight_lock:
            future = self._inflight.get(key)
            owner = future is None
            if owner:
                future = asyncio.get_running_loop().create_future()
                self._inflight[key] = future

        # The owner runs the work and publishes the outcome on the shared
        # future; every caller (owner included) reads it back from that future,
        # so a stored exception is always retrieved.
        if owner:
            try:
                path = await self._synthesize(request, key)
            except BaseException as exc:  # noqa: BLE001 - relayed to all waiters
                # Guard against a future that was already settled externally;
                # publishing twice would raise InvalidStateError.
                if not future.done():
                    future.set_exception(exc)
            else:
                if not future.done():
                    future.set_result(path)
            finally:
                await self._forget(key)

        # Cancelling a task that awaits a future cancels that future too.
        # shield() keeps one cancelled waiter from cancelling the shared
        # future underneath the owner and the remaining waiters.
        return await asyncio.shield(future)

    async def _synthesize(self, request: SynthesisRequest, key: str) -> Path:
        """Synthesise one segment under the concurrency cap and cache it.

        Raises:
            RenderError: If the provider raises ``TextToSpeechError``.
        """
        async with self._semaphore:
            # Re-check after waiting for a slot: the segment may have been
            # cached while this call was queued.
            cached = self._cache.get(key, self._container)
            if cached is not None:
                return cached
            try:
                audio = await self._tts.synthesize(request)
            except TextToSpeechError as exc:
                raise RenderError(f"segment synthesis failed: {exc}") from exc
            return self._cache.put(key, audio.container, audio.data)

    async def _forget(self, key: str) -> None:
        """Drop ``key`` from the in-flight table once its outcome is settled."""
        async with self._inflight_lock:
            self._inflight.pop(key, None)
|