diff --git a/surfsense_backend/app/podcasts/voices/__init__.py b/surfsense_backend/app/podcasts/voices/__init__.py new file mode 100644 index 000000000..230b0b540 --- /dev/null +++ b/surfsense_backend/app/podcasts/voices/__init__.py @@ -0,0 +1,22 @@ +"""Voices: the catalog of selectable TTS voices and the active provider. + +Replaces the legacy hardcoded speaker-id voice maps. Callers obtain the +catalog via :func:`get_voice_catalog` and identify the configured provider via +:func:`provider_from_service`. +""" + +from __future__ import annotations + +from .catalog import VoiceCatalog, get_voice_catalog +from .provider import TtsProvider, provider_from_service +from .voice import ANY_LANGUAGE, CatalogVoice, VoiceGender + +__all__ = [ + "ANY_LANGUAGE", + "CatalogVoice", + "TtsProvider", + "VoiceCatalog", + "VoiceGender", + "get_voice_catalog", + "provider_from_service", +] diff --git a/surfsense_backend/app/podcasts/voices/catalog.py b/surfsense_backend/app/podcasts/voices/catalog.py new file mode 100644 index 000000000..591812943 --- /dev/null +++ b/surfsense_backend/app/podcasts/voices/catalog.py @@ -0,0 +1,55 @@ +"""The voice catalog: look up and filter selectable voices. + +A :class:`VoiceCatalog` is the single source of truth for which voices exist, +replacing the hardcoded speaker-id maps. Resolution uses it to pick defaults +for a brief, the API exposes it as picker options, and the renderer uses it to +turn a stored ``voice_id`` back into the provider-native reference. 
# --- surfsense_backend/app/podcasts/voices/catalog.py ---


class VoiceCatalog:
    """An indexed, read-only collection of :class:`CatalogVoice`.

    Voices are indexed at construction time both by ``voice_id`` (for direct
    lookup) and by ``provider`` (for listing and filtering). Per-provider
    order is the order in which the voices were supplied. No method mutates
    the indexes, and list-returning methods hand back fresh lists.
    """

    def __init__(self, voices: Iterable[CatalogVoice]) -> None:
        """Index ``voices`` by id and by provider.

        Args:
            voices: The voices to index; consumed once, in order.

        Raises:
            ValueError: If two voices share the same ``voice_id``.
        """
        self._by_id: dict[str, CatalogVoice] = {}
        self._by_provider: dict[TtsProvider, list[CatalogVoice]] = {}
        for voice in voices:
            if voice.voice_id in self._by_id:
                raise ValueError(f"duplicate voice_id: {voice.voice_id}")
            self._by_id[voice.voice_id] = voice
            self._by_provider.setdefault(voice.provider, []).append(voice)

    def _roster(self, provider: TtsProvider) -> Iterable[CatalogVoice]:
        """The indexed voices of ``provider`` without copying (internal use).

        Returns an empty tuple for a provider with no voices, so callers can
        always iterate the result.
        """
        return self._by_provider.get(provider, ())

    def get(self, voice_id: str) -> CatalogVoice:
        """Return the voice with ``voice_id`` or raise ``KeyError``."""
        return self._by_id[voice_id]

    def for_provider(self, provider: TtsProvider) -> list[CatalogVoice]:
        """All voices offered by ``provider``, in catalog order.

        The result is a new list on every call; mutating it does not affect
        the catalog. A provider with no voices yields an empty list.
        """
        return list(self._roster(provider))

    def for_language(
        self, provider: TtsProvider, language: str
    ) -> list[CatalogVoice]:
        """``provider`` voices that can render ``language``, in catalog order."""
        # Filter the indexed roster directly instead of copying it first.
        return [v for v in self._roster(provider) if v.speaks(language)]

    def supports_language(self, provider: TtsProvider, language: str) -> bool:
        """Whether ``provider`` has at least one voice for ``language``."""
        # Short-circuits on the first match and never materialises a list.
        return any(v.speaks(language) for v in self._roster(provider))


@lru_cache(maxsize=1)
def get_voice_catalog() -> VoiceCatalog:
    """The process-wide catalog assembled from every provider's roster.

    The catalog is built on the first call and memoised, so later calls
    return the same :class:`VoiceCatalog` instance. Rosters are concatenated
    in the order Kokoro, OpenAI, Azure, Vertex.

    Raises:
        ValueError: If the combined rosters contain a duplicate ``voice_id``.
    """
    return VoiceCatalog(
        (*KOKORO_VOICES, *OPENAI_VOICES, *AZURE_VOICES, *VERTEX_VOICES)
    )
b/surfsense_backend/app/podcasts/voices/provider.py @@ -0,0 +1,27 @@ +"""The TTS providers we carry voices for, and how to name one from config.""" + +from __future__ import annotations + +from enum import StrEnum + + +class TtsProvider(StrEnum): + """A speech provider whose voices the catalog enumerates.""" + + KOKORO = "kokoro" + OPENAI = "openai" + AZURE = "azure" + VERTEX_AI = "vertex_ai" + + +def provider_from_service(service: str) -> TtsProvider: + """Map a ``TTS_SERVICE`` string to its provider. + + The config value is a LiteLLM-style ``provider/model`` string + (``openai/tts-1``, ``vertex_ai/...``) except for local Kokoro, which is + spelled ``local/kokoro``; both halves of that special case resolve here. + """ + prefix = service.split("/", 1)[0].strip().lower() + if prefix == "local": + return TtsProvider.KOKORO + return TtsProvider(prefix) diff --git a/surfsense_backend/app/podcasts/voices/voice.py b/surfsense_backend/app/podcasts/voices/voice.py new file mode 100644 index 000000000..6478f04b0 --- /dev/null +++ b/surfsense_backend/app/podcasts/voices/voice.py @@ -0,0 +1,50 @@ +"""A catalog voice: a stable id paired with its provider-native reference.""" + +from __future__ import annotations + +from dataclasses import dataclass +from enum import StrEnum + +from app.podcasts.tts import VoiceRef + +from .provider import TtsProvider + +# A voice that speaks whatever language the input text is in (e.g. OpenAI's +# voices), matched against every requested language. +ANY_LANGUAGE = "*" + + +class VoiceGender(StrEnum): + """Perceived voice gender, used to pick distinct voices per speaker.""" + + MALE = "male" + FEMALE = "female" + NEUTRAL = "neutral" + + +@dataclass(frozen=True, slots=True) +class CatalogVoice: + """One selectable voice. + + ``voice_id`` is the provider-prefixed, stable id stored on a speaker in the + brief (e.g. ``"kokoro:am_adam"``). 
``native_ref`` is the untyped value the + TTS adapter passes to the provider — a string for most, a mapping for + Vertex — kept separate so renaming the catalog id never breaks synthesis. + """ + + voice_id: str + provider: TtsProvider + language: str + display_name: str + gender: VoiceGender + native_ref: VoiceRef + + def speaks(self, language: str) -> bool: + """Whether this voice can render ``language`` (primary subtag match).""" + if self.language == ANY_LANGUAGE: + return True + return _primary(self.language) == _primary(language) + + +def _primary(language: str) -> str: + return language.split("-", 1)[0].strip().lower()