mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-12 20:45:20 +02:00
feat(podcasts): add voice catalog
This commit is contained in:
parent
65b6c2d357
commit
ee24925747
4 changed files with 154 additions and 0 deletions
22
surfsense_backend/app/podcasts/voices/__init__.py
Normal file
22
surfsense_backend/app/podcasts/voices/__init__.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
"""Voices: the catalog of selectable TTS voices and the active provider.
|
||||
|
||||
Replaces the legacy hardcoded speaker-id voice maps. Callers obtain the
|
||||
catalog via :func:`get_voice_catalog` and identify the configured provider via
|
||||
:func:`provider_from_service`.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .catalog import VoiceCatalog, get_voice_catalog
|
||||
from .provider import TtsProvider, provider_from_service
|
||||
from .voice import ANY_LANGUAGE, CatalogVoice, VoiceGender
|
||||
|
||||
__all__ = [
|
||||
"ANY_LANGUAGE",
|
||||
"CatalogVoice",
|
||||
"TtsProvider",
|
||||
"VoiceCatalog",
|
||||
"VoiceGender",
|
||||
"get_voice_catalog",
|
||||
"provider_from_service",
|
||||
]
|
||||
55
surfsense_backend/app/podcasts/voices/catalog.py
Normal file
55
surfsense_backend/app/podcasts/voices/catalog.py
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
"""The voice catalog: look up and filter selectable voices.
|
||||
|
||||
A :class:`VoiceCatalog` is the single source of truth for which voices exist,
|
||||
replacing the hardcoded speaker-id maps. Resolution uses it to pick defaults
|
||||
for a brief, the API exposes it as picker options, and the renderer uses it to
|
||||
turn a stored ``voice_id`` back into the provider-native reference.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Iterable
|
||||
from functools import lru_cache
|
||||
|
||||
from .data import AZURE_VOICES, KOKORO_VOICES, OPENAI_VOICES, VERTEX_VOICES
|
||||
from .provider import TtsProvider
|
||||
from .voice import CatalogVoice
|
||||
|
||||
|
||||
class VoiceCatalog:
    """Lookup structure over a fixed set of :class:`CatalogVoice` entries.

    The voices are indexed twice when the catalog is built: once by
    ``voice_id`` for direct lookup, and once by provider for roster queries.
    Within a provider, voices keep the order in which they were supplied.
    """

    def __init__(self, voices: Iterable[CatalogVoice]) -> None:
        """Index ``voices``.

        Raises:
            ValueError: if two voices share the same ``voice_id``.
        """
        by_id: dict[str, CatalogVoice] = {}
        by_provider: dict[TtsProvider, list[CatalogVoice]] = {}
        for entry in voices:
            key = entry.voice_id
            if key in by_id:
                raise ValueError(f"duplicate voice_id: {key}")
            by_id[key] = entry
            roster = by_provider.get(entry.provider)
            if roster is None:
                # First voice seen for this provider: start its roster.
                roster = by_provider[entry.provider] = []
            roster.append(entry)
        self._by_id = by_id
        self._by_provider = by_provider

    def get(self, voice_id: str) -> CatalogVoice:
        """Look up a voice by its id; an unknown id raises ``KeyError``."""
        return self._by_id[voice_id]

    def for_provider(self, provider: TtsProvider) -> list[CatalogVoice]:
        """Return a fresh list of ``provider``'s voices, in catalog order.

        An unknown provider yields an empty list. The result is a copy, so
        callers may mutate it without affecting the catalog.
        """
        roster = self._by_provider.get(provider, ())
        return [*roster]

    def for_language(
        self, provider: TtsProvider, language: str
    ) -> list[CatalogVoice]:
        """Return ``provider``'s voices whose ``speaks(language)`` is true."""
        roster = self.for_provider(provider)
        return [voice for voice in roster if voice.speaks(language)]

    def supports_language(self, provider: TtsProvider, language: str) -> bool:
        """Tell whether any voice of ``provider`` speaks ``language``."""
        roster = self.for_provider(provider)
        return any(voice.speaks(language) for voice in roster)
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_voice_catalog() -> VoiceCatalog:
    """Return the shared catalog built from every provider's voice roster.

    The catalog is assembled on the first call and memoised by ``lru_cache``,
    so later calls return the same instance. Rosters are combined in a fixed
    order: Kokoro, OpenAI, Azure, then Vertex.
    """
    rosters = (KOKORO_VOICES, OPENAI_VOICES, AZURE_VOICES, VERTEX_VOICES)
    return VoiceCatalog(voice for roster in rosters for voice in roster)
|
||||
27
surfsense_backend/app/podcasts/voices/provider.py
Normal file
27
surfsense_backend/app/podcasts/voices/provider.py
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
"""The TTS providers we carry voices for, and how to name one from config."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from enum import StrEnum
|
||||
|
||||
|
||||
class TtsProvider(StrEnum):
|
||||
"""A speech provider whose voices the catalog enumerates."""
|
||||
|
||||
KOKORO = "kokoro"
|
||||
OPENAI = "openai"
|
||||
AZURE = "azure"
|
||||
VERTEX_AI = "vertex_ai"
|
||||
|
||||
|
||||
def provider_from_service(service: str) -> TtsProvider:
    """Resolve a ``TTS_SERVICE`` config string to its :class:`TtsProvider`.

    The config value is a LiteLLM-style ``provider/model`` string
    (``openai/tts-1``, ``vertex_ai/...``). Only the text before the first
    ``/`` is examined, after trimming whitespace and lower-casing it.

    Local Kokoro is the one special case: it is configured as
    ``local/kokoro``, so the ``local`` prefix maps to
    :attr:`TtsProvider.KOKORO`. Every other prefix must equal a
    :class:`TtsProvider` value.

    Raises:
        ValueError: if the prefix is not ``local`` and names no provider.
    """
    head, _, _ = service.partition("/")
    token = head.strip().lower()
    return TtsProvider.KOKORO if token == "local" else TtsProvider(token)
|
||||
50
surfsense_backend/app/podcasts/voices/voice.py
Normal file
50
surfsense_backend/app/podcasts/voices/voice.py
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
"""A catalog voice: a stable id paired with its provider-native reference."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from enum import StrEnum
|
||||
|
||||
from app.podcasts.tts import VoiceRef
|
||||
|
||||
from .provider import TtsProvider
|
||||
|
||||
# Wildcard language tag for a voice that speaks whatever language the input
# text is written in (e.g. OpenAI's voices). A voice carrying this tag matches
# every requested language.
|
||||
ANY_LANGUAGE = "*"
|
||||
|
||||
|
||||
class VoiceGender(StrEnum):
|
||||
"""Perceived voice gender, used to pick distinct voices per speaker."""
|
||||
|
||||
MALE = "male"
|
||||
FEMALE = "female"
|
||||
NEUTRAL = "neutral"
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class CatalogVoice:
|
||||
"""One selectable voice.
|
||||
|
||||
``voice_id`` is the provider-prefixed, stable id stored on a speaker in the
|
||||
brief (e.g. ``"kokoro:am_adam"``). ``native_ref`` is the untyped value the
|
||||
TTS adapter passes to the provider — a string for most, a mapping for
|
||||
Vertex — kept separate so renaming the catalog id never breaks synthesis.
|
||||
"""
|
||||
|
||||
voice_id: str
|
||||
provider: TtsProvider
|
||||
language: str
|
||||
display_name: str
|
||||
gender: VoiceGender
|
||||
native_ref: VoiceRef
|
||||
|
||||
def speaks(self, language: str) -> bool:
|
||||
"""Whether this voice can render ``language`` (primary subtag match)."""
|
||||
if self.language == ANY_LANGUAGE:
|
||||
return True
|
||||
return _primary(self.language) == _primary(language)
|
||||
|
||||
|
||||
def _primary(language: str) -> str:
|
||||
return language.split("-", 1)[0].strip().lower()
|
||||
Loading…
Add table
Add a link
Reference in a new issue