mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-12 20:45:20 +02:00
feat(podcasts): add brief and transcript schemas
This commit is contained in:
parent
8dd29fa833
commit
73e191af09
3 changed files with 221 additions and 0 deletions
24
surfsense_backend/app/podcasts/schemas/__init__.py
Normal file
24
surfsense_backend/app/podcasts/schemas/__init__.py
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
"""Pydantic shapes for the podcast brief and transcript."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .spec import (
|
||||
DurationTarget,
|
||||
PodcastSpec,
|
||||
PodcastStyle,
|
||||
SpeakerRole,
|
||||
SpeakerSpec,
|
||||
normalize_language_tag,
|
||||
)
|
||||
from .transcript import Transcript, TranscriptTurn
|
||||
|
||||
# Names re-exported by this package; every entry is imported above from
# ``.spec`` or ``.transcript``.
__all__ = [
    # Brief models (from .spec)
    "DurationTarget",
    "PodcastSpec",
    "PodcastStyle",
    "SpeakerRole",
    "SpeakerSpec",
    # Transcript models (from .transcript)
    "Transcript",
    "TranscriptTurn",
    # Helper (from .spec)
    "normalize_language_tag",
]
|
||||
156
surfsense_backend/app/podcasts/schemas/spec.py
Normal file
156
surfsense_backend/app/podcasts/schemas/spec.py
Normal file
|
|
@ -0,0 +1,156 @@
|
|||
"""The brief: the editable configuration a user approves before drafting.
|
||||
|
||||
A :class:`PodcastSpec` front-loads every decision that drives token or audio
|
||||
cost (language, speakers, voices, style, target length) so the expensive
|
||||
drafting and rendering steps run once against settled inputs. It is stored as
|
||||
JSONB on the ``podcasts`` row and round-trips through the review API.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from enum import StrEnum
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
||||
|
||||
# Upper bound on speakers in a brief. Anything larger is almost never a real
# podcast and blows up the voice/turn-attribution space, so the brief gate
# rejects it.
MAX_SPEAKERS = 6

# Upper bound on the target runtime, in minutes. Long-form is a goal, but an
# open-ended ceiling invites runaway TTS bills; one full day of audio is a
# generous limit that still catches obvious mistakes.
MAX_DURATION_MINUTES = 60 * 24
|
||||
|
||||
# BCP-47 primary subtag (2-3 letters) followed by any number of hyphenated
# 2-8 character alphanumeric subtags (e.g. ``en``, ``en-US``, ``pt-BR``,
# ``zh-Hant-TW``). Kept deliberately permissive: the voice catalog, not the
# brief, decides which languages can actually be synthesised. Casing is
# normalised after matching.
_LANGUAGE_TAG = re.compile(r"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$")


def _canonical_subtag_case(subtag: str) -> str:
    """Return a non-primary subtag in its RFC 5646 recommended casing."""
    if subtag.isalpha():
        if len(subtag) == 2:
            # Two-letter subtag after the primary one: region, e.g. ``US``.
            return subtag.upper()
        if len(subtag) == 4:
            # Four-letter subtag after the primary one: script, e.g. ``Hant``.
            return subtag.title()
    # Everything else (numeric regions, variants, ...) is lowercase.
    return subtag.lower()


def normalize_language_tag(value: str) -> str:
    """Validate and canonicalise a BCP-47 tag.

    Shared with the generation layer so detected and user-entered languages are
    normalised identically before they reach a :class:`PodcastSpec`.

    Surrounding whitespace is stripped, then the tag is checked against
    ``_LANGUAGE_TAG``. The primary subtag is lowercased and every following
    subtag is put into its conventional case (two-letter subtags uppercase,
    four-letter alphabetic subtags title case, anything else lowercase), so
    ``"en-us"``, ``"EN-US"`` and ``"en-US"`` all come back as ``"en-US"``.
    Tags that are already in conventional case are returned unchanged.

    Raises:
        ValueError: if the stripped value is not shaped like a language tag.
    """
    cleaned = value.strip()
    if not _LANGUAGE_TAG.match(cleaned):
        raise ValueError(f"not a valid BCP-47 language tag: {value!r}")
    primary, *rest = cleaned.split("-")
    return "-".join([primary.lower(), *map(_canonical_subtag_case, rest)])
|
||||
|
||||
|
||||
class SpeakerRole(StrEnum):
|
||||
"""How a speaker functions in the conversation, used to steer drafting."""
|
||||
|
||||
HOST = "host"
|
||||
COHOST = "cohost"
|
||||
GUEST = "guest"
|
||||
EXPERT = "expert"
|
||||
NARRATOR = "narrator"
|
||||
|
||||
|
||||
class PodcastStyle(StrEnum):
|
||||
"""The conversational format the transcript should follow."""
|
||||
|
||||
CONVERSATIONAL = "conversational"
|
||||
INTERVIEW = "interview"
|
||||
DEBATE = "debate"
|
||||
MONOLOGUE = "monologue"
|
||||
NARRATIVE = "narrative"
|
||||
|
||||
|
||||
class SpeakerSpec(BaseModel):
    """A single podcast voice: its identity plus the TTS voice that renders it.

    ``slot`` is the stable join key. Transcript turns point at a speaker by
    ``slot`` and the renderer looks up ``voice_id`` for that same slot, so the
    two stay aligned even when speakers are reordered in the brief.

    Unknown fields are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    slot: int = Field(
        ...,
        ge=0,
        description="Stable index a transcript turn references",
    )
    name: str = Field(..., min_length=1, max_length=120)
    role: SpeakerRole
    voice_id: str = Field(
        ...,
        min_length=1,
        description="Catalog voice id valid for the spec's language and provider",
    )

    @field_validator("name", "voice_id")
    @classmethod
    def _strip_required_text(cls, value: str) -> str:
        """Trim surrounding whitespace and reject whitespace-only values."""
        trimmed = value.strip()
        if trimmed:
            return trimmed
        raise ValueError("must not be blank")
|
||||
|
||||
|
||||
class DurationTarget(BaseModel):
    """Desired finished length, expressed as an inclusive range of minutes.

    Drafting aims for the midpoint and treats the bounds as soft guardrails.
    Storing a range rather than a single point keeps long-form expectations
    honest without pretending an exact runtime can be hit.

    Both bounds must lie in ``1..MAX_DURATION_MINUTES`` and ``max_minutes``
    may not be smaller than ``min_minutes``. Unknown fields are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    min_minutes: int = Field(..., ge=1, le=MAX_DURATION_MINUTES)
    max_minutes: int = Field(..., ge=1, le=MAX_DURATION_MINUTES)

    @model_validator(mode="after")
    def _check_order(self) -> DurationTarget:
        """Reject a range whose lower bound exceeds its upper bound."""
        if self.min_minutes > self.max_minutes:
            raise ValueError("max_minutes must be >= min_minutes")
        return self

    @property
    def midpoint_minutes(self) -> float:
        """The runtime, in minutes, that drafting should aim for."""
        span_total = self.min_minutes + self.max_minutes
        return span_total / 2
|
||||
|
||||
|
||||
class PodcastSpec(BaseModel):
    """The complete brief, approved before any tokens or audio are spent.

    Holds the language, style, speakers, target duration and an optional
    ``focus`` steer. Unknown fields are rejected (``extra="forbid"``), and the
    speakers' ``slot`` values must be unique within the brief.
    """

    model_config = ConfigDict(extra="forbid")

    language: str = Field(
        ...,
        description="BCP-47 tag, e.g. 'en', 'en-US', 'pt-BR'",
    )
    style: PodcastStyle = PodcastStyle.CONVERSATIONAL
    speakers: list[SpeakerSpec] = Field(
        ...,
        min_length=1,
        max_length=MAX_SPEAKERS,
    )
    duration: DurationTarget
    focus: str | None = Field(
        default=None,
        max_length=2000,
        description="Optional user steer for what the episode should emphasise",
    )

    @field_validator("language")
    @classmethod
    def _normalise_language(cls, value: str) -> str:
        """Validate and canonicalise the tag via ``normalize_language_tag``."""
        return normalize_language_tag(value)

    @field_validator("focus")
    @classmethod
    def _blank_focus_is_none(cls, value: str | None) -> str | None:
        """Trim ``focus`` and collapse an empty result to ``None``."""
        if value is None:
            return None
        return value.strip() or None

    @model_validator(mode="after")
    def _check_speaker_slots(self) -> PodcastSpec:
        """Reject a brief in which two speakers share a ``slot``."""
        seen_slots: set[int] = set()
        for speaker in self.speakers:
            if speaker.slot in seen_slots:
                raise ValueError("speaker slots must be unique")
            seen_slots.add(speaker.slot)
        return self

    def speaker_for(self, slot: int) -> SpeakerSpec:
        """Return the speaker bound to ``slot``.

        Raises:
            KeyError: if no speaker in the brief uses ``slot``.
        """
        found = next((s for s in self.speakers if s.slot == slot), None)
        if found is None:
            raise KeyError(f"no speaker for slot {slot}")
        return found
|
||||
41
surfsense_backend/app/podcasts/schemas/transcript.py
Normal file
41
surfsense_backend/app/podcasts/schemas/transcript.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
"""The transcript: ordered dialogue turns drafting produces for review.
|
||||
|
||||
A :class:`Transcript` is the reviewable artifact at the go/no-go gate and the
|
||||
exact input the renderer turns into audio. Each turn names a speaker by the
|
||||
``slot`` defined in the :class:`~app.podcasts.schemas.spec.PodcastSpec`, so the
|
||||
renderer can resolve the right voice without re-attributing anything.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
||||
|
||||
|
||||
class TranscriptTurn(BaseModel):
    """One spoken line, attributed to a single speaker slot.

    ``text`` is stored with surrounding whitespace removed and may not be
    blank. Unknown fields are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    speaker: int = Field(
        ...,
        ge=0,
        description="The PodcastSpec speaker slot speaking",
    )
    text: str = Field(..., min_length=1)

    @field_validator("text")
    @classmethod
    def _strip_text(cls, value: str) -> str:
        """Trim surrounding whitespace and reject whitespace-only text."""
        trimmed = value.strip()
        if trimmed:
            return trimmed
        raise ValueError("turn text must not be blank")
|
||||
|
||||
|
||||
class Transcript(BaseModel):
    """The full ordered dialogue for an episode.

    Must contain at least one turn. Unknown fields are rejected
    (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    turns: list[TranscriptTurn] = Field(..., min_length=1)

    @property
    def word_count(self) -> int:
        """Total spoken words, used to estimate runtime against the brief.

        Words are counted per turn by splitting the text on whitespace.
        """
        total = 0
        for turn in self.turns:
            total += len(turn.text.split())
        return total
|
||||
Loading…
Add table
Add a link
Reference in a new issue