From 73e191af0905d030518c04615803ee7139521d99 Mon Sep 17 00:00:00 2001
From: CREDO23
Date: Wed, 10 Jun 2026 18:44:03 +0200
Subject: [PATCH] feat(podcasts): add brief and transcript schemas

---
 .../app/podcasts/schemas/__init__.py          |  24 +++
 .../app/podcasts/schemas/spec.py              | 156 ++++++++++++++++++
 .../app/podcasts/schemas/transcript.py        |  41 +++++
 3 files changed, 221 insertions(+)
 create mode 100644 surfsense_backend/app/podcasts/schemas/__init__.py
 create mode 100644 surfsense_backend/app/podcasts/schemas/spec.py
 create mode 100644 surfsense_backend/app/podcasts/schemas/transcript.py

diff --git a/surfsense_backend/app/podcasts/schemas/__init__.py b/surfsense_backend/app/podcasts/schemas/__init__.py
new file mode 100644
index 000000000..cd19a21cc
--- /dev/null
+++ b/surfsense_backend/app/podcasts/schemas/__init__.py
@@ -0,0 +1,24 @@
+"""Pydantic shapes for the podcast brief and transcript."""
+
+from __future__ import annotations
+
+from .spec import (
+    DurationTarget,
+    PodcastSpec,
+    PodcastStyle,
+    SpeakerRole,
+    SpeakerSpec,
+    normalize_language_tag,
+)
+from .transcript import Transcript, TranscriptTurn
+
+__all__ = [
+    "DurationTarget",
+    "PodcastSpec",
+    "PodcastStyle",
+    "SpeakerRole",
+    "SpeakerSpec",
+    "Transcript",
+    "TranscriptTurn",
+    "normalize_language_tag",
+]
diff --git a/surfsense_backend/app/podcasts/schemas/spec.py b/surfsense_backend/app/podcasts/schemas/spec.py
new file mode 100644
index 000000000..2d3b3c74e
--- /dev/null
+++ b/surfsense_backend/app/podcasts/schemas/spec.py
@@ -0,0 +1,156 @@
+"""The brief: the editable configuration a user approves before drafting.
+
+A :class:`PodcastSpec` front-loads every decision that drives token or audio
+cost (language, speakers, voices, style, target length) so the expensive
+drafting and rendering steps run once against settled inputs. It is stored as
+JSONB on the ``podcasts`` row and round-trips through the review API.
+"""
+
+from __future__ import annotations
+
+import re
+from enum import StrEnum
+
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+
+# A speaker count beyond this is almost never a real podcast and explodes the
+# voice/turn-attribution space, so we reject it at the brief gate.
+MAX_SPEAKERS = 6
+
+# Long-form is a goal, but an open-ended upper bound invites runaway TTS bills.
+# One day of audio is a generous ceiling that still blocks obvious mistakes.
+MAX_DURATION_MINUTES = 24 * 60
+
+# BCP-47 primary subtag plus optional region (e.g. ``en``, ``en-US``, ``pt-BR``).
+# Kept deliberately permissive: the voice catalog, not the brief, decides which
+# languages can actually be synthesised. Casing is normalised after matching.
+_LANGUAGE_TAG = re.compile(r"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$")
+
+
+def normalize_language_tag(value: str) -> str:
+    """Validate and canonicalise a BCP-47 tag (lowercased primary subtag).
+
+    Shared with the generation layer so detected and user-entered languages are
+    normalised identically before they reach a :class:`PodcastSpec`.
+    """
+    cleaned = value.strip()
+    if not _LANGUAGE_TAG.match(cleaned):
+        raise ValueError(f"not a valid BCP-47 language tag: {value!r}")
+    primary, _, rest = cleaned.partition("-")
+    return primary.lower() if not rest else f"{primary.lower()}-{rest}"
+
+
+class SpeakerRole(StrEnum):
+    """How a speaker functions in the conversation, used to steer drafting."""
+
+    HOST = "host"
+    COHOST = "cohost"
+    GUEST = "guest"
+    EXPERT = "expert"
+    NARRATOR = "narrator"
+
+
+class PodcastStyle(StrEnum):
+    """The conversational format the transcript should follow."""
+
+    CONVERSATIONAL = "conversational"
+    INTERVIEW = "interview"
+    DEBATE = "debate"
+    MONOLOGUE = "monologue"
+    NARRATIVE = "narrative"
+
+
+class SpeakerSpec(BaseModel):
+    """One voice in the podcast: who they are and which TTS voice renders them.
+
+    ``slot`` is the stable join key. Transcript turns reference a speaker by
+    ``slot`` and the renderer resolves ``voice_id`` for that same slot, so the
+    two never drift even if speakers are reordered in the brief.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    slot: int = Field(..., ge=0, description="Stable index a transcript turn references")
+    name: str = Field(..., min_length=1, max_length=120)
+    role: SpeakerRole
+    voice_id: str = Field(
+        ...,
+        min_length=1,
+        description="Catalog voice id valid for the spec's language and provider",
+    )
+
+    @field_validator("name", "voice_id")
+    @classmethod
+    def _strip_required_text(cls, value: str) -> str:
+        cleaned = value.strip()
+        if not cleaned:
+            raise ValueError("must not be blank")
+        return cleaned
+
+
+class DurationTarget(BaseModel):
+    """The desired finished length as an inclusive minute range.
+
+    Drafting aims for the midpoint and treats the bounds as soft guardrails;
+    storing a range (rather than a point) keeps long-form expectations honest
+    without pretending we can hit an exact runtime.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    min_minutes: int = Field(..., ge=1, le=MAX_DURATION_MINUTES)
+    max_minutes: int = Field(..., ge=1, le=MAX_DURATION_MINUTES)
+
+    @model_validator(mode="after")
+    def _check_order(self) -> DurationTarget:
+        if self.max_minutes < self.min_minutes:
+            raise ValueError("max_minutes must be >= min_minutes")
+        return self
+
+    @property
+    def midpoint_minutes(self) -> float:
+        """The runtime drafting should aim for within the range."""
+        return (self.min_minutes + self.max_minutes) / 2
+
+
+class PodcastSpec(BaseModel):
+    """The full brief approved before any tokens or audio are spent."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    language: str = Field(..., description="BCP-47 tag, e.g. 'en', 'en-US', 'pt-BR'")
+    style: PodcastStyle = PodcastStyle.CONVERSATIONAL
+    speakers: list[SpeakerSpec] = Field(..., min_length=1, max_length=MAX_SPEAKERS)
+    duration: DurationTarget
+    focus: str | None = Field(
+        default=None,
+        max_length=2000,
+        description="Optional user steer for what the episode should emphasise",
+    )
+
+    @field_validator("language")
+    @classmethod
+    def _normalise_language(cls, value: str) -> str:
+        return normalize_language_tag(value)
+
+    @field_validator("focus")
+    @classmethod
+    def _blank_focus_is_none(cls, value: str | None) -> str | None:
+        if value is None:
+            return None
+        cleaned = value.strip()
+        return cleaned or None
+
+    @model_validator(mode="after")
+    def _check_speaker_slots(self) -> PodcastSpec:
+        slots = [speaker.slot for speaker in self.speakers]
+        if len(slots) != len(set(slots)):
+            raise ValueError("speaker slots must be unique")
+        return self
+
+    def speaker_for(self, slot: int) -> SpeakerSpec:
+        """Return the speaker bound to ``slot`` or raise if none matches."""
+        for speaker in self.speakers:
+            if speaker.slot == slot:
+                return speaker
+        raise KeyError(f"no speaker for slot {slot}")
diff --git a/surfsense_backend/app/podcasts/schemas/transcript.py b/surfsense_backend/app/podcasts/schemas/transcript.py
new file mode 100644
index 000000000..b4c1463d8
--- /dev/null
+++ b/surfsense_backend/app/podcasts/schemas/transcript.py
@@ -0,0 +1,41 @@
+"""The transcript: ordered dialogue turns drafting produces for review.
+
+A :class:`Transcript` is the reviewable artifact at the go/no-go gate and the
+exact input the renderer turns into audio. Each turn names a speaker by the
+``slot`` defined in the :class:`~app.podcasts.schemas.spec.PodcastSpec`, so the
+renderer can resolve the right voice without re-attributing anything.
+"""
+
+from __future__ import annotations
+
+from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+
+class TranscriptTurn(BaseModel):
+    """A single spoken line by one speaker."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    speaker: int = Field(..., ge=0, description="The PodcastSpec speaker slot speaking")
+    text: str = Field(..., min_length=1)
+
+    @field_validator("text")
+    @classmethod
+    def _strip_text(cls, value: str) -> str:
+        cleaned = value.strip()
+        if not cleaned:
+            raise ValueError("turn text must not be blank")
+        return cleaned
+
+
+class Transcript(BaseModel):
+    """The full ordered dialogue for an episode."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    turns: list[TranscriptTurn] = Field(..., min_length=1)
+
+    @property
+    def word_count(self) -> int:
+        """Total spoken words, used to estimate runtime against the brief."""
+        return sum(len(turn.text.split()) for turn in self.turns)