feat(podcasts): add brief and transcript schemas

This commit is contained in:
CREDO23 2026-06-10 18:44:03 +02:00
parent 8dd29fa833
commit 73e191af09
3 changed files with 221 additions and 0 deletions

View file

@ -0,0 +1,24 @@
"""Pydantic shapes for the podcast brief and transcript."""
from __future__ import annotations
from .spec import (
DurationTarget,
PodcastSpec,
PodcastStyle,
SpeakerRole,
SpeakerSpec,
normalize_language_tag,
)
from .transcript import Transcript, TranscriptTurn
__all__ = [
"DurationTarget",
"PodcastSpec",
"PodcastStyle",
"SpeakerRole",
"SpeakerSpec",
"Transcript",
"TranscriptTurn",
"normalize_language_tag",
]

View file

@ -0,0 +1,156 @@
"""The brief: the editable configuration a user approves before drafting.
A :class:`PodcastSpec` front-loads every decision that drives token or audio
cost (language, speakers, voices, style, target length) so the expensive
drafting and rendering steps run once against settled inputs. It is stored as
JSONB on the ``podcasts`` row and round-trips through the review API.
"""
from __future__ import annotations
import re
from enum import StrEnum
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
# A speaker count beyond this is almost never a real podcast and explodes the
# voice/turn-attribution space, so we reject it at the brief gate.
MAX_SPEAKERS = 6
# Long-form is a goal, but an open-ended upper bound invites runaway TTS bills.
# One day of audio is a generous ceiling that still blocks obvious mistakes.
MAX_DURATION_MINUTES = 24 * 60
# BCP-47 primary subtag plus optional region (e.g. ``en``, ``en-US``, ``pt-BR``).
# Kept deliberately permissive: the voice catalog, not the brief, decides which
# languages can actually be synthesised. Casing is normalised after matching.
_LANGUAGE_TAG = re.compile(r"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$")
def normalize_language_tag(value: str) -> str:
"""Validate and canonicalise a BCP-47 tag (lowercased primary subtag).
Shared with the generation layer so detected and user-entered languages are
normalised identically before they reach a :class:`PodcastSpec`.
"""
cleaned = value.strip()
if not _LANGUAGE_TAG.match(cleaned):
raise ValueError(f"not a valid BCP-47 language tag: {value!r}")
primary, _, rest = cleaned.partition("-")
return primary.lower() if not rest else f"{primary.lower()}-{rest}"
class SpeakerRole(StrEnum):
"""How a speaker functions in the conversation, used to steer drafting."""
HOST = "host"
COHOST = "cohost"
GUEST = "guest"
EXPERT = "expert"
NARRATOR = "narrator"
class PodcastStyle(StrEnum):
"""The conversational format the transcript should follow."""
CONVERSATIONAL = "conversational"
INTERVIEW = "interview"
DEBATE = "debate"
MONOLOGUE = "monologue"
NARRATIVE = "narrative"
class SpeakerSpec(BaseModel):
"""One voice in the podcast: who they are and which TTS voice renders them.
``slot`` is the stable join key. Transcript turns reference a speaker by
``slot`` and the renderer resolves ``voice_id`` for that same slot, so the
two never drift even if speakers are reordered in the brief.
"""
model_config = ConfigDict(extra="forbid")
slot: int = Field(..., ge=0, description="Stable index a transcript turn references")
name: str = Field(..., min_length=1, max_length=120)
role: SpeakerRole
voice_id: str = Field(
...,
min_length=1,
description="Catalog voice id valid for the spec's language and provider",
)
@field_validator("name", "voice_id")
@classmethod
def _strip_required_text(cls, value: str) -> str:
cleaned = value.strip()
if not cleaned:
raise ValueError("must not be blank")
return cleaned
class DurationTarget(BaseModel):
"""The desired finished length as an inclusive minute range.
Drafting aims for the midpoint and treats the bounds as soft guardrails;
storing a range (rather than a point) keeps long-form expectations honest
without pretending we can hit an exact runtime.
"""
model_config = ConfigDict(extra="forbid")
min_minutes: int = Field(..., ge=1, le=MAX_DURATION_MINUTES)
max_minutes: int = Field(..., ge=1, le=MAX_DURATION_MINUTES)
@model_validator(mode="after")
def _check_order(self) -> DurationTarget:
if self.max_minutes < self.min_minutes:
raise ValueError("max_minutes must be >= min_minutes")
return self
@property
def midpoint_minutes(self) -> float:
"""The runtime drafting should aim for within the range."""
return (self.min_minutes + self.max_minutes) / 2
class PodcastSpec(BaseModel):
"""The full brief approved before any tokens or audio are spent."""
model_config = ConfigDict(extra="forbid")
language: str = Field(..., description="BCP-47 tag, e.g. 'en', 'en-US', 'pt-BR'")
style: PodcastStyle = PodcastStyle.CONVERSATIONAL
speakers: list[SpeakerSpec] = Field(..., min_length=1, max_length=MAX_SPEAKERS)
duration: DurationTarget
focus: str | None = Field(
default=None,
max_length=2000,
description="Optional user steer for what the episode should emphasise",
)
@field_validator("language")
@classmethod
def _normalise_language(cls, value: str) -> str:
return normalize_language_tag(value)
@field_validator("focus")
@classmethod
def _blank_focus_is_none(cls, value: str | None) -> str | None:
if value is None:
return None
cleaned = value.strip()
return cleaned or None
@model_validator(mode="after")
def _check_speaker_slots(self) -> PodcastSpec:
slots = [speaker.slot for speaker in self.speakers]
if len(slots) != len(set(slots)):
raise ValueError("speaker slots must be unique")
return self
def speaker_for(self, slot: int) -> SpeakerSpec:
"""Return the speaker bound to ``slot`` or raise if none matches."""
for speaker in self.speakers:
if speaker.slot == slot:
return speaker
raise KeyError(f"no speaker for slot {slot}")

View file

@ -0,0 +1,41 @@
"""The transcript: ordered dialogue turns drafting produces for review.
A :class:`Transcript` is the reviewable artifact at the go/no-go gate and the
exact input the renderer turns into audio. Each turn names a speaker by the
``slot`` defined in the :class:`~app.podcasts.schemas.spec.PodcastSpec`, so the
renderer can resolve the right voice without re-attributing anything.
"""
from __future__ import annotations
from pydantic import BaseModel, ConfigDict, Field, field_validator
class TranscriptTurn(BaseModel):
"""A single spoken line by one speaker."""
model_config = ConfigDict(extra="forbid")
speaker: int = Field(..., ge=0, description="The PodcastSpec speaker slot speaking")
text: str = Field(..., min_length=1)
@field_validator("text")
@classmethod
def _strip_text(cls, value: str) -> str:
cleaned = value.strip()
if not cleaned:
raise ValueError("turn text must not be blank")
return cleaned
class Transcript(BaseModel):
"""The full ordered dialogue for an episode."""
model_config = ConfigDict(extra="forbid")
turns: list[TranscriptTurn] = Field(..., min_length=1)
@property
def word_count(self) -> int:
"""Total spoken words, used to estimate runtime against the brief."""
return sum(len(turn.text.split()) for turn in self.turns)