From a3386cd5f983263fdf925b494f053b2ce0504084 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Wed, 10 Jun 2026 18:44:03 +0200 Subject: [PATCH] feat(podcasts): add brief and transcript generation --- .../app/podcasts/generation/__init__.py | 20 +++ .../app/podcasts/generation/brief/__init__.py | 9 ++ .../app/podcasts/generation/brief/config.py | 31 ++++ .../podcasts/generation/brief/detection.py | 28 ++++ .../app/podcasts/generation/brief/graph.py | 27 ++++ .../app/podcasts/generation/brief/nodes.py | 153 ++++++++++++++++++ .../app/podcasts/generation/brief/state.py | 19 +++ .../podcasts/generation/prompts/__init__.py | 15 ++ .../generation/prompts/detect_language.py | 22 +++ .../generation/prompts/draft_segment.py | 54 +++++++ .../generation/prompts/plan_outline.py | 47 ++++++ .../podcasts/generation/prompts/speakers.py | 18 +++ .../app/podcasts/generation/structured.py | 49 ++++++ .../generation/transcript/__init__.py | 17 ++ .../podcasts/generation/transcript/config.py | 26 +++ .../podcasts/generation/transcript/graph.py | 29 ++++ .../podcasts/generation/transcript/nodes.py | 127 +++++++++++++++ .../generation/transcript/planning.py | 32 ++++ .../podcasts/generation/transcript/state.py | 22 +++ 19 files changed, 745 insertions(+) create mode 100644 surfsense_backend/app/podcasts/generation/__init__.py create mode 100644 surfsense_backend/app/podcasts/generation/brief/__init__.py create mode 100644 surfsense_backend/app/podcasts/generation/brief/config.py create mode 100644 surfsense_backend/app/podcasts/generation/brief/detection.py create mode 100644 surfsense_backend/app/podcasts/generation/brief/graph.py create mode 100644 surfsense_backend/app/podcasts/generation/brief/nodes.py create mode 100644 surfsense_backend/app/podcasts/generation/brief/state.py create mode 100644 surfsense_backend/app/podcasts/generation/prompts/__init__.py create mode 100644 surfsense_backend/app/podcasts/generation/prompts/detect_language.py create mode 100644 
surfsense_backend/app/podcasts/generation/prompts/draft_segment.py create mode 100644 surfsense_backend/app/podcasts/generation/prompts/plan_outline.py create mode 100644 surfsense_backend/app/podcasts/generation/prompts/speakers.py create mode 100644 surfsense_backend/app/podcasts/generation/structured.py create mode 100644 surfsense_backend/app/podcasts/generation/transcript/__init__.py create mode 100644 surfsense_backend/app/podcasts/generation/transcript/config.py create mode 100644 surfsense_backend/app/podcasts/generation/transcript/graph.py create mode 100644 surfsense_backend/app/podcasts/generation/transcript/nodes.py create mode 100644 surfsense_backend/app/podcasts/generation/transcript/planning.py create mode 100644 surfsense_backend/app/podcasts/generation/transcript/state.py diff --git a/surfsense_backend/app/podcasts/generation/__init__.py b/surfsense_backend/app/podcasts/generation/__init__.py new file mode 100644 index 000000000..30a2425b0 --- /dev/null +++ b/surfsense_backend/app/podcasts/generation/__init__.py @@ -0,0 +1,20 @@ +"""Generation: the LLM-driven brief and transcript controlled graphs. + +Two small graphs hold all the intelligence: ``brief`` proposes a reviewable spec +(language detection + resolution), and ``transcript`` drafts long-form dialogue +outline-first. Everything else in the podcast pipeline is deterministic. 
+""" + +from __future__ import annotations + +from .brief import BriefConfig, BriefState, build_brief_graph +from .transcript import TranscriptConfig, TranscriptState, build_transcript_graph + +__all__ = [ + "BriefConfig", + "BriefState", + "TranscriptConfig", + "TranscriptState", + "build_brief_graph", + "build_transcript_graph", +] diff --git a/surfsense_backend/app/podcasts/generation/brief/__init__.py b/surfsense_backend/app/podcasts/generation/brief/__init__.py new file mode 100644 index 000000000..0359a513d --- /dev/null +++ b/surfsense_backend/app/podcasts/generation/brief/__init__.py @@ -0,0 +1,9 @@ +"""Brief planning: propose a reviewable spec from weak signals.""" + +from __future__ import annotations + +from .config import BriefConfig +from .graph import build_brief_graph +from .state import BriefState + +__all__ = ["BriefConfig", "BriefState", "build_brief_graph"] diff --git a/surfsense_backend/app/podcasts/generation/brief/config.py b/surfsense_backend/app/podcasts/generation/brief/config.py new file mode 100644 index 000000000..a9f2f9dec --- /dev/null +++ b/surfsense_backend/app/podcasts/generation/brief/config.py @@ -0,0 +1,31 @@ +"""Configurable inputs for the brief-planning graph.""" + +from __future__ import annotations + +from dataclasses import dataclass, field, fields + +from langchain_core.runnables import RunnableConfig + +# Sensible defaults for a fresh brief; the user adjusts the range at the gate. 
DEFAULT_SPEAKER_COUNT = 2
DEFAULT_MIN_MINUTES = 10
DEFAULT_MAX_MINUTES = 20


@dataclass(kw_only=True)
class BriefConfig:
    """Non-LLM signals from which a podcast brief is proposed.

    Only ``search_space_id`` is mandatory; every other field has a default.
    """

    search_space_id: int
    speaker_count: int = DEFAULT_SPEAKER_COUNT
    min_minutes: int = DEFAULT_MIN_MINUTES
    max_minutes: int = DEFAULT_MAX_MINUTES
    focus: str | None = None
    last_used_language: str | None = None
    last_used_voices: list[str] = field(default_factory=list)

    @classmethod
    def from_runnable_config(cls, config: RunnableConfig | None = None) -> BriefConfig:
        """Build a config from the ``"configurable"`` mapping of ``config``.

        Keys that are not init fields of this dataclass are ignored. A missing
        required field surfaces as the ``TypeError`` raised by the constructor.
        """
        settings = (config or {}).get("configurable") or {}
        accepted = {spec.name for spec in fields(cls) if spec.init}
        kwargs = {}
        for key, value in settings.items():
            if key in accepted:
                kwargs[key] = value
        return cls(**kwargs)


class DetectedLanguage(BaseModel):
    """Reply shape of the language detector.

    ``language`` is whatever ``normalize_language_tag`` returns for the model's
    answer, or ``None`` when the model answered ``null`` or the answer was
    rejected with ``ValueError``. Turning a rejected answer into ``None`` lets a
    bad detection fall through to the rest of the resolution chain.
    """

    language: str | None = None

    @field_validator("language")
    @classmethod
    def _normalise(cls, value: str | None) -> str | None:
        """Normalise a detected tag; map unusable tags to ``None``."""
        if value is not None:
            try:
                return normalize_language_tag(value)
            except ValueError:
                # An unusable tag is treated exactly like "no detection".
                pass
        return None


def build_brief_graph():
    """Compile the brief graph: ``detect_language`` followed by ``propose_spec``."""
    builder = StateGraph(BriefState, config_schema=BriefConfig)

    for node_name, node in (
        ("detect_language", detect_language),
        ("propose_spec", propose_spec),
    ):
        builder.add_node(node_name, node)

    for source, target in (
        ("__start__", "detect_language"),
        ("detect_language", "propose_spec"),
        ("propose_spec", "__end__"),
    ):
        builder.add_edge(source, target)

    compiled = builder.compile()
    compiled.name = "Surfsense Podcast Brief"
    return compiled


# Module-level instance, built once at import time.
graph = build_brief_graph()
+""" + +from __future__ import annotations + +from typing import Any + +from langchain_core.messages import HumanMessage, SystemMessage +from langchain_core.runnables import RunnableConfig + +from app.config import config as app_config +from app.podcasts.resolution import ( + DEFAULT_LANGUAGE, + LanguageContext, + resolve_language, + resolve_voices, +) +from app.podcasts.schemas import ( + DurationTarget, + PodcastSpec, + PodcastStyle, + SpeakerRole, + SpeakerSpec, + normalize_language_tag, +) +from app.podcasts.voices import ( + VoiceCatalog, + TtsProvider, + get_voice_catalog, + provider_from_service, +) +from app.services.llm_service import get_agent_llm + +from ..prompts import detect_language_prompt +from ..structured import StructuredOutputError, invoke_json +from .config import BriefConfig +from .detection import DetectedLanguage +from .state import BriefState + +# Only the head of the source is needed to judge language; this caps tokens. +_DETECTION_SAMPLE_CHARS = 4000 + +# Default role per speaker slot; extra speakers beyond the list fall back to guest. 
_ROLE_BY_SLOT = (
    SpeakerRole.HOST,
    SpeakerRole.GUEST,
    SpeakerRole.EXPERT,
    SpeakerRole.COHOST,
    SpeakerRole.NARRATOR,
)


async def detect_language(
    state: BriefState, config: RunnableConfig
) -> dict[str, Any]:
    """Detect the source language; defer (``None``) on any uncertainty.

    Returns a state update ``{"detected_language": <tag or None>}``. The value
    is ``None`` when the source sample is empty, when no agent LLM is available
    for the search space, or when the model reply cannot be parsed.
    """
    brief = BriefConfig.from_runnable_config(config)

    # Look at the sample first: with nothing to inspect there is no reason to
    # resolve an LLM at all.
    sample = (state.source_content or "")[:_DETECTION_SAMPLE_CHARS].strip()
    if not sample:
        return {"detected_language": None}

    llm = await get_agent_llm(state.db_session, brief.search_space_id)
    if llm is None:
        return {"detected_language": None}

    messages = [
        SystemMessage(content=detect_language_prompt()),
        # NOTE(review): the sample is sent bare. If delimiters around the source
        # text were intended here, they are missing — confirm.
        HumanMessage(content=sample),
    ]
    try:
        detected = await invoke_json(llm, messages, DetectedLanguage)
    except StructuredOutputError:
        # An unparseable reply is not an error for the brief; just defer.
        return {"detected_language": None}
    return {"detected_language": detected.language}


def propose_spec(state: BriefState, config: RunnableConfig) -> dict[str, Any]:
    """Build a complete :class:`PodcastSpec` from the resolved defaults.

    Resolves the language, asks ``resolve_voices`` for voices, and creates one
    speaker per returned voice (slot = position in that list). Returns the
    state update ``{"spec": <PodcastSpec>}``.

    Raises:
        ValueError: if ``TTS_SERVICE`` is not configured.
    """
    brief = BriefConfig.from_runnable_config(config)
    provider = _active_provider()
    catalog = get_voice_catalog()

    language = _supported_language(
        detected=state.detected_language,
        last_used=brief.last_used_language,
        provider=provider,
        catalog=catalog,
    )
    voices = resolve_voices(
        catalog=catalog,
        provider=provider,
        language=language,
        speaker_count=brief.speaker_count,
        preferred=brief.last_used_voices,
    )

    speakers = [
        SpeakerSpec(
            slot=slot,
            name=_default_name(slot),
            role=_role_for(slot),
            voice_id=voice.voice_id,
        )
        for slot, voice in enumerate(voices)
    ]
    spec = PodcastSpec(
        language=language,
        style=PodcastStyle.CONVERSATIONAL,
        speakers=speakers,
        duration=DurationTarget(
            min_minutes=brief.min_minutes, max_minutes=brief.max_minutes
        ),
        focus=brief.focus,
    )
    return {"spec": spec}


def _active_provider() -> TtsProvider:
    """Map the configured ``TTS_SERVICE`` to a provider; raise if it is unset."""
    service = app_config.TTS_SERVICE
    if not service:
        raise ValueError("TTS_SERVICE is not configured")
    return provider_from_service(service)


def _supported_language(
    *,
    detected: str | None,
    last_used: str | None,
    provider: TtsProvider,
    catalog: VoiceCatalog,
) -> str:
    """Resolve the episode language, falling back to ``DEFAULT_LANGUAGE``.

    The fallback applies when the resolved tag fails normalisation or when the
    catalog reports that the provider does not support it.
    """
    raw = resolve_language(LanguageContext(detected=detected, last_used=last_used))
    try:
        language = normalize_language_tag(raw)
    except ValueError:
        language = DEFAULT_LANGUAGE
    if not catalog.supports_language(provider, language):
        return DEFAULT_LANGUAGE
    return language


def _role_for(slot: int) -> SpeakerRole:
    """Return the default role for ``slot``; slots past the table are guests."""
    return _ROLE_BY_SLOT[slot] if slot < len(_ROLE_BY_SLOT) else SpeakerRole.GUEST


def _default_name(slot: int) -> str:
    """Return a display name derived from the slot's role.

    The role value is title-cased (with ``cohost`` spelled ``co-host``). Slots
    beyond the role table get the slot number appended so names stay distinct.
    """
    role = _role_for(slot)
    label = role.value.replace("cohost", "co-host").title()
    return label if slot < len(_ROLE_BY_SLOT) else f"{label} {slot}"


@dataclass
class BriefState:
    """Runtime inputs and the proposed spec the graph produces."""

    # Inputs supplied by the caller.
    db_session: AsyncSession
    source_content: str
    # Written by ``detect_language``; ``None`` means "defer to other signals".
    detected_language: str | None = None
    # Written by ``propose_spec``.
    spec: PodcastSpec | None = None
# System prompt for the language detector. The reply contract at the end must
# stay in step with the ``DetectedLanguage`` shape: one ``language`` key holding
# a BCP-47 tag or null.
_SYSTEM = """\
You identify the dominant natural language of a piece of source content for a \
podcast that will be generated from it.

Rules:
- Report the language the listener-facing podcast should be spoken in, i.e. the \
language most of the meaningful prose is written in.
- Ignore code, markup, URLs, numbers, and proper nouns when judging.
- If the content is too short, ambiguous, mixed without a clear majority, or not \
natural-language prose, return null rather than guessing.

Respond with strict JSON and nothing else:
{"language": "<BCP-47 tag, e.g. en or pt-BR>"} or {"language": null}
"""


def detect_language_prompt() -> str:
    """Return the system prompt used to detect the source language."""
    return _SYSTEM
def draft_segment_prompt(
    *,
    spec: PodcastSpec,
    segment: OutlineSegment,
    position: int,
    total: int,
    recap: str | None,
) -> str:
    """Build the system prompt for drafting one outline segment as dialogue.

    Args:
        spec: Episode spec; its language, style and speaker roster are quoted.
        segment: The segment to draft (title, talking points, word target).
        position: Number of this segment as shown to the model ("segment
            ``position`` of ``total``").
        total: Number of segments in the outline.
        recap: Tail of the dialogue drafted so far, or a falsy value for the
            opening segment.

    Returns:
        The prompt text, ending with the JSON reply contract.
    """
    talking_points = "\n".join(f"- {point}" for point in segment.talking_points)
    recap_block = (
        f"\nRecap of the conversation so far (continue from here, do not repeat "
        f"it):\n{recap}\n"
        if recap
        else "\nThis is the opening segment; begin the conversation naturally.\n"
    )
    # The reply example names the speaker value explicitly: an example with an
    # empty value is not valid JSON and does not tell the model what to put there.
    return f"""\
You are scripting natural, engaging podcast dialogue for segment {position} of \
{total}.

Write entirely in {spec.language}. The format is {spec.style.value}.
Speakers — attribute every line using these exact slot numbers:
{render_speaker_roster(spec)}
{recap_block}
This segment is "{segment.title}". Cover these points using only facts grounded \
in the provided source content:
{talking_points}

Aim for about {segment.target_words} words of dialogue. Keep turns conversational \
and varied; speakers should react to each other rather than deliver monologues. \
Do not add greetings or sign-offs unless this is the first or last segment.

Respond with strict JSON and nothing else:
{{"turns": [{{"speaker": <slot number>, "text": "..."}}]}}
"""
def plan_outline_prompt(
    *,
    spec: PodcastSpec,
    target_words: int,
    suggested_segments: int,
    focus: str | None,
) -> str:
    """Build the system prompt that asks the model for an episode outline.

    The prompt quotes the episode language, style and speaker roster, states the
    overall word target and suggested segment count, optionally adds the user's
    focus, and ends with the JSON reply contract.
    """
    roster = render_speaker_roster(spec)
    if focus:
        focus_block = f"\nThe user asked the episode to focus on:\n{focus}\n"
    else:
        focus_block = ""
    return f"""\
You are a podcast showrunner planning the structure of an episode before any \
dialogue is written.

The episode language is {spec.language}. The format is {spec.style.value}.
Speakers (refer to them by these slots later):
{roster}
{focus_block}
Plan an outline that, when fully drafted, reaches roughly {target_words} words \
of spoken dialogue (about {suggested_segments} segments). Each segment is one \
coherent beat of the conversation: an opening, distinct topic areas grounded in \
the source content, and a closing.

For each segment provide:
- title: a short label for the beat
- talking_points: 2-5 concrete points to cover, drawn from the source content
- target_words: how many words of dialogue this segment should run (the sum \
across segments should approximate {target_words})

Respond with strict JSON and nothing else:
{{"segments": [{{"title": "...", "talking_points": ["..."], "target_words": 0}}]}}
"""
def render_speaker_roster(spec: PodcastSpec) -> str:
    """Format the spec's speakers as one bullet per line.

    Each line shows the speaker's slot, name and role value; an empty speaker
    list yields an empty string.
    """
    return "\n".join(
        f"- slot {member.slot} — {member.name} (role: {member.role.value})"
        for member in spec.speakers
    )
T = TypeVar("T", bound=BaseModel)


class StructuredOutputError(RuntimeError):
    """The model reply could not be parsed into the expected shape."""


def _json_object_candidates(content: str) -> list[str]:
    """Return the substrings of ``content`` worth validating as a JSON object.

    The first candidate is the span from the first ``{`` to the last ``}``.
    It is followed by every complete top-level JSON object found in the text,
    last one first, because a final answer normally follows any reasoning or
    preamble. The list is empty when the text holds no ``{...}`` span at all.
    """
    import json  # function-scope import: only this helper needs it

    candidates: list[str] = []
    start = content.find("{")
    end = content.rfind("}") + 1
    if 0 <= start < end:
        candidates.append(content[start:end])

    decoder = json.JSONDecoder()
    objects: list[str] = []
    position = start
    while position != -1:
        try:
            _, consumed = decoder.raw_decode(content[position:])
        except ValueError:
            # Nothing decodable starts here; resume at the next opening brace.
            position = content.find("{", position + 1)
        else:
            objects.append(content[position : position + consumed])
            # Skip the whole object so nested objects are not offered separately.
            position = content.find("{", position + consumed)

    for candidate in reversed(objects):
        if candidate not in candidates:
            candidates.append(candidate)
    return candidates


async def invoke_json(llm, messages: list[BaseMessage], model: type[T]) -> T:
    """Invoke ``llm`` and validate its reply as ``model``.

    The reply text is stripped of markdown fences and validated as-is. If that
    fails, each candidate from ``_json_object_candidates`` is tried in turn, so
    a reply that wraps the object in prose, or contains braces elsewhere, can
    still be parsed.

    Raises:
        StructuredOutputError: if no candidate validates, or the reply contains
            no JSON object at all.
    """
    response = await llm.ainvoke(messages)
    content = strip_markdown_fences(extract_text_content(response.content))

    try:
        return model.model_validate_json(content)
    except (ValidationError, ValueError):
        pass

    first_failure: Exception | None = None
    for candidate in _json_object_candidates(content):
        try:
            return model.model_validate_json(candidate)
        except (ValidationError, ValueError) as exc:
            # Keep the first failure as the reported cause.
            if first_failure is None:
                first_failure = exc

    if first_failure is not None:
        raise StructuredOutputError(
            f"could not parse {model.__name__} from model reply"
        ) from first_failure

    raise StructuredOutputError(
        f"no JSON object found for {model.__name__} in model reply"
    )
@dataclass(kw_only=True)
class TranscriptConfig:
    """The approved spec and optional user focus that drive drafting."""

    search_space_id: int
    spec: PodcastSpec
    focus: str | None = None

    @classmethod
    def from_runnable_config(
        cls, config: RunnableConfig | None = None
    ) -> TranscriptConfig:
        """Build a config from the ``"configurable"`` mapping of ``config``.

        Keys that are not init fields of this dataclass are ignored. A missing
        required field surfaces as the ``TypeError`` raised by the constructor.
        """
        settings = (config or {}).get("configurable") or {}
        accepted = {spec.name for spec in fields(cls) if spec.init}
        kwargs = {}
        for key, value in settings.items():
            if key in accepted:
                kwargs[key] = value
        return cls(**kwargs)


def build_transcript_graph():
    """Compile the transcript graph: plan, draft segments, then finalize."""
    builder = StateGraph(TranscriptState, config_schema=TranscriptConfig)

    for node_name, node in (
        ("plan_outline", plan_outline),
        ("draft_segments", draft_segments),
        ("finalize", finalize),
    ):
        builder.add_node(node_name, node)

    for source, target in (
        ("__start__", "plan_outline"),
        ("plan_outline", "draft_segments"),
        ("draft_segments", "finalize"),
        ("finalize", "__end__"),
    ):
        builder.add_edge(source, target)

    compiled = builder.compile()
    compiled.name = "Surfsense Podcast Transcript"
    return compiled


# Module-level instance, built once at import time.
graph = build_transcript_graph()

# Speaking rate used to turn the target duration into a word count.
_WORDS_PER_MINUTE = 150
# Approximate size of one outline segment; drives the suggested segment count.
_WORDS_PER_SEGMENT = 250
# Upper bound on source characters included in each LLM call.
_SOURCE_BUDGET_CHARS = 12000
# Number of trailing characters of prior dialogue replayed as the recap.
_RECAP_CHARS = 800


async def plan_outline(
    state: TranscriptState, config: RunnableConfig
) -> dict[str, Any]:
    """Ask the model for an outline sized to the spec's target duration.

    The word target is the duration midpoint times the speaking rate; the
    suggested segment count is that target divided by the per-segment size,
    never less than one. Returns the state update ``{"outline": <Outline>}``.
    """
    settings = TranscriptConfig.from_runnable_config(config)
    model = await _require_llm(state, settings)

    episode = settings.spec
    word_goal = round(episode.duration.midpoint_minutes * _WORDS_PER_MINUTE)
    segment_goal = max(1, round(word_goal / _WORDS_PER_SEGMENT))

    instructions = plan_outline_prompt(
        spec=episode,
        target_words=word_goal,
        suggested_segments=segment_goal,
        focus=settings.focus,
    )
    conversation = [
        SystemMessage(content=instructions),
        HumanMessage(content=_source_block(state.source_content)),
    ]
    return {"outline": await invoke_json(model, conversation, Outline)}


async def draft_segments(
    state: TranscriptState, config: RunnableConfig
) -> dict[str, Any]:
    """Draft every outline segment in order, feeding each one a recap.

    Segments are drafted one LLM call at a time because each prompt includes
    the tail of the dialogue produced so far. Turns attributed to unknown
    speaker slots are discarded. Returns ``{"drafted_turns": [...]}``.

    Raises:
        RuntimeError: if no LLM is configured or the state has no outline.
    """
    settings = TranscriptConfig.from_runnable_config(config)
    model = await _require_llm(state, settings)
    if state.outline is None:
        raise RuntimeError("draft_segments requires an outline")

    segments = state.outline.segments
    source_text = _source_block(state.source_content)
    segment_count = len(segments)
    dialogue: list[TranscriptTurn] = []

    for position, segment in enumerate(segments, start=1):
        instructions = draft_segment_prompt(
            spec=settings.spec,
            segment=segment,
            position=position,
            total=segment_count,
            recap=_recap(dialogue, settings.spec),
        )
        conversation = [
            SystemMessage(content=instructions),
            HumanMessage(content=source_text),
        ]
        draft = await invoke_json(model, conversation, SegmentDraft)
        dialogue += _valid_turns(draft, settings.spec)

    return {"drafted_turns": dialogue}


def finalize(state: TranscriptState, config: RunnableConfig) -> dict[str, Any]:
    """Wrap the drafted turns in a ``Transcript``; fail if there are none."""
    turns = state.drafted_turns
    if turns:
        return {"transcript": Transcript(turns=turns)}
    raise RuntimeError("drafting produced no usable dialogue")


async def _require_llm(state: TranscriptState, tc: TranscriptConfig):
    """Return the agent LLM for the search space, or raise if there is none."""
    model = await get_agent_llm(state.db_session, tc.search_space_id)
    if model is not None:
        return model
    raise RuntimeError(
        f"no agent LLM configured for search space {tc.search_space_id}"
    )


def _source_block(source_content: str) -> str:
    """Return the source text truncated to the per-call character budget."""
    return (source_content or "")[:_SOURCE_BUDGET_CHARS]


def _valid_turns(draft: SegmentDraft, spec: PodcastSpec) -> list[TranscriptTurn]:
    """Keep only the turns whose speaker is a slot defined by ``spec``."""
    allowed = {speaker.slot for speaker in spec.speakers}
    kept = []
    for turn in draft.turns:
        # A turn attributed to an undefined slot is dropped rather than passed on.
        if turn.speaker in allowed:
            kept.append(turn)
    return kept


def _recap(turns: list[TranscriptTurn], spec: PodcastSpec) -> str | None:
    """Render the dialogue so far and return its last ``_RECAP_CHARS`` characters.

    Returns ``None`` when nothing has been drafted yet. Each line is
    ``<speaker name>: <text>``; a slot without a name is shown as the raw slot.
    """
    if not turns:
        return None
    name_by_slot = {speaker.slot: speaker.name for speaker in spec.speakers}
    lines = []
    for turn in turns:
        label = name_by_slot.get(turn.speaker, turn.speaker)
        lines.append(f"{label}: {turn.text}")
    return "\n".join(lines)[-_RECAP_CHARS:]
class OutlineSegment(BaseModel):
    """A single planned beat of the episode, drafted on its own later."""

    # Non-empty label for the beat.
    title: str = Field(min_length=1)
    # Points to cover; may be empty.
    talking_points: list[str] = Field(default_factory=list)
    # Word target for this segment; must be at least 1.
    target_words: int = Field(ge=1)


class Outline(BaseModel):
    """The ordered plan for the episode; holds at least one segment."""

    segments: list[OutlineSegment] = Field(min_length=1)


class SegmentDraft(BaseModel):
    """The dialogue turns the model produced for one segment (may be empty)."""

    turns: list[TranscriptTurn] = Field(default_factory=list)


@dataclass
class TranscriptState:
    """Source content plus the artifacts produced while drafting."""

    # Inputs supplied by the caller.
    db_session: AsyncSession
    source_content: str
    # Filled in by the graph's nodes as drafting proceeds.
    outline: Outline | None = None
    drafted_turns: list[TranscriptTurn] = field(default_factory=list)
    transcript: Transcript | None = None