Merge pull request #1479 from CREDO23/improvement-podcast-graph

[Feat] Podcast: Rework generation into a lifecycle-driven module with multi-language support
This commit is contained in:
Rohan Verma 2026-06-11 14:23:38 -07:00 committed by GitHub
commit 8f80900ab0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
126 changed files with 6702 additions and 2163 deletions

View file

@ -6,7 +6,7 @@ data/
__pycache__/
.flashrank_cache
surf_new_backend.egg-info/
podcasts/
/podcasts/
video_presentation_audio/
sandbox_files/
temp_audio/

View file

@ -0,0 +1,92 @@
"""evolve podcasts: expand status lifecycle and add brief/transcript/storage columns
Revision ID: 158
Revises: 157
"""
from collections.abc import Sequence
from alembic import op
revision: str = "158"
down_revision: str | None = "157"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Expand ``podcast_status`` to the full lifecycle and add the new columns.

    The enum is retyped by renaming the existing type out of the way,
    creating a fresh ``podcast_status`` with the eight lifecycle values, and
    casting the ``podcasts.status`` column onto it. Rows holding the legacy
    value ``'generating'`` are mapped to ``'rendering'``; every other value is
    carried over unchanged. The column default becomes ``'pending'``.

    Afterwards the brief/transcript/storage columns are added; each uses
    ``IF NOT EXISTS`` so re-running against a partially migrated table is
    tolerated.
    """
    # Order matters: the column default is dropped before the type change so
    # it is not bound to the old type during the cast, and the old type can
    # only be dropped once the column no longer references it.
    retype_status = (
        "ALTER TYPE podcast_status RENAME TO podcast_status_old;",
        """
        CREATE TYPE podcast_status AS ENUM (
            'pending', 'awaiting_brief', 'drafting', 'awaiting_review',
            'rendering', 'ready', 'failed', 'cancelled'
        );
        """,
        "ALTER TABLE podcasts ALTER COLUMN status DROP DEFAULT;",
        """
        ALTER TABLE podcasts
            ALTER COLUMN status TYPE podcast_status
            USING (
                CASE status::text
                    WHEN 'generating' THEN 'rendering'
                    ELSE status::text
                END
            )::podcast_status;
        """,
        "ALTER TABLE podcasts ALTER COLUMN status SET DEFAULT 'pending';",
        "DROP TYPE podcast_status_old;",
    )

    add_columns = (
        "ALTER TABLE podcasts ADD COLUMN IF NOT EXISTS source_content TEXT;",
        "ALTER TABLE podcasts ADD COLUMN IF NOT EXISTS spec JSONB;",
        (
            "ALTER TABLE podcasts ADD COLUMN IF NOT EXISTS spec_version "
            "INTEGER NOT NULL DEFAULT 1;"
        ),
        "ALTER TABLE podcasts ADD COLUMN IF NOT EXISTS storage_backend VARCHAR(32);",
        "ALTER TABLE podcasts ADD COLUMN IF NOT EXISTS storage_key TEXT;",
        "ALTER TABLE podcasts ADD COLUMN IF NOT EXISTS duration_seconds INTEGER;",
        "ALTER TABLE podcasts ADD COLUMN IF NOT EXISTS error TEXT;",
    )

    for statement in (*retype_status, *add_columns):
        op.execute(statement)
def downgrade() -> None:
    """Drop the lifecycle columns and collapse the status enum to four values.

    The added columns are removed in reverse order of their creation. The
    enum is then retyped back to ``('pending', 'generating', 'ready',
    'failed')`` with this mapping for values that no longer exist:

    * ``awaiting_brief`` -> ``pending``
    * ``drafting``, ``awaiting_review``, ``rendering`` -> ``generating``
    * ``cancelled`` -> ``failed``

    The column default is set to ``'ready'``.
    """
    drop_columns = tuple(
        f"ALTER TABLE podcasts DROP COLUMN IF EXISTS {column};"
        for column in (
            "error",
            "duration_seconds",
            "storage_key",
            "storage_backend",
            "spec_version",
            "spec",
            "source_content",
        )
    )

    # Collapse the expanded lifecycle back onto the original four values.
    retype_status = (
        "ALTER TYPE podcast_status RENAME TO podcast_status_new;",
        (
            "CREATE TYPE podcast_status AS ENUM "
            "('pending', 'generating', 'ready', 'failed');"
        ),
        "ALTER TABLE podcasts ALTER COLUMN status DROP DEFAULT;",
        """
        ALTER TABLE podcasts
            ALTER COLUMN status TYPE podcast_status
            USING (
                CASE status::text
                    WHEN 'awaiting_brief' THEN 'pending'
                    WHEN 'drafting' THEN 'generating'
                    WHEN 'awaiting_review' THEN 'generating'
                    WHEN 'rendering' THEN 'generating'
                    WHEN 'cancelled' THEN 'failed'
                    ELSE status::text
                END
            )::podcast_status;
        """,
        "ALTER TABLE podcasts ALTER COLUMN status SET DEFAULT 'ready';",
        "DROP TYPE podcast_status_new;",
    )

    for statement in (*drop_columns, *retype_status):
        op.execute(statement)

View file

@ -0,0 +1,26 @@
"""publish podcasts to zero_publication
Reconciles ``zero_publication`` after migration 158 added the lifecycle columns,
so the frontend observes podcast status and the reviewable brief by push.
Revision ID: 159
Revises: 158
"""
from collections.abc import Sequence
from alembic import op
from app.zero_publication import apply_publication
revision: str = "159"
down_revision: str | None = "158"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Apply the publication using the connection Alembic is migrating on."""
    # NOTE(review): apply_publication is defined in app.zero_publication; per
    # this module's docstring it reconciles ``zero_publication`` -- confirm
    # its exact effect there.
    connection = op.get_bind()
    apply_publication(connection)
def downgrade() -> None:
    """No-op. Historical publication shapes are immutable.

    The function body is intentionally empty: downgrading past this revision
    leaves the publication exactly as the upgrade left it.
    """

View file

@ -126,23 +126,25 @@ user: "Create issues in Linear for each of these five bugs: <list>"
<example>
user: "Make a 30-second podcast of this conversation."
Celery-backed deliverable. The `deliverables` subagent dispatches the
Celery job and then **waits for it to finish** before returning. The
call may take 10-60 seconds (or longer for video presentations) —
that is intentional, not a hang. You always get back one of two
Receipt shapes:
Podcast deliverable. The `deliverables` subagent sets the podcast up and
returns **immediately** — generation does not happen during the call. A
live card in the chat takes over from there: the user reviews the brief
(language, voices, length) on the card, and the episode drafts and
renders automatically after they approve.
task(deliverables, "Generate a podcast titled '<title>' from the
following content. Use a 30-second style brief. Return the podcast
id and title.\n\n<source content>")
following content. Aim for a 30-second style brief. Return the
podcast id and title.\n\n<source content>")
Outcomes:
- **`status="success"`**: the audio is saved. Tell the user the
podcast is **ready** and quote the `external_id` / `preview` so
they can find it in the podcast panel.
- **`status="success"`**: the podcast is set up. Do NOT describe its
current status or promise it is ready — the card tracks progress
live and will outlive whatever you say. Just point the user at the
card in the chat.
- **`status="failed"`**: surface the Receipt's `error` field
verbatim. Do NOT silently re-dispatch — the backend already tried
and reported a real error.
Same two-way pattern applies to video presentations (which take
longer to render, but still return a terminal status). If a
Video presentations differ: that Celery-backed call **waits for the
render to finish** before returning (possibly minutes — intentional,
not a hang) and ends with a terminal status. If a
`task(deliverables, ...)` invocation itself times out at the subagent
layer (separate from the Receipt), that's an operator-side problem
with the subagent invoke timeout, not a deliverable failure — pass

View file

@ -1,11 +1,10 @@
"""Factory for a podcast-generation tool.
Dispatches the heavy generation to Celery and then polls the podcast row
until it reaches a terminal status (READY/FAILED). The tool always
returns a real terminal ``Receipt`` never a pending one. The wait is
bounded by the existing per-invocation safety net
(``SURFSENSE_SUBAGENT_INVOKE_TIMEOUT_SECONDS`` in multi-agent mode,
HTTP / process lifetime in single-agent mode).
Creates the podcast and proposes its brief (language, voices, length) inline,
then returns immediately with the row awaiting review. Everything after that
(brief approval, drafting, rendering) happens on the live podcast card, so
this tool never blocks on generation, and the chat text must not describe a
status that the card will outgrow.
"""
import logging
@ -18,13 +17,12 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.agents.chat.multi_agent_chat.shared.receipts.command import with_receipt
from app.agents.chat.multi_agent_chat.shared.receipts.receipt import make_receipt
from app.agents.chat.multi_agent_chat.subagents.builtins.deliverables.deliverable_wait import (
wait_for_deliverable,
)
from app.agents.chat.multi_agent_chat.subagents.builtins.deliverables.tools.thread_resolver import (
resolve_root_thread_id,
)
from app.db import Podcast, PodcastStatus, shielded_async_session
from app.db import PodcastStatus, shielded_async_session
from app.podcasts.generation.brief import propose_brief
from app.podcasts.service import PodcastService
logger = logging.getLogger(__name__)
@ -45,7 +43,7 @@ def create_generate_podcast_tool(
user_prompt: str | None = None,
) -> Command:
"""
Generate a podcast from the provided content.
Prepare a podcast from the provided content for the user to review.
Use this tool when the user asks to create, generate, or make a podcast.
Common triggers include phrases like:
@ -55,100 +53,59 @@ def create_generate_podcast_tool(
- "Make a podcast about..."
- "Turn this into a podcast"
This sets up the podcast and proposes its brief (language, voices,
length). The user reviews the brief on the live podcast card in the
chat; after approval the episode drafts and renders automatically.
Generation does not start here, and the card tracks all progress do
not describe the podcast's current status in your reply.
Args:
source_content: The text content to convert into a podcast.
podcast_title: Title for the podcast (default: "SurfSense Podcast")
user_prompt: Optional instructions for podcast style, tone, or format.
user_prompt: Optional steer for what the episode should focus on.
Returns:
A dictionary containing:
- status: PodcastStatus value (pending, generating, or failed)
- podcast_id: The podcast ID for polling (when status is pending or generating)
- title: The podcast title
- message: Status message (or "error" field if status is failed)
- status: the podcast lifecycle status (awaiting_brief on success)
- podcast_id: the podcast ID to review in the panel
- title: the podcast title
- message: what the user should do next (or "error" when failed)
"""
try:
# One DB session per tool call so parallel invocations never share an AsyncSession.
async with shielded_async_session() as session:
podcast = Podcast(
service = PodcastService(session)
podcast = await service.create(
title=podcast_title,
status=PodcastStatus.PENDING,
search_space_id=search_space_id,
thread_id=resolve_root_thread_id(runtime, thread_id),
)
session.add(podcast)
podcast.source_content = source_content
spec = await propose_brief(
session,
search_space_id=search_space_id,
focus=user_prompt,
)
await service.attach_brief(podcast, spec)
await session.commit()
await session.refresh(podcast)
podcast_id = podcast.id
from app.tasks.celery_tasks.podcast_tasks import (
generate_content_podcast_task,
)
task = generate_content_podcast_task.delay(
podcast_id=podcast_id,
source_content=source_content,
search_space_id=search_space_id,
user_prompt=user_prompt,
)
logger.info(
"[generate_podcast] Created podcast %s, task: %s",
"[generate_podcast] Prepared podcast %s awaiting brief review",
podcast_id,
task.id,
)
# Wait until the Celery worker flips the row to a terminal
# state. The wait is bounded only by the subagent invoke
# timeout (multi-agent) or HTTP lifetime (single-agent) —
# see app.agents.chat.multi_agent_chat.subagents.builtins.deliverables.deliverable_wait for details.
terminal_status, columns, elapsed = await wait_for_deliverable(
model=Podcast,
row_id=podcast_id,
columns=[Podcast.status, Podcast.file_location],
terminal_statuses={PodcastStatus.READY, PodcastStatus.FAILED},
)
if terminal_status == PodcastStatus.READY:
file_location = columns[1] if columns else None
logger.info(
"[generate_podcast] Podcast %s READY in %.2fs (file=%s)",
podcast_id,
elapsed,
file_location,
)
payload: dict[str, Any] = {
"status": PodcastStatus.READY.value,
"podcast_id": podcast_id,
"title": podcast_title,
"file_location": file_location,
"message": ("Podcast generated and saved to your podcast panel."),
}
return with_receipt(
payload=payload,
receipt=make_receipt(
route="deliverables",
type="podcast",
operation="generate",
status="success",
external_id=str(podcast_id),
preview=podcast_title,
),
tool_call_id=runtime.tool_call_id,
)
# Only other terminal state is FAILED.
logger.warning(
"[generate_podcast] Podcast %s FAILED in %.2fs",
podcast_id,
elapsed,
)
err = "Background worker reported FAILED status for this podcast."
payload = {
"status": PodcastStatus.FAILED.value,
payload: dict[str, Any] = {
"status": PodcastStatus.AWAITING_BRIEF.value,
"podcast_id": podcast_id,
"title": podcast_title,
"error": err,
"message": (
"Podcast set up. The card in the chat handles the rest: "
"the user reviews the brief (language, voices, length) "
"there, and the episode drafts and renders automatically "
"after approval. The card tracks progress live, so do not "
"state the podcast's current status in your reply."
),
}
return with_receipt(
payload=payload,
@ -156,10 +113,9 @@ def create_generate_podcast_tool(
route="deliverables",
type="podcast",
operation="generate",
status="failed",
status="success",
external_id=str(podcast_id),
preview=podcast_title,
error=err,
),
tool_call_id=runtime.tool_call_id,
)

View file

@ -1,8 +0,0 @@
"""New LangGraph Agent.
This module defines a custom graph.
"""
from .graph import graph
__all__ = ["graph"]

View file

@ -1,29 +0,0 @@
"""Define the configurable parameters for the agent."""
from __future__ import annotations
from dataclasses import dataclass, fields
from langchain_core.runnables import RunnableConfig
@dataclass(kw_only=True)
class Configuration:
"""The configuration for the agent."""
# Changeme: Add configurable values here!
# these values can be pre-set when you
# create assistants (https://langchain-ai.github.io/langgraph/cloud/how-tos/configuration_cloud/)
# and when you invoke the graph
podcast_title: str
search_space_id: int
user_prompt: str | None = None
@classmethod
def from_runnable_config(
cls, config: RunnableConfig | None = None
) -> Configuration:
"""Create a Configuration instance from a RunnableConfig object."""
configurable = (config.get("configurable") or {}) if config else {}
_fields = {f.name for f in fields(cls) if f.init}
return cls(**{k: v for k, v in configurable.items() if k in _fields})

View file

@ -1,29 +0,0 @@
from langgraph.graph import StateGraph
from .configuration import Configuration
from .nodes import create_merged_podcast_audio, create_podcast_transcript
from .state import State
def build_graph():
    """Assemble and compile the two-step podcast workflow.

    The graph is a straight line: ``__start__`` ->
    ``create_podcast_transcript`` -> ``create_merged_podcast_audio`` ->
    ``__end__``. The compiled graph is given the name "Surfsense Podcaster".
    """
    builder = StateGraph(State, config_schema=Configuration)

    # Register the two processing nodes.
    builder.add_node("create_podcast_transcript", create_podcast_transcript)
    builder.add_node("create_merged_podcast_audio", create_merged_podcast_audio)

    # Chain each stage to the next one, from the entry point to the end.
    pipeline = (
        "__start__",
        "create_podcast_transcript",
        "create_merged_podcast_audio",
        "__end__",
    )
    for source, target in zip(pipeline, pipeline[1:]):
        builder.add_edge(source, target)

    compiled = builder.compile()
    compiled.name = "Surfsense Podcaster"  # This defines the custom name in LangSmith
    return compiled


# Built a single time, when this module is first imported.
graph = build_graph()

View file

@ -1,195 +0,0 @@
import asyncio
import json
import os
import uuid
from pathlib import Path
from typing import Any
from ffmpeg.asyncio import FFmpeg
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.runnables import RunnableConfig
from litellm import aspeech
from app.config import config as app_config
from app.services.kokoro_tts_service import get_kokoro_tts_service
from app.services.llm_service import get_agent_llm
from app.utils.content_utils import extract_text_content, strip_markdown_fences
from .configuration import Configuration
from .prompts import get_podcast_generation_prompt
from .state import PodcastTranscriptEntry, PodcastTranscripts, State
from .utils import get_voice_for_provider
async def create_podcast_transcript(
    state: State, config: RunnableConfig
) -> dict[str, Any]:
    """Ask the search space's agent LLM for a transcript and parse its reply.

    The system prompt comes from ``get_podcast_generation_prompt`` (with the
    configured ``user_prompt``); the source content is sent as a human
    message wrapped in ``<source_content>`` tags. The reply is first parsed
    as JSON directly; if that fails, the text between the first ``{`` and the
    last ``}`` is parsed instead.

    Returns:
        ``{"podcast_transcript": <list of transcript entries>}``.

    Raises:
        RuntimeError: no agent LLM is configured for the search space.
        json.JSONDecodeError, TypeError, ValueError: the reply could not be
            parsed or validated even with the fallback extraction.
    """
    settings = Configuration.from_runnable_config(config)
    search_space_id = settings.search_space_id
    user_prompt = settings.user_prompt

    llm = await get_agent_llm(state.db_session, search_space_id)
    if not llm:
        error_message = f"No agent LLM configured for search space {search_space_id}"
        print(error_message)
        raise RuntimeError(error_message)

    system_prompt = get_podcast_generation_prompt(user_prompt)
    source_block = f"<source_content>{state.source_content}</source_content>"
    llm_response = await llm.ainvoke(
        [
            SystemMessage(content=system_prompt),
            HumanMessage(content=source_block),
        ]
    )

    # Reasoning models may return content as blocks; normalise to a string.
    content = strip_markdown_fences(extract_text_content(llm_response.content))

    parse_errors = (json.JSONDecodeError, TypeError, ValueError)
    try:
        podcast_transcript = PodcastTranscripts.model_validate(json.loads(content))
    except parse_errors as direct_error:
        print(f"Direct JSON parsing failed, trying fallback approach: {direct_error!s}")
        try:
            # Fallback: take the outermost {...} span and parse only that.
            json_start = content.find("{")
            json_end = content.rfind("}") + 1
            if not (json_start >= 0 and json_end > json_start):
                error_message = f"Could not find valid JSON in LLM response. Raw response: {content}"
                print(error_message)
                raise ValueError(error_message)

            parsed_data = json.loads(content[json_start:json_end])
            podcast_transcript = PodcastTranscripts.model_validate(parsed_data)
            print("Successfully parsed podcast transcript using fallback approach")
        except parse_errors as fallback_error:
            print(f"Error parsing LLM response: {fallback_error!s}")
            print(f"Raw response: {content}")
            raise

    return {"podcast_transcript": podcast_transcript.podcast_transcripts}
async def create_merged_podcast_audio(
    state: State, config: RunnableConfig
) -> dict[str, Any]:
    """Synthesize every transcript entry and concatenate them into one MP3.

    A fixed intro line ("Welcome to Surfsense Podcast.", speaker 1) is
    prepended to the transcript. Each entry is rendered to its own file
    under ``temp_audio/`` concurrently, the files are concatenated in
    transcript order with FFmpeg into ``podcasts/<uuid>_podcast.mp3``, and
    the per-segment files are removed afterwards.

    Returns:
        ``{"podcast_transcript": <intro + entries>,
        "final_podcast_file_path": <output path>}``.

    Raises:
        Exception: whatever the TTS call or the FFmpeg merge raises; errors
            are printed and re-raised unchanged.
    """
    intro_entry = PodcastTranscriptEntry(
        speaker_id=1, dialog="Welcome to Surfsense Podcast."
    )

    # The transcript may be a PodcastTranscripts object or already a list.
    raw_transcript = state.podcast_transcript
    transcript_entries = getattr(raw_transcript, "podcast_transcripts", raw_transcript)
    merged_transcript = [intro_entry, *transcript_entries]

    temp_dir = Path("temp_audio")
    temp_dir.mkdir(exist_ok=True)

    session_id = str(uuid.uuid4())
    output_path = f"podcasts/{session_id}_podcast.mp3"
    os.makedirs("podcasts", exist_ok=True)

    async def generate_speech_for_segment(segment, index):
        # Entries may be model objects or plain dicts.
        if hasattr(segment, "speaker_id"):
            speaker_id, dialog = segment.speaker_id, segment.dialog
        else:
            speaker_id = segment.get("speaker_id", 0)
            dialog = segment.get("dialog", "")

        voice = get_voice_for_provider(app_config.TTS_SERVICE, speaker_id)

        # Kokoro output is written as .wav; every other provider as .mp3.
        use_kokoro = app_config.TTS_SERVICE == "local/kokoro"
        extension = "wav" if use_kokoro else "mp3"
        filename = f"{temp_dir}/{session_id}_{index}.{extension}"

        try:
            if use_kokoro:
                kokoro_service = await get_kokoro_tts_service(
                    lang_code="a"
                )  # American English
                return await kokoro_service.generate_speech(
                    text=dialog, voice=voice, speed=1.0, output_path=filename
                )

            speech_kwargs = {
                "model": app_config.TTS_SERVICE,
                "api_key": app_config.TTS_SERVICE_API_KEY,
                "voice": voice,
                "input": dialog,
                "max_retries": 2,
                "timeout": 600,
            }
            # api_base is only forwarded when one is configured.
            if app_config.TTS_SERVICE_API_BASE:
                speech_kwargs["api_base"] = app_config.TTS_SERVICE_API_BASE
            response = await aspeech(**speech_kwargs)

            with open(filename, "wb") as f:
                f.write(response.content)
            return filename
        except Exception as e:
            print(f"Error generating speech for segment {index}: {e!s}")
            raise

    # NOTE(review): if any segment fails here, files already written by the
    # other segments are not cleaned up (the cleanup below only runs around
    # the merge step) -- confirm whether that is acceptable.
    audio_files = await asyncio.gather(
        *(
            generate_speech_for_segment(segment, position)
            for position, segment in enumerate(merged_transcript)
        )
    )

    try:
        ffmpeg = FFmpeg().option("y")
        for audio_file in audio_files:
            ffmpeg = ffmpeg.input(audio_file)

        # One "[i:0]" label per input, followed by the concat filter.
        segment_count = len(audio_files)
        input_labels = "".join(f"[{i}:0]" for i in range(segment_count))
        filter_complex_str = f"{input_labels}concat=n={segment_count}:v=0:a=1[outa]"

        ffmpeg = ffmpeg.option("filter_complex", filter_complex_str)
        ffmpeg = ffmpeg.output(output_path, map="[outa]")
        await ffmpeg.execute()
        print(f"Successfully created podcast audio: {output_path}")
    except Exception as e:
        print(f"Error merging audio files: {e!s}")
        raise
    finally:
        # Best-effort removal of the per-segment files, success or failure.
        for audio_file in audio_files:
            try:
                os.remove(audio_file)
            except Exception as e:
                print(f"Error removing audio file {audio_file}: {e!s}")

    return {
        "podcast_transcript": merged_transcript,
        "final_podcast_file_path": output_path,
    }

View file

@ -1,122 +0,0 @@
import datetime
def get_podcast_generation_prompt(user_prompt: str | None = None) -> str:
    """Return the system prompt for generating a two-host podcast transcript.

    The prompt is stamped with today's date (from ``datetime.datetime.now()``)
    and asks for a JSON object with a ``podcast_transcripts`` list of
    ``{"speaker_id", "dialog"}`` entries. When ``user_prompt`` is truthy it is
    embedded in a ``<user_instruction>`` section; otherwise that section is
    omitted.

    NOTE(review): the guidelines ask for roughly 6 minutes (~1000 words)
    while the closing instruction says "within a 5-minute audio duration" --
    confirm which target is intended.
    """
    return f"""
Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
<podcast_generation_system>
You are a master podcast scriptwriter, adept at transforming diverse input content into a lively, engaging, and natural-sounding conversation between two distinct podcast hosts. Your primary objective is to craft authentic, flowing dialogue that captures the spontaneity and chemistry of a real podcast discussion, completely avoiding any hint of robotic scripting or stiff formality. Think dynamic interplay, not just information delivery.
{
f'''
You **MUST** strictly adhere to the following user instruction while generating the podcast script:
<user_instruction>
{user_prompt}
</user_instruction>
'''
if user_prompt
else ""
}
<input>
- '<source_content>': A block of text containing the information to be discussed in the podcast. This could be research findings, an article summary, a detailed outline, user chat history related to the topic, or any other relevant raw information. The content might be unstructured but serves as the factual basis for the podcast dialogue.
</input>
<output_format>
A JSON object containing the podcast transcript with alternating speakers:
{{
"podcast_transcripts": [
{{
"speaker_id": 0,
"dialog": "Speaker 0 dialog here"
}},
{{
"speaker_id": 1,
"dialog": "Speaker 1 dialog here"
}},
{{
"speaker_id": 0,
"dialog": "Speaker 0 dialog here"
}},
{{
"speaker_id": 1,
"dialog": "Speaker 1 dialog here"
}}
]
}}
</output_format>
<guidelines>
1. **Establish Distinct & Consistent Host Personas:**
* **Speaker 0 (Lead Host):** Drives the conversation forward, introduces segments, poses key questions derived from the source content, and often summarizes takeaways. Maintain a guiding, clear, and engaging tone.
* **Speaker 1 (Co-Host/Expert):** Offers deeper insights, provides alternative viewpoints or elaborations on the source content, asks clarifying or challenging questions, and shares relevant anecdotes or examples. Adopt a complementary tone (e.g., analytical, enthusiastic, reflective, slightly skeptical).
* **Consistency is Key:** Ensure each speaker maintains their distinct voice, vocabulary choice, sentence structure, and perspective throughout the entire script. Avoid having them sound interchangeable. Their interaction should feel like a genuine partnership.
2. **Craft Natural & Dynamic Dialogue:**
* **Emulate Real Conversation:** Use contractions (e.g., "don't", "it's"), interjections ("Oh!", "Wow!", "Hmm"), discourse markers ("you know", "right?", "well"), and occasional natural pauses or filler words. Avoid overly formal language or complex sentence structures typical of written text.
* **Foster Interaction & Chemistry:** Write dialogue where speakers genuinely react *to each other*. They should build on points ("Exactly, and that reminds me..."), ask follow-up questions ("Could you expand on that?"), express agreement/disagreement respectfully ("That's a fair point, but have you considered...?"), and show active listening.
* **Vary Rhythm & Pace:** Mix short, punchy lines with longer, more explanatory ones. Vary sentence beginnings. Use questions to break up exposition. The rhythm should feel spontaneous, not monotonous.
* **Inject Personality & Relatability:** Allow for appropriate humor, moments of surprise or curiosity, brief personal reflections ("I actually experienced something similar..."), or relatable asides that fit the hosts' personas and the topic. Lightly reference past discussions if it enhances context ("Remember last week when we touched on...?").
3. **Structure for Flow and Listener Engagement:**
* **Natural Beginning:** Start with dialogue that flows naturally after an introduction (which will be added manually). Avoid redundant greetings or podcast name mentions since these will be added separately.
* **Logical Progression & Signposting:** Guide the listener through the information smoothly. Use clear transitions to link different ideas or segments ("So, now that we've covered X, let's dive into Y...", "That actually brings me to another key finding..."). Ensure topics flow logically from one to the next.
* **Meaningful Conclusion:** Summarize the key takeaways or main points discussed, reinforcing the core message derived from the source content. End with a final thought, a lingering question for the audience, or a brief teaser for what's next, providing a sense of closure. Avoid abrupt endings.
4. **Integrate Source Content Seamlessly & Accurately:**
* **Translate, Don't Recite:** Rephrase information from the `<source_content>` into conversational language suitable for each host's persona. Avoid directly copying dense sentences or technical jargon without explanation. The goal is discussion, not narration.
* **Explain & Contextualize:** Use analogies, simple examples, storytelling, or have one host ask clarifying questions (acting as a listener surrogate) to break down complex ideas from the source.
* **Weave Information Naturally:** Integrate facts, data, or key points from the source *within* the dialogue, not as standalone, undigested blocks. Attribute information conversationally where appropriate ("The research mentioned...", "Apparently, the key factor is...").
* **Balance Depth & Accessibility:** Ensure the conversation is informative and factually accurate based on the source content, but prioritize clear communication and engaging delivery over exhaustive technical detail. Make it understandable and interesting for a general audience.
5. **Length & Pacing:**
* **Six-Minute Duration:** Create a transcript that, when read at a natural speaking pace, would result in approximately 6 minutes of audio. Typically, this means around 1000 words total (based on average speaking rate of 150 words per minute).
* **Concise Speaking Turns:** Keep most speaking turns relatively brief and focused. Aim for a natural back-and-forth rhythm rather than extended monologues.
* **Essential Content Only:** Prioritize the most important information from the source content. Focus on quality over quantity, ensuring every line contributes meaningfully to the topic.
</guidelines>
<examples>
Input: "Quantum computing uses quantum bits or qubits which can exist in multiple states simultaneously due to superposition."
Output:
{{
"podcast_transcripts": [
{{
"speaker_id": 0,
"dialog": "Today we're diving into the mind-bending world of quantum computing. You know, this is a topic I've been excited to cover for weeks."
}},
{{
"speaker_id": 1,
"dialog": "Same here! And I know our listeners have been asking for it. But I have to admit, the concept of quantum computing makes my head spin a little. Can we start with the basics?"
}},
{{
"speaker_id": 0,
"dialog": "Absolutely. So regular computers use bits, right? Little on-off switches that are either 1 or 0. But quantum computers use something called qubits, and this is where it gets fascinating."
}},
{{
"speaker_id": 1,
"dialog": "Wait, what makes qubits so special compared to regular bits?"
}},
{{
"speaker_id": 0,
"dialog": "The magic is in something called superposition. These qubits can exist in multiple states at the same time, not just 1 or 0."
}},
{{
"speaker_id": 1,
"dialog": "That sounds impossible! How would you even picture that?"
}},
{{
"speaker_id": 0,
"dialog": "Think of it like a coin spinning in the air. Before it lands, is it heads or tails?"
}},
{{
"speaker_id": 1,
"dialog": "Well, it's... neither? Or I guess both, until it lands? Oh, I think I see where you're going with this."
}}
]
}}
</examples>
Transform the source material into a lively and engaging podcast conversation. Craft dialogue that showcases authentic host chemistry and natural interaction (including occasional disagreement, building on points, or asking follow-up questions). Use varied speech patterns reflecting real human conversation, ensuring the final script effectively educates *and* entertains the listener while keeping within a 5-minute audio duration.
</podcast_generation_system>
"""

View file

@ -1,43 +0,0 @@
"""Define the state structures for the agent."""
from __future__ import annotations
from dataclasses import dataclass
from pydantic import BaseModel, Field
from sqlalchemy.ext.asyncio import AsyncSession
class PodcastTranscriptEntry(BaseModel):
    """
    One spoken line of a podcast transcript: who speaks and what they say.

    Both fields are required.
    """

    speaker_id: int = Field(..., description="The ID of the speaker (0 or 1)")
    dialog: str = Field(..., description="The dialog text spoken by the speaker")
class PodcastTranscripts(BaseModel):
    """
    The full podcast transcript: an ordered list of transcript entries.

    The single required field, ``podcast_transcripts``, holds the entries in
    speaking order.
    """

    podcast_transcripts: list[PodcastTranscriptEntry] = Field(
        ..., description="List of transcript entries with alternating speakers"
    )
@dataclass
class State:
    """State carried through the podcast agent graph.

    ``db_session`` and ``source_content`` are required inputs. The two
    optional fields default to ``None``.

    See: https://langchain-ai.github.io/langgraph/concepts/low_level/#state
    for more information.
    """

    # Runtime context
    db_session: AsyncSession
    # Raw text the podcast is generated from.
    source_content: str
    # Transcript entries; None until a transcript has been produced.
    podcast_transcript: list[PodcastTranscriptEntry] | None = None
    # Path of the merged audio file; None until audio has been produced.
    final_podcast_file_path: str | None = None

View file

@ -1,84 +0,0 @@
def get_voice_for_provider(provider: str, speaker_id: int) -> dict | str:
"""
Get the appropriate voice configuration based on the TTS provider and speaker ID.
Args:
provider: The TTS provider (e.g., "openai/tts-1", "vertex_ai/test")
speaker_id: The ID of the speaker (0-5)
Returns:
Voice configuration - string for OpenAI, dict for Vertex AI
"""
if provider == "local/kokoro":
# Kokoro voice mapping - https://huggingface.co/hexgrad/Kokoro-82M/tree/main/voices
kokoro_voices = {
0: "am_adam", # Default/intro voice
1: "af_bella", # First speaker
}
return kokoro_voices.get(speaker_id, "af_heart")
# Extract provider type from the model string
provider_type = (
provider.split("/")[0].lower() if "/" in provider else provider.lower()
)
if provider_type == "openai":
# OpenAI voice mapping - simple string values
openai_voices = {
0: "alloy", # Default/intro voice
1: "echo", # First speaker
2: "fable", # Second speaker
3: "onyx", # Third speaker
4: "nova", # Fourth speaker
5: "shimmer", # Fifth speaker
}
return openai_voices.get(speaker_id, "alloy")
elif provider_type == "vertex_ai":
# Vertex AI voice mapping - dict with languageCode and name
vertex_voices = {
0: {
"languageCode": "en-US",
"name": "en-US-Studio-O",
},
1: {
"languageCode": "en-US",
"name": "en-US-Studio-M",
},
2: {
"languageCode": "en-UK",
"name": "en-UK-Studio-A",
},
3: {
"languageCode": "en-UK",
"name": "en-UK-Studio-B",
},
4: {
"languageCode": "en-AU",
"name": "en-AU-Studio-A",
},
5: {
"languageCode": "en-AU",
"name": "en-AU-Studio-B",
},
}
return vertex_voices.get(speaker_id, vertex_voices[0])
elif provider_type == "azure":
# OpenAI voice mapping - simple string values
azure_voices = {
0: "alloy", # Default/intro voice
1: "echo", # First speaker
2: "fable", # Second speaker
3: "onyx", # Third speaker
4: "nova", # Fourth speaker
5: "shimmer", # Fifth speaker
}
return azure_voices.get(speaker_id, "alloy")
else:
# Default fallback to OpenAI format for unknown providers
default_voices = {
0: {},
1: {},
}
return default_voices.get(speaker_id, default_voices[0])

View file

@ -1,8 +1,7 @@
"""Video Presentation LangGraph Agent.
This module defines a graph for generating video presentations
from source content, similar to the podcaster agent but producing
slide-based video presentations with TTS narration.
This module defines a graph for generating slide-based video presentations
from source content, with TTS narration per slide.
"""
from .graph import graph

View file

@ -181,7 +181,8 @@ celery_app = Celery(
backend=CELERY_RESULT_BACKEND,
include=[
"app.tasks.celery_tasks.document_tasks",
"app.tasks.celery_tasks.podcast_tasks",
"app.podcasts.tasks.draft",
"app.podcasts.tasks.render",
"app.tasks.celery_tasks.video_presentation_tasks",
"app.tasks.celery_tasks.connector_tasks",
"app.tasks.celery_tasks.obsidian_tasks",

View file

@ -114,13 +114,6 @@ class SearchSourceConnectorType(StrEnum):
COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR"
class PodcastStatus(StrEnum):
    """String-valued status of a podcast row.

    The values are the labels used by the ``podcast_status`` database enum.
    """

    PENDING = "pending"
    GENERATING = "generating"
    READY = "ready"
    FAILED = "failed"
class VideoPresentationStatus(StrEnum):
PENDING = "pending"
GENERATING = "generating"
@ -1537,41 +1530,6 @@ class Chunk(BaseModel, TimestampMixin):
document = relationship("Document", back_populates="chunks")
class Podcast(BaseModel, TimestampMixin):
    """Podcast model for storing generated podcasts.

    Maps the ``podcasts`` table. Every row belongs to a search space and may
    optionally reference the chat thread it is associated with.
    """

    __tablename__ = "podcasts"
    title = Column(String(500), nullable=False)
    # Transcript payload stored as JSONB; its schema is not defined here.
    podcast_transcript = Column(JSONB, nullable=True)
    # Text location of the audio file; nullable.
    file_location = Column(Text, nullable=True)
    status = Column(
        SQLAlchemyEnum(
            PodcastStatus,
            name="podcast_status",
            # The database enum type is not created by this column definition.
            create_type=False,
            # Persist the enum *values* ("pending", ...) rather than member names.
            values_callable=lambda x: [e.value for e in x],
        ),
        nullable=False,
        default=PodcastStatus.READY,
        server_default="ready",
        index=True,
    )
    # Deleting the owning search space cascades to its podcasts.
    search_space_id = Column(
        Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False
    )
    search_space = relationship("SearchSpace", back_populates="podcasts")
    # Deleting the thread keeps the podcast and nulls this reference.
    thread_id = Column(
        Integer,
        ForeignKey("new_chat_threads.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )
    thread = relationship("NewChatThread")
class VideoPresentation(BaseModel, TimestampMixin):
"""Video presentation model for storing AI-generated video presentations.
@ -2908,6 +2866,10 @@ from app.automations.persistence import ( # noqa: E402, F401
)
from app.file_storage.persistence import DocumentFile # noqa: E402, F401
from app.notifications.persistence import Notification # noqa: E402, F401
from app.podcasts.persistence import ( # noqa: E402, F401
Podcast,
PodcastStatus,
)
engine = create_async_engine(
DATABASE_URL,

View file

@ -0,0 +1,9 @@
"""Podcast feature: brief resolution, transcript drafting, and audio rendering.
Owns the ``podcasts`` table model, which :mod:`app.db` re-exports so existing
``from app.db import Podcast`` imports keep resolving.
"""
from __future__ import annotations
__all__: list[str] = []

View file

@ -0,0 +1,7 @@
"""HTTP API for the podcast lifecycle."""
from __future__ import annotations
from .routes import router
__all__ = ["router"]

View file

@ -0,0 +1,342 @@
"""HTTP surface for the podcast lifecycle.
Status is observed by the frontend through Zero, so these routes are about
actions (create, edit/approve the brief, regenerate, cancel) and audio delivery.
Each mutating route performs the guarded transition via the service, commits,
then enqueues the matching Celery task; lifecycle errors map to 409/422.
"""
from __future__ import annotations
import os
from pathlib import Path
from fastapi import APIRouter, Depends, HTTPException, Response
from fastapi.responses import StreamingResponse
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config as app_config
from app.db import (
Permission,
SearchSpace,
SearchSpaceMembership,
User,
get_async_session,
)
from app.podcasts.generation.brief import propose_brief
from app.podcasts.persistence import Podcast, PodcastRepository
from app.podcasts.service import (
InvalidTransition,
PodcastService,
PreconditionFailed,
SpecConflict,
)
from app.podcasts.storage import open_audio_stream, purge_audio
from app.podcasts.tasks import draft_transcript_task
from app.podcasts.tts import get_text_to_speech
from app.podcasts.voices import (
get_voice_catalog,
provider_from_service,
render_voice_preview,
)
from app.users import current_active_user
from app.utils.rbac import check_permission
from .schemas import (
CreatePodcastRequest,
PodcastDetail,
PodcastSummary,
UpdateSpecRequest,
VoiceOption,
)
router = APIRouter()
@router.get("/podcasts", response_model=list[PodcastSummary])
async def list_podcasts(
    search_space_id: int | None = None,
    skip: int = 0,
    limit: int = 100,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """Return one page of podcasts, newest first.

    When ``search_space_id`` is given the caller must hold ``PODCASTS_READ`` on
    that space; otherwise the listing covers every search space the caller has
    a membership in.

    Raises:
        HTTPException: 400 when ``skip`` is negative or ``limit`` is below 1.
    """
    if not (skip >= 0 and limit >= 1):
        raise HTTPException(status_code=400, detail="Invalid pagination parameters")

    if search_space_id is None:
        # No explicit space: scope the listing through the caller's memberships.
        scoped = (
            select(Podcast)
            .join(SearchSpace)
            .join(SearchSpaceMembership)
            .where(SearchSpaceMembership.user_id == user.id)
        )
    else:
        await _require(session, user, search_space_id, Permission.PODCASTS_READ)
        scoped = select(Podcast).where(Podcast.search_space_id == search_space_id)

    page = scoped.order_by(Podcast.created_at.desc()).offset(skip).limit(limit)
    rows = await session.execute(page)
    return list(rows.scalars().all())
@router.get("/podcasts/voices", response_model=list[VoiceOption])
async def list_voices(language: str | None = None):
    """List the active TTS provider's voices, narrowed to ``language`` if given.

    Raises:
        HTTPException: 503 when no TTS service is configured.
    """
    if not app_config.TTS_SERVICE:
        raise HTTPException(status_code=503, detail="No TTS provider configured")

    provider = provider_from_service(app_config.TTS_SERVICE)
    catalog = get_voice_catalog()
    if language:
        matches = catalog.for_language(provider, language)
    else:
        matches = catalog.for_provider(provider)

    options = []
    for voice in matches:
        options.append(
            VoiceOption(
                voice_id=voice.voice_id,
                display_name=voice.display_name,
                language=voice.language,
                gender=voice.gender.value,
            )
        )
    return options
@router.get("/podcasts/voices/{voice_id}/preview")
async def preview_voice(
    voice_id: str,
    user: User = Depends(current_active_user),
):
    """Render a short audio sample of ``voice_id`` so users can pick by sound.

    Raises:
        HTTPException: 503 when no TTS service is configured; 404 when the
            voice is unknown or belongs to a provider other than the active one.
    """
    if not app_config.TTS_SERVICE:
        raise HTTPException(status_code=503, detail="No TTS provider configured")
    active_provider = provider_from_service(app_config.TTS_SERVICE)

    try:
        voice = get_voice_catalog().get(voice_id)
    except KeyError:
        # Suppress the KeyError context: the 404 is the whole story for callers.
        raise HTTPException(status_code=404, detail="Unknown voice") from None

    if voice.provider is active_provider:
        audio, mime_type = await render_voice_preview(voice, get_text_to_speech())
        return Response(content=audio, media_type=mime_type)

    raise HTTPException(
        status_code=404, detail="Voice not offered by the active TTS provider"
    )
@router.post("/podcasts", response_model=PodcastDetail, status_code=201)
async def create_podcast(
    body: CreatePodcastRequest,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """Create a podcast, attach a proposed brief to it, and commit.

    Requires ``PODCASTS_CREATE`` on the target search space.
    """
    space_id = body.search_space_id
    await _require(session, user, space_id, Permission.PODCASTS_CREATE)

    podcasts = PodcastService(session)
    created = await podcasts.create(
        title=body.title,
        search_space_id=space_id,
        thread_id=body.thread_id,
    )
    created.source_content = body.source_content

    # The brief is proposed from the request's sizing hints before the commit,
    # so the row is persisted together with its brief.
    proposed = await propose_brief(
        session,
        search_space_id=space_id,
        speaker_count=body.speaker_count,
        min_minutes=body.min_minutes,
        max_minutes=body.max_minutes,
        focus=body.focus,
    )
    await podcasts.attach_brief(created, proposed)
    await session.commit()
    return PodcastDetail.of(created)
@router.get("/podcasts/{podcast_id}", response_model=PodcastDetail)
async def get_podcast(
    podcast_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """Return the full detail view of one podcast (requires ``PODCASTS_READ``)."""
    return PodcastDetail.of(
        await _load(session, user, podcast_id, Permission.PODCASTS_READ)
    )
@router.patch("/podcasts/{podcast_id}/spec", response_model=PodcastDetail)
async def update_spec(
    podcast_id: int,
    body: UpdateSpecRequest,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """Replace the podcast's brief, guarded by the caller's expected version.

    Lifecycle errors raised by the service are mapped to HTTP errors by
    ``_lifecycle_errors``; the change is committed only when none occurs.
    """
    target = await _load(session, user, podcast_id, Permission.PODCASTS_UPDATE)
    async with _lifecycle_errors():
        service = PodcastService(session)
        await service.update_spec(target, body.spec, body.expected_version)
    await session.commit()
    return PodcastDetail.of(target)
@router.post("/podcasts/{podcast_id}/brief/approve", response_model=PodcastDetail)
async def approve_brief(
    podcast_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """Approve the brief and start drafting the transcript."""
    target = await _load(session, user, podcast_id, Permission.PODCASTS_UPDATE)
    async with _lifecycle_errors():
        service = PodcastService(session)
        await service.begin_drafting(target)
    # Commit first so the drafting task is only enqueued for a persisted
    # transition.
    await session.commit()
    draft_transcript_task.delay(target.id, target.search_space_id)
    return PodcastDetail.of(target)
@router.post(
    "/podcasts/{podcast_id}/transcript/regenerate", response_model=PodcastDetail
)
async def regenerate_transcript(
    podcast_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """Reopen the brief gate for a fresh take; drafting waits for re-approval."""
    target = await _load(session, user, podcast_id, Permission.PODCASTS_UPDATE)
    async with _lifecycle_errors():
        service = PodcastService(session)
        await service.regenerate(target)
    # No task is enqueued here: this route only performs the transition.
    await session.commit()
    return PodcastDetail.of(target)
@router.post("/podcasts/{podcast_id}/regenerate/revert", response_model=PodcastDetail)
async def revert_regeneration(
    podcast_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """Back out of a regeneration and return to the finished episode."""
    target = await _load(session, user, podcast_id, Permission.PODCASTS_UPDATE)
    async with _lifecycle_errors():
        service = PodcastService(session)
        await service.revert_regeneration(target)
    await session.commit()
    return PodcastDetail.of(target)
@router.post("/podcasts/{podcast_id}/cancel", response_model=PodcastDetail)
async def cancel_podcast(
    podcast_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """Cancel the podcast via the service and commit the transition.

    Requires ``PODCASTS_UPDATE``; lifecycle errors are mapped to HTTP errors by
    ``_lifecycle_errors``.
    """
    target = await _load(session, user, podcast_id, Permission.PODCASTS_UPDATE)
    async with _lifecycle_errors():
        service = PodcastService(session)
        await service.cancel(target)
    await session.commit()
    return PodcastDetail.of(target)
@router.delete("/podcasts/{podcast_id}", response_model=dict)
async def delete_podcast(
    podcast_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """Purge the podcast's audio, delete its row, and commit.

    Requires ``PODCASTS_DELETE`` on the podcast's search space.
    """
    doomed = await _load(session, user, podcast_id, Permission.PODCASTS_DELETE)
    # The audio is purged before the row is removed.
    await purge_audio(doomed)
    await session.delete(doomed)
    await session.commit()
    confirmation = {"message": "Podcast deleted successfully"}
    return confirmation
@router.get("/podcasts/{podcast_id}/stream")
async def stream_podcast(
    podcast_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """Stream a podcast's audio as ``audio/mpeg``.

    Audio recorded under a storage key is streamed through
    ``open_audio_stream``. Rows that only carry a legacy local ``file_location``
    are streamed from disk.

    Raises:
        HTTPException: 404 when neither a storage key nor an existing local
            file is available (or when ``_load`` cannot find the podcast).
    """
    podcast = await _load(session, user, podcast_id, Permission.PODCASTS_READ)

    if podcast.storage_key:
        # NOTE(review): "Accept-Ranges: bytes" is advertised, but no Range
        # handling is visible in this route - confirm it is honoured downstream.
        return StreamingResponse(
            open_audio_stream(podcast),
            media_type="audio/mpeg",
            headers={"Accept-Ranges": "bytes"},
        )

    # Back-compat: rows rendered before the storage migration kept a local path.
    legacy_path = podcast.file_location
    if legacy_path and os.path.isfile(legacy_path):

        def iterfile(chunk_size: int = 64 * 1024):
            # Read fixed-size chunks: iterating a binary file object splits on
            # b"\n", which yields arbitrarily sized pieces for MP3 data.
            with open(legacy_path, mode="rb") as handle:
                while chunk := handle.read(chunk_size):
                    yield chunk

        # Emit the filename as a quoted-string so names containing spaces or
        # separators do not produce a malformed Content-Disposition header.
        filename = Path(legacy_path).name.replace("\\", "\\\\").replace('"', '\\"')
        return StreamingResponse(
            iterfile(),
            media_type="audio/mpeg",
            headers={
                "Accept-Ranges": "bytes",
                "Content-Disposition": f'inline; filename="{filename}"',
            },
        )

    raise HTTPException(status_code=404, detail="Podcast audio not found")
async def _require(
    session: AsyncSession,
    user: User,
    search_space_id: int,
    permission: Permission,
) -> None:
    """Check that ``user`` holds ``permission`` on the given search space.

    Delegates to ``check_permission``, passing the permission's value and the
    podcast-specific denial message.
    """
    denial_message = "You don't have permission for podcasts in this search space"
    await check_permission(
        session, user, search_space_id, permission.value, denial_message
    )
async def _load(
    session: AsyncSession,
    user: User,
    podcast_id: int,
    permission: Permission,
) -> Podcast:
    """Fetch a podcast by id and check ``permission`` on its search space.

    Raises:
        HTTPException: 404 when no podcast has that id.
    """
    repository = PodcastRepository(session)
    found = await repository.get(podcast_id)
    if found is not None:
        # The permission is checked against the podcast's own search space.
        await _require(session, user, found.search_space_id, permission)
        return found
    raise HTTPException(status_code=404, detail="Podcast not found")
class _lifecycle_errors:
"""Map service lifecycle errors onto HTTP responses."""
async def __aenter__(self) -> None:
return None
async def __aexit__(self, exc_type, exc, tb) -> bool:
if exc is None:
return False
if isinstance(exc, SpecConflict):
raise HTTPException(status_code=409, detail=str(exc)) from exc
if isinstance(exc, InvalidTransition):
raise HTTPException(status_code=409, detail=str(exc)) from exc
if isinstance(exc, PreconditionFailed):
raise HTTPException(status_code=422, detail=str(exc)) from exc
return False

View file

@ -0,0 +1,97 @@
"""Request and response shapes for the podcast API.
Read models surface the lifecycle state the frontend can't derive from Zero (the
deserialized brief and transcript); the action requests carry just what each
guarded transition needs.
"""
from __future__ import annotations
from datetime import datetime
from pydantic import BaseModel, ConfigDict, Field
from app.podcasts.persistence import Podcast, PodcastStatus
from app.podcasts.schemas import PodcastSpec, Transcript
from app.podcasts.service import has_stored_episode, read_spec, read_transcript
# Defaults applied when a create request omits brief sizing; the brief gate lets
# the user adjust before any cost is incurred.
DEFAULT_SPEAKER_COUNT = 2
DEFAULT_MIN_MINUTES = 10
DEFAULT_MAX_MINUTES = 20
class CreatePodcastRequest(BaseModel):
    """Create a podcast and kick off brief proposal.

    The sizing fields fall back to the module-level ``DEFAULT_*`` constants when
    omitted.
    """

    # Required, 1-500 characters.
    title: str = Field(..., min_length=1, max_length=500)
    search_space_id: int
    # Required and non-empty.
    source_content: str = Field(..., min_length=1)
    thread_id: int | None = None
    # Between 1 and 6 speakers.
    speaker_count: int = Field(default=DEFAULT_SPEAKER_COUNT, ge=1, le=6)
    # NOTE(review): nothing in this model enforces min_minutes <= max_minutes;
    # confirm that is validated downstream.
    min_minutes: int = Field(default=DEFAULT_MIN_MINUTES, ge=1)
    max_minutes: int = Field(default=DEFAULT_MAX_MINUTES, ge=1)
    # Optional, at most 2000 characters.
    focus: str | None = Field(default=None, max_length=2000)
class UpdateSpecRequest(BaseModel):
    """Replace the brief at the gate, guarded by the expected version."""

    # The full replacement brief.
    spec: PodcastSpec
    # Required, at least 1; the update route passes it to the service together
    # with the spec.
    expected_version: int = Field(..., ge=1)
class VoiceOption(BaseModel):
    """One selectable voice surfaced to the brief editor.

    All four fields are required plain strings.
    """

    voice_id: str
    display_name: str
    language: str
    gender: str
class PodcastSummary(BaseModel):
    """Lightweight list item."""

    # from_attributes lets the model be populated from an object's attributes
    # rather than only from a dict.
    model_config = ConfigDict(from_attributes=True)
    id: int
    title: str
    status: PodcastStatus
    created_at: datetime
    search_space_id: int
class PodcastDetail(BaseModel):
    """Full podcast state for the detail view and action responses."""

    id: int
    title: str
    status: PodcastStatus
    spec_version: int
    spec: PodcastSpec | None
    transcript: Transcript | None
    has_audio: bool
    duration_seconds: int | None
    error: str | None
    created_at: datetime
    search_space_id: int
    thread_id: int | None

    @classmethod
    def of(cls, podcast: Podcast) -> PodcastDetail:
        """Build the detail view from a ``Podcast`` row.

        ``spec``, ``transcript`` and ``has_audio`` come from the service
        helpers; the remaining fields are copied from the row.
        """
        payload = {
            "id": podcast.id,
            "title": podcast.title,
            "status": PodcastStatus(podcast.status),
            "spec_version": podcast.spec_version,
            "spec": read_spec(podcast),
            "transcript": read_transcript(podcast),
            "has_audio": has_stored_episode(podcast),
            "duration_seconds": podcast.duration_seconds,
            "error": podcast.error,
            "created_at": podcast.created_at,
            "search_space_id": podcast.search_space_id,
            "thread_id": podcast.thread_id,
        }
        return cls(**payload)

View file

@ -0,0 +1,19 @@
"""Generation: the controlled graphs that produce a brief and a transcript.
``brief`` proposes a reviewable spec from deterministic defaults; ``transcript``
is the LLM-driven step, drafting long-form dialogue outline-first.
"""
from __future__ import annotations
from .brief import BriefConfig, BriefState, build_brief_graph
from .transcript import TranscriptConfig, TranscriptState, build_transcript_graph
__all__ = [
"BriefConfig",
"BriefState",
"TranscriptConfig",
"TranscriptState",
"build_brief_graph",
"build_transcript_graph",
]

View file

@ -0,0 +1,10 @@
"""Brief planning: propose a reviewable spec from last-used preferences."""
from __future__ import annotations
from .config import BriefConfig
from .graph import build_brief_graph
from .propose import propose_brief
from .state import BriefState
__all__ = ["BriefConfig", "BriefState", "build_brief_graph", "propose_brief"]

View file

@ -0,0 +1,30 @@
"""Configurable inputs for the brief-planning graph."""
from __future__ import annotations
from dataclasses import dataclass, field, fields
from langchain_core.runnables import RunnableConfig
# Sensible defaults for a fresh brief; the user adjusts the range at the gate.
DEFAULT_SPEAKER_COUNT = 2
DEFAULT_MIN_MINUTES = 10
DEFAULT_MAX_MINUTES = 20
@dataclass(kw_only=True)
class BriefConfig:
    """Signals used to propose a brief; everything here is non-LLM context."""

    speaker_count: int = DEFAULT_SPEAKER_COUNT
    min_minutes: int = DEFAULT_MIN_MINUTES
    max_minutes: int = DEFAULT_MAX_MINUTES
    focus: str | None = None
    last_used_language: str | None = None
    last_used_voices: list[str] = field(default_factory=list)

    @classmethod
    def from_runnable_config(cls, config: RunnableConfig | None = None) -> BriefConfig:
        """Build a config from the ``configurable`` section of ``config``.

        Keys that do not name an init field are ignored; a missing or empty
        config yields the defaults.
        """
        if config:
            configurable = config.get("configurable") or {}
        else:
            configurable = {}

        accepted = {item.name for item in fields(cls) if item.init}
        kwargs = {}
        for key, value in configurable.items():
            if key in accepted:
                kwargs[key] = value
        return cls(**kwargs)

View file

@ -0,0 +1,25 @@
"""The brief-planning graph: propose a reviewable spec from defaults."""
from __future__ import annotations
from langgraph.graph import StateGraph
from .config import BriefConfig
from .nodes import propose_spec
from .state import BriefState
def build_brief_graph():
    """Compile the single-node brief graph: start -> propose_spec -> end."""
    builder = StateGraph(BriefState, config_schema=BriefConfig)
    builder.add_node("propose_spec", propose_spec)
    for source, target in (("__start__", "propose_spec"), ("propose_spec", "__end__")):
        builder.add_edge(source, target)

    compiled = builder.compile()
    compiled.name = "Surfsense Podcast Brief"
    return compiled
graph = build_brief_graph()

View file

@ -0,0 +1,119 @@
"""Brief-planning node: propose a full spec from deterministic defaults.
``propose_spec`` is pure resolution: it never spends tokens. It reuses the
user's last-used language/voices when available and otherwise falls back to
English, so the brief gate opens pre-filled and the common case needs no edits.
"""
from __future__ import annotations
from typing import Any
from langchain_core.runnables import RunnableConfig
from app.config import config as app_config
from app.podcasts.resolution import (
DEFAULT_LANGUAGE,
LanguageContext,
resolve_language,
resolve_voices,
)
from app.podcasts.schemas import (
DurationTarget,
PodcastSpec,
PodcastStyle,
SpeakerRole,
SpeakerSpec,
normalize_language_tag,
)
from app.podcasts.voices import (
TtsProvider,
VoiceCatalog,
get_voice_catalog,
provider_from_service,
)
from .config import BriefConfig
from .state import BriefState
# Default role per speaker slot; extra speakers beyond the list fall back to guest.
_ROLE_BY_SLOT = (
SpeakerRole.HOST,
SpeakerRole.GUEST,
SpeakerRole.EXPERT,
SpeakerRole.COHOST,
SpeakerRole.NARRATOR,
)
def propose_spec(state: BriefState, config: RunnableConfig) -> dict[str, Any]:
    """Build a complete :class:`PodcastSpec` from the resolved defaults.

    Returns a state update of the form ``{"spec": <PodcastSpec>}``.
    """
    settings = BriefConfig.from_runnable_config(config)
    provider = _active_provider()
    catalog = get_voice_catalog()

    language = _supported_language(
        last_used=settings.last_used_language,
        provider=provider,
        catalog=catalog,
    )
    chosen_voices = resolve_voices(
        catalog=catalog,
        provider=provider,
        language=language,
        speaker_count=settings.speaker_count,
        preferred=settings.last_used_voices,
    )

    # One speaker per resolved voice; the slot is the voice's position.
    speakers = []
    for slot, voice in enumerate(chosen_voices):
        speakers.append(
            SpeakerSpec(
                slot=slot,
                name=_default_name(slot),
                role=_role_for(slot),
                voice_id=voice.voice_id,
            )
        )

    duration = DurationTarget(
        min_minutes=settings.min_minutes, max_minutes=settings.max_minutes
    )
    proposed = PodcastSpec(
        language=language,
        style=PodcastStyle.CONVERSATIONAL,
        speakers=speakers,
        duration=duration,
        focus=settings.focus,
    )
    return {"spec": proposed}
def _active_provider() -> TtsProvider:
    """Return the provider for the configured TTS service.

    Raises:
        ValueError: when ``TTS_SERVICE`` is unset or empty.
    """
    if configured := app_config.TTS_SERVICE:
        return provider_from_service(configured)
    raise ValueError("TTS_SERVICE is not configured")
def _supported_language(
    *,
    last_used: str | None,
    provider: TtsProvider,
    catalog: VoiceCatalog,
) -> str:
    """Resolve the brief language, falling back to ``DEFAULT_LANGUAGE``.

    The fallback applies when the resolved tag cannot be normalized or when the
    catalog reports that the provider does not support it.
    """
    candidate = resolve_language(LanguageContext(last_used=last_used))
    try:
        normalized = normalize_language_tag(candidate)
    except ValueError:
        normalized = DEFAULT_LANGUAGE

    if catalog.supports_language(provider, normalized):
        return normalized
    return DEFAULT_LANGUAGE
def _role_for(slot: int) -> SpeakerRole:
    """Return the default role for ``slot``; slots past the table are guests."""
    if slot >= len(_ROLE_BY_SLOT):
        return SpeakerRole.GUEST
    return _ROLE_BY_SLOT[slot]
def _default_name(slot: int) -> str:
    """Return a display name derived from the slot's default role.

    Slots beyond the role table get the slot number appended to the name.
    """
    label = _role_for(slot).value.replace("cohost", "co-host").title()
    if slot < len(_ROLE_BY_SLOT):
        return label
    return f"{label} {slot}"

View file

@ -0,0 +1,40 @@
"""Propose a podcast's initial brief spec."""
from __future__ import annotations
from sqlalchemy.ext.asyncio import AsyncSession
from app.podcasts.persistence import PodcastRepository
from app.podcasts.schemas import PodcastSpec
from app.podcasts.service import preferences_from
from .config import DEFAULT_MAX_MINUTES, DEFAULT_MIN_MINUTES, DEFAULT_SPEAKER_COUNT
from .graph import graph as brief_graph
from .state import BriefState
async def propose_brief(
    session: AsyncSession,
    *,
    search_space_id: int,
    speaker_count: int = DEFAULT_SPEAKER_COUNT,
    min_minutes: int = DEFAULT_MIN_MINUTES,
    max_minutes: int = DEFAULT_MAX_MINUTES,
    focus: str | None = None,
) -> PodcastSpec:
    """Run the brief graph and return the proposed spec.

    The last-used language and voices are derived from the search space's most
    recent podcast that has a spec and are passed to the graph alongside the
    sizing arguments.
    """
    previous = await PodcastRepository(session).latest_with_spec(search_space_id)
    last_language, last_voices = preferences_from(previous)

    configurable = {
        "speaker_count": speaker_count,
        "min_minutes": min_minutes,
        "max_minutes": max_minutes,
        "focus": focus,
        "last_used_language": last_language,
        "last_used_voices": last_voices,
    }
    outcome = await brief_graph.ainvoke(
        BriefState(), config={"configurable": configurable}
    )
    return outcome["spec"]

View file

@ -0,0 +1,14 @@
"""Mutable state threaded through the brief-planning graph."""
from __future__ import annotations
from dataclasses import dataclass
from app.podcasts.schemas import PodcastSpec
@dataclass
class BriefState:
    """The proposed spec the graph produces; inputs arrive via the config."""

    # None until the graph's propose_spec node returns a spec.
    spec: PodcastSpec | None = None

View file

@ -0,0 +1,13 @@
"""Prompt builders for the generation graphs."""
from __future__ import annotations
from .draft_segment import draft_segment_prompt
from .plan_outline import plan_outline_prompt
from .speakers import render_speaker_roster
__all__ = [
"draft_segment_prompt",
"plan_outline_prompt",
"render_speaker_roster",
]

View file

@ -0,0 +1,54 @@
"""Prompt for drafting one outline segment into dialogue turns.
Each segment is drafted on its own so long episodes stay coherent and within
context limits. A short recap of the preceding dialogue is passed in so the new
segment continues naturally instead of restarting. The model must write in the
episode language and attribute every line to a real speaker slot.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from app.podcasts.schemas import PodcastSpec
from .speakers import render_speaker_roster
if TYPE_CHECKING:
from app.podcasts.generation.transcript.planning import OutlineSegment
def draft_segment_prompt(
    *,
    spec: PodcastSpec,
    segment: OutlineSegment,
    position: int,
    total: int,
    recap: str | None,
) -> str:
    """Build the system prompt for drafting one outline segment.

    ``position`` and ``total`` place the segment within the episode. When
    ``recap`` is non-empty it is quoted as the conversation so far; otherwise
    the prompt marks this as the opening segment.
    """
    bullet_lines = [f"- {point}" for point in segment.talking_points]
    bullet_list = "\n".join(bullet_lines)

    if recap:
        continuity = (
            f"\nRecap of the conversation so far (continue from here, do not repeat "
            f"it):\n{recap}\n"
        )
    else:
        continuity = (
            "\nThis is the opening segment; begin the conversation naturally.\n"
        )

    return f"""\
You are scripting natural, engaging podcast dialogue for segment {position} of \
{total}.
Write entirely in {spec.language}. The format is {spec.style.value}.
Speakers attribute every line using these exact slot numbers:
{render_speaker_roster(spec)}
{continuity}
This segment is "{segment.title}". Cover these points using only facts grounded \
in the provided source content:
{bullet_list}
Aim for about {segment.target_words} words of dialogue. Keep turns conversational \
and varied; speakers should react to each other rather than deliver monologues. \
Do not add greetings or sign-offs unless this is the first or last segment.
Respond with strict JSON and nothing else:
{{"turns": [{{"speaker": <slot>, "text": "..."}}]}}
"""

View file

@ -0,0 +1,47 @@
"""Prompt for planning a long-form podcast outline before drafting dialogue.
Outlining first is what makes long-form reliable: a single LLM call cannot hold
a coherent one- to two-hour script, but it can plan segments that are then
drafted independently against a shared plan. The prompt is told the target
length so the number and size of segments scale with the requested duration.
"""
from __future__ import annotations
from app.podcasts.schemas import PodcastSpec
from .speakers import render_speaker_roster
def plan_outline_prompt(
    *,
    spec: PodcastSpec,
    target_words: int,
    suggested_segments: int,
    focus: str | None,
) -> str:
    """Build the system prompt for planning the episode outline.

    ``target_words`` and ``suggested_segments`` tell the model how large the
    outline should be; a non-empty ``focus`` adds the user's requested emphasis.
    """
    if focus:
        emphasis = f"\nThe user asked the episode to focus on:\n{focus}\n"
    else:
        emphasis = ""

    return f"""\
You are a podcast showrunner planning the structure of an episode before any \
dialogue is written.
The episode language is {spec.language}. The format is {spec.style.value}.
Speakers (refer to them by these slots later):
{render_speaker_roster(spec)}
{emphasis}
Plan an outline that, when fully drafted, reaches roughly {target_words} words \
of spoken dialogue (about {suggested_segments} segments). Each segment is one \
coherent beat of the conversation: an opening, distinct topic areas grounded in \
the source content, and a closing.
For each segment provide:
- title: a short label for the beat
- talking_points: 2-5 concrete points to cover, drawn from the source content
- target_words: how many words of dialogue this segment should run (the sum \
across segments should approximate {target_words})
Respond with strict JSON and nothing else:
{{"segments": [{{"title": "...", "talking_points": ["..."], "target_words": 0}}]}}
"""

View file

@ -0,0 +1,18 @@
"""Render a spec's speaker roster for prompts.
The drafting prompts must reference speakers by the exact ``slot`` the renderer
expects, so this is the single place that formats that roster, keeping the
slot contract identical across every prompt that mentions speakers.
"""
from __future__ import annotations
from app.podcasts.schemas import PodcastSpec
def render_speaker_roster(spec: PodcastSpec) -> str:
    """Render the spec's speakers as one bullet line per speaker.

    Each line has the form ``- slot <n>: <name> (role: <role>)``. The explicit
    ``": "`` separator keeps the slot number from running into the name, which
    matters when a name starts with a digit.

    Returns:
        The newline-joined roster, or an empty string when there are no
        speakers.
    """
    return "\n".join(
        f"- slot {speaker.slot}: {speaker.name} (role: {speaker.role.value})"
        for speaker in spec.speakers
    )

View file

@ -0,0 +1,48 @@
"""Parse a model's reply into a Pydantic shape, tolerating chatty output.
Agent LLMs return JSON wrapped in prose, markdown fences, or reasoning blocks,
so a plain ``model_validate_json`` is unreliable. Centralising the tolerant
parse here keeps every generation node validating replies the same way.
"""
from __future__ import annotations
from typing import TYPE_CHECKING, TypeVar
from pydantic import BaseModel, ValidationError
from app.utils.content_utils import extract_text_content, strip_markdown_fences
if TYPE_CHECKING:
from langchain_core.messages import BaseMessage
T = TypeVar("T", bound=BaseModel)
class StructuredOutputError(RuntimeError):
    """The model reply could not be parsed into the expected shape.

    Raised by ``invoke_json`` when neither the whole reply nor its outermost
    ``{...}`` span validates against the requested model.
    """
async def invoke_json(llm, messages: list[BaseMessage], model: type[T]) -> T:
    """Invoke ``llm`` and validate its reply as ``model``.

    The reply text is validated as-is first. If that fails, the span from the
    first ``{`` to the last ``}`` is validated instead.

    Raises:
        StructuredOutputError: when no brace-delimited span exists, or when
            that span does not validate either.
    """
    reply = await llm.ainvoke(messages)
    text = strip_markdown_fences(extract_text_content(reply.content))

    try:
        return model.model_validate_json(text)
    except (ValidationError, ValueError):
        # Fall through to the tolerant parse below.
        pass

    opening = text.find("{")
    closing = text.rfind("}")
    if opening < 0 or closing < opening:
        raise StructuredOutputError(
            f"no JSON object found for {model.__name__} in model reply"
        )

    try:
        return model.model_validate_json(text[opening : closing + 1])
    except (ValidationError, ValueError) as exc:
        raise StructuredOutputError(
            f"could not parse {model.__name__} from model reply"
        ) from exc

View file

@ -0,0 +1,17 @@
"""Transcript drafting: outline-first, long-form dialogue generation."""
from __future__ import annotations
from .config import TranscriptConfig
from .graph import build_transcript_graph
from .planning import Outline, OutlineSegment, SegmentDraft
from .state import TranscriptState
__all__ = [
"Outline",
"OutlineSegment",
"SegmentDraft",
"TranscriptConfig",
"TranscriptState",
"build_transcript_graph",
]

View file

@ -0,0 +1,26 @@
"""Configurable inputs for the transcript-drafting graph."""
from __future__ import annotations
from dataclasses import dataclass, fields
from langchain_core.runnables import RunnableConfig
from app.podcasts.schemas import PodcastSpec
@dataclass(kw_only=True)
class TranscriptConfig:
"""The approved spec and user focus that drive drafting."""
search_space_id: int
spec: PodcastSpec
focus: str | None = None
@classmethod
def from_runnable_config(
cls, config: RunnableConfig | None = None
) -> TranscriptConfig:
configurable = (config.get("configurable") or {}) if config else {}
names = {f.name for f in fields(cls) if f.init}
return cls(**{k: v for k, v in configurable.items() if k in names})

View file

@ -0,0 +1,29 @@
"""The transcript-drafting graph: outline, draft segments, finalize."""
from __future__ import annotations
from langgraph.graph import StateGraph
from .config import TranscriptConfig
from .nodes import draft_segments, finalize, plan_outline
from .state import TranscriptState
def build_transcript_graph():
    """Compile the linear graph: plan_outline -> draft_segments -> finalize."""
    builder = StateGraph(TranscriptState, config_schema=TranscriptConfig)

    steps = (
        ("plan_outline", plan_outline),
        ("draft_segments", draft_segments),
        ("finalize", finalize),
    )
    for name, node in steps:
        builder.add_node(name, node)

    # Chain the steps in order between the graph's start and end markers.
    chain = ["__start__", *(name for name, _ in steps), "__end__"]
    for source, target in zip(chain, chain[1:]):
        builder.add_edge(source, target)

    compiled = builder.compile()
    compiled.name = "Surfsense Podcast Transcript"
    return compiled
graph = build_transcript_graph()

View file

@ -0,0 +1,127 @@
"""Transcript-drafting nodes: plan an outline, draft each beat, then assemble.
Long-form is produced beat-by-beat: a single call plans the structure, then each
segment is drafted on its own with a recap of what came before so the script
stays coherent without holding the whole episode in one context window.
"""
from __future__ import annotations
from typing import Any
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.runnables import RunnableConfig
from app.podcasts.schemas import PodcastSpec, Transcript, TranscriptTurn
from app.services.llm_service import get_agent_llm
from ..prompts import draft_segment_prompt, plan_outline_prompt
from ..structured import invoke_json
from .config import TranscriptConfig
from .planning import Outline, OutlineSegment, SegmentDraft
from .state import TranscriptState
# Average speaking rate; converts target minutes to a target word count.
_WORDS_PER_MINUTE = 150
# Rough words per outline segment, used to suggest how many segments to plan.
_WORDS_PER_SEGMENT = 250
# Cap on source text sent per LLM call to bound tokens on large sources.
_SOURCE_BUDGET_CHARS = 12000
# How much prior dialogue to recap into each segment for continuity.
_RECAP_CHARS = 800
async def plan_outline(
    state: TranscriptState, config: RunnableConfig
) -> dict[str, Any]:
    """Plan the segment structure sized to the spec's target duration.

    Returns a state update of the form ``{"outline": <Outline>}``.
    """
    settings = TranscriptConfig.from_runnable_config(config)
    llm = await _require_llm(state, settings)

    # Minutes -> words via the speaking rate, then words -> segment count.
    word_budget = round(settings.spec.duration.midpoint_minutes * _WORDS_PER_MINUTE)
    segment_hint = max(1, round(word_budget / _WORDS_PER_SEGMENT))

    instructions = plan_outline_prompt(
        spec=settings.spec,
        target_words=word_budget,
        suggested_segments=segment_hint,
        focus=settings.focus,
    )
    conversation = [
        SystemMessage(content=instructions),
        HumanMessage(content=_source_block(state.source_content)),
    ]
    planned = await invoke_json(llm, conversation, Outline)
    return {"outline": planned}
async def draft_segments(
    state: TranscriptState, config: RunnableConfig
) -> dict[str, Any]:
    """Draft the outline one segment at a time, in order.

    Each segment gets its own LLM call, primed with a recap of the dialogue
    drafted so far. Turns attributed to a slot the spec does not define are
    discarded before they join the running script.

    Returns:
        A state update holding the accumulated ``drafted_turns``.

    Raises:
        RuntimeError: If no agent LLM is configured, or the state carries no
            outline.
    """
    settings = TranscriptConfig.from_runnable_config(config)
    model = await _require_llm(state, settings)

    plan = state.outline
    if plan is None:
        raise RuntimeError("draft_segments requires an outline")

    source_text = _source_block(state.source_content)
    segment_count = len(plan.segments)
    script: list[TranscriptTurn] = []

    for position, segment in enumerate(plan.segments, start=1):
        instructions = draft_segment_prompt(
            spec=settings.spec,
            segment=segment,
            position=position,
            total=segment_count,
            # The recap only reflects segments drafted before this one.
            recap=_recap(script, settings.spec),
        )
        draft = await invoke_json(
            model,
            [SystemMessage(content=instructions), HumanMessage(content=source_text)],
            SegmentDraft,
        )
        script.extend(_valid_turns(draft, settings.spec))

    return {"drafted_turns": script}
def finalize(state: TranscriptState, config: RunnableConfig) -> dict[str, Any]:
    """Wrap the drafted turns in a validated :class:`Transcript`.

    ``config`` is accepted for the graph-node signature but not read here.

    Raises:
        RuntimeError: If drafting left no turns to assemble.
    """
    turns = state.drafted_turns
    if turns:
        return {"transcript": Transcript(turns=turns)}
    raise RuntimeError("drafting produced no usable dialogue")
async def _require_llm(state: TranscriptState, tc: TranscriptConfig):
    """Fetch the search space's agent LLM, failing loudly when none exists.

    Raises:
        RuntimeError: If ``get_agent_llm`` returns ``None``.
    """
    llm = await get_agent_llm(state.db_session, tc.search_space_id)
    if llm is not None:
        return llm
    raise RuntimeError(
        f"no agent LLM configured for search space {tc.search_space_id}"
    )
def _source_block(source_content: str) -> str:
    """Wrap the source text, cut to the character budget, in delimiter tags.

    A falsy ``source_content`` is treated as an empty string.
    """
    text = source_content if source_content else ""
    clipped = text[:_SOURCE_BUDGET_CHARS]
    return f"<source_content>{clipped}</source_content>"
def _valid_turns(draft: SegmentDraft, spec: PodcastSpec) -> list[TranscriptTurn]:
    """Keep only the turns whose speaker slot is defined in ``spec``.

    A turn the model attributed to an unknown slot is dropped here so a stray
    attribution cannot break rendering downstream. Order is preserved.
    """
    known_slots = frozenset(speaker.slot for speaker in spec.speakers)
    kept: list[TranscriptTurn] = []
    for turn in draft.turns:
        if turn.speaker in known_slots:
            kept.append(turn)
    return kept
def _recap(turns: list[TranscriptTurn], spec: PodcastSpec) -> str | None:
    """Render the tail of the dialogue so far as ``Name: text`` lines.

    Returns ``None`` when nothing has been drafted yet. A turn whose slot has
    no matching speaker is labelled with the raw slot value. Only the last
    ``_RECAP_CHARS`` characters of the rendered dialogue are kept.
    """
    if not turns:
        return None

    display_names = {speaker.slot: speaker.name for speaker in spec.speakers}
    lines = []
    for turn in turns:
        label = display_names.get(turn.speaker, turn.speaker)
        lines.append(f"{label}: {turn.text}")

    dialogue = "\n".join(lines)
    return dialogue[-_RECAP_CHARS:]

View file

@ -0,0 +1,32 @@
"""Internal shapes the transcript graph passes between its nodes.
These are generation-time artifacts (the outline and per-segment drafts), not
persisted or API-facing. Segment drafts reuse :class:`TranscriptTurn` so the
speaker-slot contract and turn validation are identical to the final transcript.
"""
from __future__ import annotations
from pydantic import BaseModel, Field
from app.podcasts.schemas import TranscriptTurn
class OutlineSegment(BaseModel):
    """One planned beat of the conversation, drafted independently."""

    # Label for the beat; must be non-empty.
    title: str = Field(..., min_length=1)
    # Optional list of talking points; defaults to an empty list.
    talking_points: list[str] = Field(default_factory=list)
    # Word budget for the beat; must be at least 1.
    target_words: int = Field(..., ge=1)
class Outline(BaseModel):
    """The full plan: ordered segments sized to the target duration."""

    # At least one segment is required; an empty plan fails validation.
    segments: list[OutlineSegment] = Field(..., min_length=1)
class SegmentDraft(BaseModel):
    """The dialogue a single segment produced."""

    # May be empty: a draft with no turns is valid at this level.
    turns: list[TranscriptTurn] = Field(default_factory=list)

View file

@ -0,0 +1,22 @@
"""Mutable state threaded through the transcript-drafting graph."""
from __future__ import annotations
from dataclasses import dataclass, field
from sqlalchemy.ext.asyncio import AsyncSession
from app.podcasts.schemas import Transcript, TranscriptTurn
from .planning import Outline
@dataclass
class TranscriptState:
    """Source content plus the intermediate and final drafting artifacts."""

    # Async DB session handed to the graph; the two required fields come first.
    db_session: AsyncSession
    # Raw source material the episode is drafted from.
    source_content: str
    # The planned outline; None until one has been produced.
    outline: Outline | None = None
    # Dialogue turns accumulated while drafting; starts empty.
    drafted_turns: list[TranscriptTurn] = field(default_factory=list)
    # The assembled transcript; None until it has been built.
    transcript: Transcript | None = None

View file

@ -0,0 +1,9 @@
"""Models, enums, and data access for the podcasts table."""
from __future__ import annotations
from .enums import PodcastStatus
from .models import Podcast
from .repository import PodcastRepository
__all__ = ["Podcast", "PodcastRepository", "PodcastStatus"]

View file

@ -0,0 +1,7 @@
"""Enums for the podcasts table."""
from __future__ import annotations
from .podcast_status import PodcastStatus
__all__ = ["PodcastStatus"]

View file

@ -0,0 +1,42 @@
"""Podcast generation lifecycle.
The status drives a guarded state machine. A podcast is proposed (``PENDING``),
gets a reviewable brief (``AWAITING_BRIEF``), is drafted into a transcript
(``DRAFTING``), then rendered to audio (``RENDERING`` → ``READY``). ``FAILED``
and ``CANCELLED`` are terminal; a ``READY`` episode can be sent back to the
brief gate for regeneration, and an in-flight regeneration can be reverted to
``READY`` while the previous audio still exists. ``AWAITING_REVIEW`` is
retained for legacy rows but
never entered anymore — the brief is the only approval gate. The Python enum is
kept in lockstep with the ``podcast_status`` Postgres type via its paired
migration.
"""
from __future__ import annotations
from enum import StrEnum
class PodcastStatus(StrEnum):
PENDING = "pending"
AWAITING_BRIEF = "awaiting_brief"
DRAFTING = "drafting"
AWAITING_REVIEW = "awaiting_review"
RENDERING = "rendering"
READY = "ready"
FAILED = "failed"
CANCELLED = "cancelled"
@property
def is_terminal(self) -> bool:
"""Whether no further transition is possible from this state."""
return self in _TERMINAL
@property
def is_gate(self) -> bool:
"""Whether this state waits on user input before proceeding."""
return self in _GATES
_TERMINAL = frozenset({PodcastStatus.FAILED, PodcastStatus.CANCELLED})
_GATES = frozenset({PodcastStatus.AWAITING_BRIEF, PodcastStatus.AWAITING_REVIEW})

View file

@ -0,0 +1,82 @@
"""``podcasts`` table: a generated podcast, its brief, transcript, and state."""
from __future__ import annotations
from sqlalchemy import (
Column,
Enum as SQLAlchemyEnum,
ForeignKey,
Integer,
String,
Text,
)
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import relationship
from app.db import BaseModel, TimestampMixin
from .enums import PodcastStatus
class Podcast(BaseModel, TimestampMixin):
    """A podcast across its whole lifecycle: brief, transcript, audio, status.

    ``spec`` (the reviewable brief) and ``podcast_transcript`` are JSONB so the
    flexible Pydantic shapes can evolve without migrations. ``spec_version``
    backs optimistic concurrency on brief edits. Rendered audio lives in the
    object store, addressed by ``storage_backend`` + ``storage_key`` rather than
    a raw path.
    """

    __tablename__ = "podcasts"

    title = Column(String(500), nullable=False)
    # Lifecycle state. ``create_type=False`` leaves creation of the
    # ``podcast_status`` type to the migrations; ``values_callable`` persists the
    # enum *values* ("pending", ...) rather than the member names.
    status = Column(
        SQLAlchemyEnum(
            PodcastStatus,
            name="podcast_status",
            create_type=False,
            values_callable=lambda x: [e.value for e in x],
        ),
        nullable=False,
        default=PodcastStatus.PENDING,
        server_default=PodcastStatus.PENDING.value,
        index=True,
    )

    # The source material the episode is generated from. Persisted because
    # drafting happens after the brief gate, long after creation.
    source_content = Column(Text, nullable=True)

    # The reviewable brief (PodcastSpec); null until the brief gate is reached.
    spec = Column(JSONB, nullable=True)
    # Bumped on every spec edit; guards concurrent edits at the brief gate.
    spec_version = Column(Integer, nullable=False, default=1, server_default="1")

    # The drafted dialogue (Transcript); null until drafting completes.
    podcast_transcript = Column(JSONB, nullable=True)

    # Where the rendered audio lives in the object store; null until READY.
    storage_backend = Column(String(32), nullable=True)
    storage_key = Column(Text, nullable=True)
    duration_seconds = Column(Integer, nullable=True)

    # Human-readable reason when status is FAILED.
    error = Column(Text, nullable=True)

    # Legacy local audio path; retained for back-compat until cutover.
    file_location = Column(Text, nullable=True)

    # Owning search space; rows are deleted with it (ON DELETE CASCADE).
    search_space_id = Column(
        Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False
    )
    search_space = relationship("SearchSpace", back_populates="podcasts")

    # Optional originating chat thread; set to NULL (not cascaded) if the
    # thread is deleted.
    thread_id = Column(
        Integer,
        ForeignKey("new_chat_threads.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )
    thread = relationship("NewChatThread")

View file

@ -0,0 +1,46 @@
"""Data access for the ``podcasts`` table.
A thin async repository so the service and tasks never write raw queries. It
only loads and persists rows; lifecycle rules and (de)serialization live in the
service.
"""
from __future__ import annotations
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from .models import Podcast
class PodcastRepository:
    """Session-scoped loader and writer for :class:`Podcast` rows."""

    def __init__(self, session: AsyncSession) -> None:
        self._session = session

    async def get(self, podcast_id: int) -> Podcast | None:
        """Look a podcast up by primary key; ``None`` when it does not exist."""
        row = await self._session.get(Podcast, podcast_id)
        return row

    async def add(self, podcast: Podcast) -> Podcast:
        """Stage a new row and flush so its primary key is assigned."""
        session = self._session
        session.add(podcast)
        await session.flush()
        return podcast

    async def latest_with_spec(self, search_space_id: int) -> Podcast | None:
        """Newest podcast in the space that has a stored brief, if any.

        Used to seed language/voice defaults for a new podcast from what the
        user chose last.
        """
        in_space = Podcast.search_space_id == search_space_id
        has_brief = Podcast.spec.is_not(None)
        newest_first = Podcast.created_at.desc()

        query = select(Podcast).where(in_space, has_brief)
        query = query.order_by(newest_first).limit(1)

        rows = await self._session.execute(query)
        return rows.scalars().first()

View file

@ -0,0 +1,12 @@
"""Rendering: synthesise and merge an approved transcript into audio.
The :class:`PodcastRenderer` is the public entry point; the segment cache and
FFmpeg merge are implementation details it owns.
"""
from __future__ import annotations
from .errors import RenderError
from .renderer import PodcastRenderer, RenderedPodcast
__all__ = ["PodcastRenderer", "RenderError", "RenderedPodcast"]

View file

@ -0,0 +1,53 @@
"""Content-addressed cache for synthesised segments.
Each segment's audio is keyed by everything that determines its bytes (voice,
language, speed, text). Keeping the cache in a stable per-podcast directory
makes re-renders cheap: changing one speaker's voice only misses that speaker's
turns, and a worker restart mid-render resumes from whatever was already
written. The key intentionally excludes the segment's position so identical
lines (e.g. repeated "Right.") synthesise once.
"""
from __future__ import annotations
import hashlib
import json
from pathlib import Path
from app.podcasts.tts import SynthesisRequest
class SegmentCache:
    """On-disk store of segment audio, addressed by request content hash.

    Entries are files named ``<key>.<container>`` directly under ``root``.
    Writes are atomic, so a file that exists is always a complete segment.
    """

    def __init__(self, root: Path) -> None:
        """Use ``root`` as the cache directory, creating it if needed."""
        self._root = root
        self._root.mkdir(parents=True, exist_ok=True)

    def key(self, request: SynthesisRequest) -> str:
        """A stable hash of the inputs that determine the synthesised bytes.

        Only voice, language, speed and text are hashed; the segment's position
        is deliberately excluded so identical lines share one entry.
        """
        material = json.dumps(
            {
                "voice": request.voice,
                "language": request.language,
                "speed": request.speed,
                "text": request.text,
            },
            sort_keys=True,
            ensure_ascii=True,
        )
        return hashlib.sha256(material.encode("utf-8")).hexdigest()

    def path(self, key: str, container: str) -> Path:
        """Where the entry for ``key`` lives, whether or not it exists yet."""
        return self._root / f"{key}.{container}"

    def get(self, key: str, container: str) -> Path | None:
        """Return the cached segment path, or ``None`` on a miss."""
        path = self.path(key, container)
        return path if path.exists() else None

    def put(self, key: str, container: str, data: bytes) -> Path:
        """Write ``data`` for ``key`` atomically and return its path.

        The bytes go to a uniquely named sibling temp file first and are then
        renamed over the final name. A crash mid-write therefore leaves only a
        stray ``.tmp`` file, never a truncated entry that ``get`` would later
        serve as a hit.
        """
        import uuid  # local import: only needed for the temp-file name

        final_path = self.path(key, container)
        staging = final_path.with_name(f"{final_path.name}.{uuid.uuid4().hex}.tmp")
        try:
            staging.write_bytes(data)
            # Same-directory rename: atomic replacement of any existing entry.
            staging.replace(final_path)
        except BaseException:
            staging.unlink(missing_ok=True)
            raise
        return final_path

View file

@ -0,0 +1,11 @@
"""Failures raised while rendering a transcript to audio."""
from __future__ import annotations
class RenderError(RuntimeError):
    """A transcript could not be turned into a final audio file.

    Both per-segment synthesis failures and merge failures are raised as this
    one type, so the render task handles a single failure class no matter
    which step broke.
    """

View file

@ -0,0 +1,48 @@
"""Concatenate ordered segment files into a single MP3.
Uses FFmpeg's concat *demuxer* (a list file of inputs) rather than a
``filter_complex`` graph. The demuxer takes one ``-i`` no matter how many
segments there are, so an hour-long episode with thousands of turns never hits
command-line length limits. Output is always re-encoded to MP3 for a uniform
artifact regardless of the source container (Kokoro WAV or hosted MP3).
"""
from __future__ import annotations
from pathlib import Path
from ffmpeg.asyncio import FFmpeg
from .errors import RenderError
async def concat_to_mp3(segment_paths: list[Path], output_path: Path) -> None:
    """Concatenate ``segment_paths``, in order, into one MP3 at ``output_path``.

    A temporary concat list file is written next to the output, fed to FFmpeg's
    concat demuxer, and removed afterwards whether or not the merge succeeded.

    Raises:
        RenderError: If there are no segments, or FFmpeg fails for any reason.
    """
    if not segment_paths:
        raise RenderError("cannot merge an empty list of segments")

    manifest = output_path.with_name(f"{output_path.stem}.concat.txt")
    manifest.write_text(_concat_list(segment_paths), encoding="utf-8")
    try:
        command = FFmpeg().option("y")
        command = command.input(str(manifest), f="concat", safe=0)
        command = command.output(str(output_path), {"c:a": "libmp3lame"})
        await command.execute()
    except Exception as exc:  # noqa: BLE001 - normalise ffmpeg failures
        raise RenderError(f"audio merge failed: {exc}") from exc
    finally:
        manifest.unlink(missing_ok=True)
def _concat_list(segment_paths: list[Path]) -> str:
    """Build the concat-demuxer list file: one ``file '<path>'`` line per input.

    Paths are resolved to absolute form, and single quotes inside a path are
    escaped per the demuxer's quoting rules ('\\''). The result always ends
    with a newline.
    """

    def entry(path: Path) -> str:
        quoted = str(path.resolve()).replace("'", "'\\''")
        return f"file '{quoted}'"

    return "\n".join([entry(path) for path in segment_paths]) + "\n"

View file

@ -0,0 +1,157 @@
"""Render an approved transcript into a single podcast audio file.
The renderer is the only place that turns dialogue into sound. It maps each
turn to its speaker's voice, synthesises segments concurrently (capped, served
from the segment cache when possible, and coalesced so identical lines render
once), then merges them in order. It takes a settled spec + transcript and
returns bytes; persistence and lifecycle transitions belong to the service.
"""
from __future__ import annotations
import asyncio
from dataclasses import dataclass
from pathlib import Path
from app.podcasts.schemas import PodcastSpec, Transcript, TranscriptTurn
from app.podcasts.tts import SynthesisRequest, TextToSpeech, TextToSpeechError
from app.podcasts.voices import VoiceCatalog
from .cache import SegmentCache
from .errors import RenderError
from .merge import concat_to_mp3
# Bounds how many segments synthesise at once. Protects hosted-provider rate
# limits and avoids thrashing the local Kokoro pipeline; the renderer is I/O- or
# model-bound per segment, so a small pool already saturates throughput.
DEFAULT_MAX_CONCURRENCY = 4
# File name of the merged episode inside the render working directory.
_MERGED_FILENAME = "podcast.mp3"
@dataclass(frozen=True, slots=True)
class RenderedPodcast:
    """The finished episode: encoded bytes plus their container."""

    # The complete encoded audio file contents.
    data: bytes
    # Container/format label for ``data``; the renderer sets it to "mp3".
    container: str
class PodcastRenderer:
    """Synthesises and merges a transcript using one TTS provider."""

    def __init__(
        self,
        *,
        tts: TextToSpeech,
        catalog: VoiceCatalog,
        max_concurrency: int = DEFAULT_MAX_CONCURRENCY,
    ) -> None:
        """Bind the renderer to a TTS provider and voice catalog.

        Args:
            tts: Provider that turns one synthesis request into audio.
            catalog: Resolves a speaker's ``voice_id`` to a catalog voice.
            max_concurrency: Upper bound on segments synthesised at once.

        Raises:
            ValueError: If ``max_concurrency`` is below 1.
        """
        # A zero-permit semaphore would make every render wait forever, and a
        # negative one only fails once rendering starts; reject both up front.
        if max_concurrency < 1:
            raise ValueError("max_concurrency must be >= 1")
        self._tts = tts
        self._catalog = catalog
        self._max_concurrency = max_concurrency

    async def render(
        self,
        *,
        spec: PodcastSpec,
        transcript: Transcript,
        workdir: Path,
    ) -> RenderedPodcast:
        """Produce the merged MP3 for ``transcript`` under ``spec``.

        ``workdir`` holds the segment cache and merge output; reusing the same
        directory across renders is what makes voice edits cheap.

        Raises:
            RenderError: If a turn references an unknown speaker slot or voice,
                a segment fails to synthesise, or the merge fails.
        """
        cache = SegmentCache(workdir / "segments")
        # Build every request before synthesising, so an unknown slot or voice
        # fails the render before any TTS call is made.
        requests = [self._request_for(spec, turn) for turn in transcript.turns]

        # Concurrency primitives are created per render so each call is bound to
        # the event loop running it (Celery tasks may use a fresh loop).
        synthesizer = _SegmentSynthesizer(self._tts, cache, self._max_concurrency)
        # gather returns results in request order, which is transcript order.
        segment_paths = await asyncio.gather(
            *(synthesizer.segment(request) for request in requests)
        )

        output_path = workdir / _MERGED_FILENAME
        await concat_to_mp3(list(segment_paths), output_path)
        return RenderedPodcast(data=output_path.read_bytes(), container="mp3")

    def _request_for(
        self, spec: PodcastSpec, turn: TranscriptTurn
    ) -> SynthesisRequest:
        """Map one turn to a synthesis request using its speaker's voice.

        Raises:
            RenderError: If the turn's slot has no speaker in ``spec``, or the
                speaker's voice id is not in the catalog.
        """
        try:
            speaker = spec.speaker_for(turn.speaker)
        except KeyError as exc:
            raise RenderError(
                f"transcript references unknown speaker slot {turn.speaker}"
            ) from exc
        try:
            voice = self._catalog.get(speaker.voice_id)
        except KeyError as exc:
            raise RenderError(f"unknown voice {speaker.voice_id!r}") from exc
        return SynthesisRequest(
            text=turn.text, voice=voice.native_ref, language=spec.language
        )
class _SegmentSynthesizer:
    """Per-render synthesis coordinator: caps concurrency and dedupes work.

    Beyond the on-disk cache (which serves cross-render reuse), this coalesces
    identical segments that race within one render so the same line is voiced
    once even when several turns request it simultaneously.
    """

    def __init__(
        self, tts: TextToSpeech, cache: SegmentCache, max_concurrency: int
    ) -> None:
        """Create the coordinator for one render.

        Args:
            tts: Provider used for cache misses.
            cache: On-disk segment cache consulted before and written after
                synthesis.
            max_concurrency: Number of segments allowed to synthesise at once.
        """
        self._tts = tts
        self._cache = cache
        # Container used for cache lookups; read once from the provider.
        self._container = tts.container
        self._semaphore = asyncio.Semaphore(max_concurrency)
        # Cache key -> future shared by every caller currently waiting on it.
        self._inflight: dict[str, asyncio.Future[Path]] = {}
        self._inflight_lock = asyncio.Lock()

    async def segment(self, request: SynthesisRequest) -> Path:
        """Return the audio file for ``request``, synthesising at most once.

        The first caller for a given key becomes the owner and does the work;
        concurrent callers for the same key wait on the owner's future and
        receive the same path, or the same exception.
        """
        key = self._cache.key(request)
        cached = self._cache.get(key, self._container)
        if cached is not None:
            return cached

        async with self._inflight_lock:
            future = self._inflight.get(key)
            owner = future is None
            if owner:
                # We are inside a coroutine, so a loop is running;
                # get_running_loop() is the preferred accessor here.
                future = asyncio.get_running_loop().create_future()
                self._inflight[key] = future

        # The owner runs the work and publishes the outcome on the shared future;
        # every caller (owner included) reads it back via ``await future`` so the
        # result is retrieved exactly once-or-more and never left dangling.
        if owner:
            try:
                path = await self._synthesize(request, key)
            except BaseException as exc:  # noqa: BLE001 - relayed to all waiters
                future.set_exception(exc)
            else:
                future.set_result(path)
            finally:
                await self._forget(key)
        return await future

    async def _synthesize(self, request: SynthesisRequest, key: str) -> Path:
        """Synthesise one segment under the concurrency cap and cache it.

        Raises:
            RenderError: If the provider raises ``TextToSpeechError``.
        """
        async with self._semaphore:
            # Re-check after waiting for a permit: the entry may have been
            # written while this call was queued.
            cached = self._cache.get(key, self._container)
            if cached is not None:
                return cached
            try:
                audio = await self._tts.synthesize(request)
            except TextToSpeechError as exc:
                raise RenderError(f"segment synthesis failed: {exc}") from exc
            return self._cache.put(key, audio.container, audio.data)

    async def _forget(self, key: str) -> None:
        """Drop ``key`` from the in-flight table once its outcome is published."""
        async with self._inflight_lock:
            self._inflight.pop(key, None)

View file

@ -0,0 +1,27 @@
"""Resolution: deterministic default chains for a fresh brief.
Turns the user's last-used preferences into concrete language and voice
defaults, so the brief gate opens pre-filled and most users approve without
editing.
"""
from __future__ import annotations
from .language import (
DEFAULT_LANGUAGE,
DEFAULT_LANGUAGE_CHAIN,
LanguageContext,
LanguageResolver,
resolve_language,
)
from .voices import VoiceResolutionError, resolve_voices
__all__ = [
"DEFAULT_LANGUAGE",
"DEFAULT_LANGUAGE_CHAIN",
"LanguageContext",
"LanguageResolver",
"VoiceResolutionError",
"resolve_language",
"resolve_voices",
]

View file

@ -0,0 +1,64 @@
"""Resolve the brief's language without spending tokens at the gate.
The chain mirrors the agreed policy: reuse the language the user last chose, and
otherwise default to English (which the user can still override in the brief). We
deliberately never guess the language from the source content — proposing a
language the user did not ask for is worse than a predictable default.
"""
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass
# Fallback language tag: what a brand-new user with no signal gets, and what
# every chain ends on.
DEFAULT_LANGUAGE = "en"
@dataclass(frozen=True, slots=True)
class LanguageContext:
    """Signals available when proposing a language for a fresh podcast."""

    # Language of the user's previous podcast; None when there is no such signal.
    last_used: str | None = None
class LanguageResolver(ABC):
    """One step in the language fallback chain.

    Subclasses implement :meth:`resolve`; the chain is walked in order and the
    first resolver to yield a language wins.
    """

    @abstractmethod
    def resolve(self, context: LanguageContext) -> str | None:
        """Return a language tag, or ``None`` to defer to the next resolver."""
class LastUsedLanguage(LanguageResolver):
    """Reuse the language from the user's previous podcast."""

    def resolve(self, context: LanguageContext) -> str | None:
        """Return ``context.last_used`` unchanged; ``None`` defers to the next step."""
        return context.last_used
class DefaultLanguage(LanguageResolver):
    """Terminal step: always yields the default so the chain never fails."""

    def resolve(self, context: LanguageContext) -> str | None:
        """Return ``DEFAULT_LANGUAGE`` regardless of ``context``."""
        return DEFAULT_LANGUAGE
# Order encodes the policy; prepend stronger signals here as they appear.
# Current policy: the last-used language first, then the fixed default.
DEFAULT_LANGUAGE_CHAIN: tuple[LanguageResolver, ...] = (
    LastUsedLanguage(),
    DefaultLanguage(),
)
def resolve_language(
    context: LanguageContext,
    chain: tuple[LanguageResolver, ...] = DEFAULT_LANGUAGE_CHAIN,
) -> str:
    """Walk ``chain`` and return the first usable language a resolver yields.

    A resolver's answer is stripped of surrounding whitespace. ``None``, an
    empty string, or a whitespace-only string all count as "no answer" and
    defer to the next resolver, so the result is never blank.

    Args:
        context: Signals the resolvers may consult.
        chain: Resolvers to try, strongest signal first.

    Returns:
        The first non-blank language, or ``DEFAULT_LANGUAGE`` if no resolver
        produced one.
    """
    for resolver in chain:
        language = resolver.resolve(context)
        if not language:
            continue
        # Strip before deciding: a whitespace-only value is truthy but would
        # come out as "" if returned as-is.
        cleaned = language.strip()
        if cleaned:
            return cleaned
    # The default resolver guarantees a value; this guards a misconfigured chain.
    return DEFAULT_LANGUAGE

View file

@ -0,0 +1,79 @@
"""Assign a default voice to each speaker for the resolved language.
The default chain reuses the user's previously chosen voices where they are
still valid for the new language/provider, then fills any remaining speakers
with distinct catalog voices (preferring an unused gender so a two-speaker
episode sounds like two people). The user can override any of these in the
brief; this only seeds sensible defaults so most briefs need no edits.
"""
from __future__ import annotations
from collections.abc import Sequence
from app.podcasts.voices import CatalogVoice, TtsProvider, VoiceCatalog
class VoiceResolutionError(RuntimeError):
    """Raised when the catalog has no voice for the provider/language pair."""
def resolve_voices(
    *,
    catalog: VoiceCatalog,
    provider: TtsProvider,
    language: str,
    speaker_count: int,
    preferred: Sequence[str] | None = None,
) -> list[CatalogVoice]:
    """Choose a default voice for every speaker slot, in slot order.

    ``preferred`` holds the user's last-used voice ids by slot. A preferred id
    is honoured only if it is available for this provider/language and no
    earlier slot already took it; otherwise the slot gets a distinct catalog
    voice.

    Raises:
        ValueError: If ``speaker_count`` is below 1.
        VoiceResolutionError: If the catalog has no voice for the
            provider/language pair.
    """
    if speaker_count < 1:
        raise ValueError("speaker_count must be >= 1")

    candidates = catalog.for_language(provider, language)
    if not candidates:
        raise VoiceResolutionError(
            f"{provider.value} has no voice for language {language!r}"
        )

    wishes = preferred or ()
    lookup = {voice.voice_id: voice for voice in candidates}

    chosen: list[CatalogVoice] = []
    taken_ids: set[str] = set()
    taken_genders: set = set()

    for slot in range(speaker_count):
        wanted = wishes[slot] if slot < len(wishes) else None
        reusable = bool(wanted) and wanted in lookup and wanted not in taken_ids
        if reusable:
            voice = lookup[wanted]
        else:
            voice = _pick_distinct(candidates, taken_ids, taken_genders)
        chosen.append(voice)
        taken_ids.add(voice.voice_id)
        taken_genders.add(voice.gender)

    return chosen
def _pick_distinct(
    available: list[CatalogVoice],
    used_ids: set[str],
    used_genders: set,
) -> CatalogVoice:
    """Choose the next voice for a slot that has no reusable preference.

    Preference order: an unused voice of a gender not yet taken, then any
    unused voice, then (when every voice is already used) the first catalog
    voice, so a slot is always filled.
    """
    unused = [voice for voice in available if voice.voice_id not in used_ids]
    if not unused:
        return available[0]
    new_gender = (voice for voice in unused if voice.gender not in used_genders)
    return next(new_gender, unused[0])

View file

@ -0,0 +1,24 @@
"""Pydantic shapes for the podcast brief and transcript."""
from __future__ import annotations
from .spec import (
DurationTarget,
PodcastSpec,
PodcastStyle,
SpeakerRole,
SpeakerSpec,
normalize_language_tag,
)
from .transcript import Transcript, TranscriptTurn
__all__ = [
"DurationTarget",
"PodcastSpec",
"PodcastStyle",
"SpeakerRole",
"SpeakerSpec",
"Transcript",
"TranscriptTurn",
"normalize_language_tag",
]

View file

@ -0,0 +1,164 @@
"""The brief: the editable configuration a user approves before drafting.
A :class:`PodcastSpec` front-loads every decision that drives token or audio
cost (language, speakers, voices, style, target length) so the expensive
drafting and rendering steps run once against settled inputs. It is stored as
JSONB on the ``podcasts`` row and round-trips through the review API.
"""
from __future__ import annotations
import re
from enum import StrEnum
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
# A speaker count beyond this is almost never a real podcast and explodes the
# voice/turn-attribution space, so we reject it at the brief gate.
MAX_SPEAKERS = 6
# Long-form is a goal, but an open-ended upper bound invites runaway TTS bills.
# One day of audio (24 * 60 = 1440 minutes) is a generous ceiling that still
# blocks obvious mistakes.
MAX_DURATION_MINUTES = 24 * 60
# Shape accepted for a language tag: a 2-3 letter primary subtag followed by
# any number of hyphen-separated alphanumeric subtags of 2-8 characters
# (e.g. ``en``, ``en-US``, ``pt-BR``). Deliberately permissive: the voice
# catalog, not the brief, decides which languages can actually be synthesised.
# Casing is normalised after matching.
_LANGUAGE_TAG = re.compile(r"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$")


def normalize_language_tag(value: str) -> str:
    """Check a language tag's shape and lowercase its primary subtag.

    Surrounding whitespace is ignored. Everything after the first hyphen is
    returned exactly as given. Shared with the generation layer so resolved
    and user-entered languages are normalised identically before they reach a
    :class:`PodcastSpec`.

    Raises:
        ValueError: If the stripped value does not look like a language tag.
    """
    cleaned = value.strip()
    if _LANGUAGE_TAG.match(cleaned) is None:
        raise ValueError(f"not a valid BCP-47 language tag: {value!r}")
    if "-" not in cleaned:
        return cleaned.lower()
    primary, rest = cleaned.split("-", 1)
    return f"{primary.lower()}-{rest}"
class SpeakerRole(StrEnum):
    """How a speaker functions in the conversation, used to steer drafting."""

    # Member values are the lowercase strings a serialised brief carries.
    HOST = "host"
    COHOST = "cohost"
    GUEST = "guest"
    EXPERT = "expert"
    NARRATOR = "narrator"
class PodcastStyle(StrEnum):
    """The conversational format the transcript should follow."""

    # Member values are the lowercase strings a serialised brief carries.
    CONVERSATIONAL = "conversational"
    INTERVIEW = "interview"
    DEBATE = "debate"
    # A brief with this style is only valid with exactly one speaker.
    MONOLOGUE = "monologue"
    NARRATIVE = "narrative"
class SpeakerSpec(BaseModel):
    """One voice in the podcast: who they are and which TTS voice renders them.

    ``slot`` is the stable join key. Transcript turns reference a speaker by
    ``slot`` and the renderer resolves ``voice_id`` for that same slot, so the
    two never drift even if speakers are reordered in the brief.
    """

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    slot: int = Field(..., ge=0, description="Stable index a transcript turn references")
    name: str = Field(..., min_length=1, max_length=120)
    role: SpeakerRole
    voice_id: str = Field(
        ...,
        min_length=1,
        description="Catalog voice id valid for the spec's language and provider",
    )

    @field_validator("name", "voice_id")
    @classmethod
    def _strip_required_text(cls, value: str) -> str:
        # min_length alone lets "   " through; trim, then insist something is left.
        trimmed = value.strip()
        if trimmed:
            return trimmed
        raise ValueError("must not be blank")
class DurationTarget(BaseModel):
    """The desired finished length as an inclusive minute range.

    Drafting aims for the midpoint and treats the bounds as soft guardrails;
    storing a range (rather than a point) keeps long-form expectations honest
    without pretending we can hit an exact runtime.
    """

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    min_minutes: int = Field(..., ge=1, le=MAX_DURATION_MINUTES)
    max_minutes: int = Field(..., ge=1, le=MAX_DURATION_MINUTES)

    @model_validator(mode="after")
    def _check_order(self) -> DurationTarget:
        # Equal bounds are fine (a point target); only an inverted range fails.
        if self.min_minutes > self.max_minutes:
            raise ValueError("max_minutes must be >= min_minutes")
        return self

    @property
    def midpoint_minutes(self) -> float:
        """Halfway between the two bounds: the runtime drafting aims for."""
        span_total = self.min_minutes + self.max_minutes
        return span_total / 2
class PodcastSpec(BaseModel):
    """The full brief approved before any tokens or audio are spent."""

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    language: str = Field(..., description="BCP-47 tag, e.g. 'en', 'en-US', 'pt-BR'")
    style: PodcastStyle = PodcastStyle.CONVERSATIONAL
    speakers: list[SpeakerSpec] = Field(..., min_length=1, max_length=MAX_SPEAKERS)
    duration: DurationTarget
    focus: str | None = Field(
        default=None,
        max_length=2000,
        description="Optional user steer for what the episode should emphasise",
    )

    @field_validator("language")
    @classmethod
    def _normalise_language(cls, value: str) -> str:
        # Same normaliser the generation layer uses, so tags compare equal.
        return normalize_language_tag(value)

    @field_validator("focus")
    @classmethod
    def _blank_focus_is_none(cls, value: str | None) -> str | None:
        # Collapse "" and whitespace-only steers to "no focus given".
        if value is None:
            return None
        trimmed = value.strip()
        return trimmed if trimmed else None

    @model_validator(mode="after")
    def _check_speaker_slots(self) -> PodcastSpec:
        # Slots are the join key between turns and voices; a repeat is ambiguous.
        seen: set[int] = set()
        for speaker in self.speakers:
            if speaker.slot in seen:
                raise ValueError("speaker slots must be unique")
            seen.add(speaker.slot)
        return self

    @model_validator(mode="after")
    def _check_style_speakers(self) -> PodcastSpec:
        # One voice is what "monologue" means; letting extra speakers through
        # would force drafting to silently pick a winner.
        is_monologue = self.style is PodcastStyle.MONOLOGUE
        if is_monologue and len(self.speakers) != 1:
            raise ValueError("a monologue has exactly one speaker")
        return self

    def speaker_for(self, slot: int) -> SpeakerSpec:
        """Look up the speaker assigned to ``slot``.

        Raises:
            KeyError: If no speaker in the brief uses that slot.
        """
        match = next((s for s in self.speakers if s.slot == slot), None)
        if match is None:
            raise KeyError(f"no speaker for slot {slot}")
        return match

View file

@ -0,0 +1,41 @@
"""The transcript: ordered dialogue turns drafting produces for review.
A :class:`Transcript` is the reviewable artifact at the go/no-go gate and the
exact input the renderer turns into audio. Each turn names a speaker by the
``slot`` defined in the :class:`~app.podcasts.schemas.spec.PodcastSpec`, so the
renderer can resolve the right voice without re-attributing anything.
"""
from __future__ import annotations
from pydantic import BaseModel, ConfigDict, Field, field_validator
class TranscriptTurn(BaseModel):
    """A single spoken line by one speaker."""

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    speaker: int = Field(..., ge=0, description="The PodcastSpec speaker slot speaking")
    text: str = Field(..., min_length=1)

    @field_validator("text")
    @classmethod
    def _strip_text(cls, value: str) -> str:
        # min_length alone lets "   " through; trim, then insist something is left.
        trimmed = value.strip()
        if trimmed:
            return trimmed
        raise ValueError("turn text must not be blank")
class Transcript(BaseModel):
    """The full ordered dialogue for an episode."""

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    turns: list[TranscriptTurn] = Field(..., min_length=1)

    @property
    def word_count(self) -> int:
        """Whitespace-separated words across all turns, for runtime estimates."""
        total = 0
        for turn in self.turns:
            total += len(turn.text.split())
        return total

View file

@ -0,0 +1,255 @@
"""The podcast lifecycle authority: every status change goes through here.
The service owns the state machine. Each method names a real lifecycle step,
validates it against the allowed-transition table, and (de)serializes the brief
and transcript to/from their JSONB columns. It deliberately does not enqueue
Celery work — callers transition the row here, then schedule the next task so
the rules stay testable and free of task-queue coupling.
"""
from __future__ import annotations
from sqlalchemy.ext.asyncio import AsyncSession
from app.podcasts.persistence import Podcast, PodcastRepository, PodcastStatus
from app.podcasts.schemas import PodcastSpec, Transcript, TranscriptTurn
# Character limit for stored error text (name-based reading; the code that
# applies it is outside this view).
_MAX_ERROR_CHARS = 2000
# The only status changes the machine permits. Terminal states have no exits.
# Keys are the current status; each value is the set of statuses reachable from
# it in one step.
_ALLOWED: dict[PodcastStatus, frozenset[PodcastStatus]] = {
    PodcastStatus.PENDING: frozenset(
        {PodcastStatus.AWAITING_BRIEF, PodcastStatus.FAILED, PodcastStatus.CANCELLED}
    ),
    # The READY exits below exist for reverting a regeneration; the audio
    # guard for that lives in revert_regeneration.
    PodcastStatus.AWAITING_BRIEF: frozenset(
        {
            PodcastStatus.DRAFTING,
            PodcastStatus.READY,
            PodcastStatus.FAILED,
            PodcastStatus.CANCELLED,
        }
    ),
    PodcastStatus.DRAFTING: frozenset(
        {
            PodcastStatus.RENDERING,
            PodcastStatus.READY,
            PodcastStatus.FAILED,
            PodcastStatus.CANCELLED,
        }
    ),
    # Never entered anymore (the transcript gate was dropped); kept with exits
    # so legacy rows aren't stranded.
    PodcastStatus.AWAITING_REVIEW: frozenset(
        {PodcastStatus.AWAITING_BRIEF, PodcastStatus.FAILED, PodcastStatus.CANCELLED}
    ),
    PodcastStatus.RENDERING: frozenset(
        {PodcastStatus.READY, PodcastStatus.FAILED, PodcastStatus.CANCELLED}
    ),
    # Not terminal: regeneration reopens the brief gate so the user can tweak
    # the spec before a new take is drafted.
    PodcastStatus.READY: frozenset({PodcastStatus.AWAITING_BRIEF}),
    # Terminal states: empty exit sets.
    PodcastStatus.FAILED: frozenset(),
    PodcastStatus.CANCELLED: frozenset(),
}
class PodcastError(RuntimeError):
    """Common ancestor of the errors raised by the podcast lifecycle."""
class InvalidTransition(PodcastError):
    """Raised when the current status does not permit the requested change."""
class SpecConflict(PodcastError):
    """Raised when a spec edit was based on a stale version (it lost a race).

    ``expected`` is the version the caller assumed; ``actual`` is the version
    found on the row. Both are kept as attributes for callers to inspect.
    """

    def __init__(self, expected: int, actual: int) -> None:
        message = f"spec version conflict: expected {expected}, current is {actual}"
        super().__init__(message)
        self.expected, self.actual = expected, actual
class PreconditionFailed(PodcastError):
    """Raised when data a transition needs (brief/transcript) is missing."""
class PodcastService:
    """Lifecycle operations for one podcast, bound to a single DB session.

    Every public method performs one named lifecycle step: it checks the
    step's preconditions, applies the status change through ``_transition``
    and flushes the session. Committing is left to the caller.
    """

    def __init__(self, session: AsyncSession) -> None:
        self._session = session
        self._repo = PodcastRepository(session)

    async def create(
        self, *, title: str, search_space_id: int, thread_id: int | None = None
    ) -> Podcast:
        """Add a new ``PENDING`` podcast (spec version 1) via the repository."""
        fresh = Podcast(
            title=title,
            search_space_id=search_space_id,
            thread_id=thread_id,
            status=PodcastStatus.PENDING,
            spec_version=1,
        )
        return await self._repo.add(fresh)

    async def attach_brief(self, podcast: Podcast, spec: PodcastSpec) -> Podcast:
        """Move to ``AWAITING_BRIEF`` and store the proposed brief as JSON."""
        self._transition(podcast, PodcastStatus.AWAITING_BRIEF)
        serialized = spec.model_dump(mode="json")
        podcast.spec = serialized
        await self._session.flush()
        return podcast

    async def update_spec(
        self, podcast: Podcast, spec: PodcastSpec, expected_version: int
    ) -> Podcast:
        """Replace the brief while at the gate, using optimistic concurrency.

        Raises:
            InvalidTransition: the podcast is not in ``AWAITING_BRIEF``.
            SpecConflict: ``expected_version`` differs from the stored version.
        """
        current = _status(podcast)
        if current is not PodcastStatus.AWAITING_BRIEF:
            raise InvalidTransition(
                "the brief can only be edited while awaiting_brief, "
                f"not {current.value}"
            )
        stored_version = podcast.spec_version
        if expected_version != stored_version:
            raise SpecConflict(expected_version, stored_version)
        podcast.spec = spec.model_dump(mode="json")
        # Each accepted edit bumps the version so concurrent editors conflict.
        podcast.spec_version = stored_version + 1
        await self._session.flush()
        return podcast

    async def begin_drafting(self, podcast: Podcast) -> Podcast:
        """Move an approved brief into ``DRAFTING``.

        Raises:
            PreconditionFailed: no brief is stored on the row.
        """
        has_brief = podcast.spec is not None
        if not has_brief:
            raise PreconditionFailed("cannot draft without a brief")
        self._transition(podcast, PodcastStatus.DRAFTING)
        await self._session.flush()
        return podcast

    async def attach_transcript(
        self, podcast: Podcast, transcript: Transcript
    ) -> Podcast:
        """Move to ``RENDERING`` and store the drafted transcript as JSON."""
        self._transition(podcast, PodcastStatus.RENDERING)
        serialized = transcript.model_dump(mode="json")
        podcast.podcast_transcript = serialized
        await self._session.flush()
        return podcast

    # Extra guard for regenerate on top of the transition table: PENDING may
    # also legally move to AWAITING_BRIEF, but there it means attaching a brief,
    # not regenerating.
    _REGENERABLE = frozenset({PodcastStatus.READY, PodcastStatus.AWAITING_REVIEW})

    async def regenerate(self, podcast: Podcast) -> Podcast:
        """Reopen the brief gate, keeping the saved spec as the starting point.

        Raises:
            InvalidTransition: the current status is not regenerable.
            PreconditionFailed: the row has no brief to review.
        """
        current = _status(podcast)
        if current not in self._REGENERABLE:
            raise InvalidTransition(f"nothing to regenerate from {current.value}")
        # Legacy episodes finished before briefs existed; opening a gate with
        # nothing to review would strand them.
        if podcast.spec is None:
            raise PreconditionFailed("cannot regenerate without a brief")
        self._transition(podcast, PodcastStatus.AWAITING_BRIEF)
        await self._session.flush()
        return podcast

    async def revert_regeneration(self, podcast: Podcast) -> Podcast:
        """Abandon a regeneration and return to the stored episode.

        Regeneration keeps the rendered audio until a new take replaces it, so
        backing out before that point is always possible. A podcast without
        stored audio has nothing to fall back to and is rejected.

        Raises:
            InvalidTransition: no stored episode exists, or the current status
                has no exit to ``READY``.
        """
        if has_stored_episode(podcast):
            self._transition(podcast, PodcastStatus.READY)
            await self._session.flush()
            return podcast
        raise InvalidTransition("no finished episode to fall back to")

    async def attach_audio(
        self,
        podcast: Podcast,
        *,
        storage_backend: str,
        storage_key: str,
        duration_seconds: int | None = None,
    ) -> Podcast:
        """Move to ``READY`` and record where the rendered audio is stored.

        Also clears any previously recorded error.
        """
        self._transition(podcast, PodcastStatus.READY)
        podcast.storage_backend, podcast.storage_key = storage_backend, storage_key
        podcast.duration_seconds = duration_seconds
        podcast.error = None
        await self._session.flush()
        return podcast

    async def fail(self, podcast: Podcast, error: str) -> Podcast:
        """Move to ``FAILED`` and store the (truncated) reason.

        An empty or missing ``error`` is stored as ``None``.
        """
        self._transition(podcast, PodcastStatus.FAILED)
        truncated = (error or "")[:_MAX_ERROR_CHARS]
        podcast.error = truncated or None
        await self._session.flush()
        return podcast

    async def cancel(self, podcast: Podcast) -> Podcast:
        """Cancel a podcast that has no stored episode.

        No user action may destroy playable audio: once an episode exists,
        backing out goes through ``revert_regeneration`` instead.

        Raises:
            InvalidTransition: a stored episode exists, or the current status
                has no exit to ``CANCELLED``.
        """
        if has_stored_episode(podcast):
            raise InvalidTransition(
                "a finished episode exists; revert the regeneration instead"
            )
        self._transition(podcast, PodcastStatus.CANCELLED)
        await self._session.flush()
        return podcast

    def _transition(self, podcast: Podcast, target: PodcastStatus) -> None:
        """Set ``podcast.status`` to ``target`` if the table allows it."""
        origin = _status(podcast)
        reachable = _ALLOWED[origin]
        if target in reachable:
            podcast.status = target
            return
        raise InvalidTransition(f"{origin.value} -> {target.value} is not allowed")
def _status(podcast: Podcast) -> PodcastStatus:
    """Coerce the row's stored status value into a ``PodcastStatus`` member."""
    stored = podcast.status
    return PodcastStatus(stored)
def has_stored_episode(podcast: Podcast) -> bool:
    """Report whether the row points at finished audio.

    Either ``storage_key`` or ``file_location`` (used by legacy rows) counts.
    """
    if podcast.storage_key:
        return True
    return bool(podcast.file_location)
def read_spec(podcast: Podcast) -> PodcastSpec | None:
    """Return the stored brief as a ``PodcastSpec``, or ``None`` if unset."""
    if not podcast.spec:
        return None
    return PodcastSpec.model_validate(podcast.spec)
def read_transcript(podcast: Podcast) -> Transcript | None:
    """Return the stored transcript, or ``None`` if nothing has been drafted.

    Two stored shapes are accepted: the current one, validated directly, and a
    bare list of turns written before the lifecycle rework.
    """
    stored = podcast.podcast_transcript
    if not stored:
        return None
    if not isinstance(stored, list):
        return Transcript.model_validate(stored)
    # Legacy rows used different field names (``speaker_id`` / ``dialog``);
    # map them over so they keep reading instead of failing validation.
    legacy_turns = []
    for entry in stored:
        legacy_turns.append(
            TranscriptTurn(speaker=entry["speaker_id"], text=entry["dialog"])
        )
    return Transcript(turns=legacy_turns)
def preferences_from(podcast: Podcast | None) -> tuple[str | None, list[str]]:
    """Pull reusable ``(language, voice_ids)`` defaults out of an earlier podcast.

    Returns ``(None, [])`` when there is no podcast or it has no stored brief.
    """
    if podcast is None:
        return None, []
    spec = read_spec(podcast)
    if spec is None:
        return None, []
    language = spec.language
    voice_ids = [speaker.voice_id for speaker in spec.speakers]
    return language, voice_ids

View file

@ -0,0 +1,53 @@
"""Durable storage for rendered podcast audio.
Wraps the shared :class:`StorageBackend` so the rest of the module never deals
with object keys directly. Audio is stored under a per-podcast key, streamed for
download, and purged when a podcast is deleted.
"""
from __future__ import annotations
import uuid
from collections.abc import AsyncIterator
from app.file_storage.factory import get_storage_backend
from app.podcasts.persistence import Podcast
_AUDIO_CONTENT_TYPE = "audio/mpeg"
def build_audio_key(*, search_space_id: int, podcast_id: int) -> str:
    """Build a fresh object key for one podcast's audio.

    Shape: ``podcasts/{search_space_id}/{podcast_id}/{uuid}.mp3``. The random
    uuid component lets a re-render write a new object before the old one is
    purged.
    """
    folder = f"podcasts/{search_space_id}/{podcast_id}"
    unique = uuid.uuid4().hex
    return f"{folder}/{unique}.mp3"
async def store_audio(
    *, search_space_id: int, podcast_id: int, data: bytes
) -> tuple[str, str]:
    """Write ``data`` to the storage backend under a new key.

    Returns ``(backend_name, storage_key)`` for the object just written.
    """
    storage = get_storage_backend()
    object_key = build_audio_key(
        search_space_id=search_space_id, podcast_id=podcast_id
    )
    await storage.put(object_key, data, content_type=_AUDIO_CONTENT_TYPE)
    return storage.backend_name, object_key
def open_audio_stream(podcast: Podcast) -> AsyncIterator[bytes]:
    """Open a byte stream over the podcast's stored audio.

    Raises:
        FileNotFoundError: the row has no ``storage_key``.
    """
    key = podcast.storage_key
    if key:
        return get_storage_backend().open_stream(key)
    raise FileNotFoundError(f"podcast {podcast.id} has no stored audio")
async def purge_audio(podcast: Podcast) -> None:
    """Delete the audio object the row points at; a row without one is a no-op."""
    stored_key = podcast.storage_key
    await purge_audio_object(stored_key)
async def purge_audio_object(key: str | None) -> None:
    """Delete one stored audio object by key (e.g. one a re-render replaced).

    A ``None`` or empty key does nothing.
    """
    if not key:
        return
    await get_storage_backend().delete(key)

View file

@ -0,0 +1,17 @@
"""Celery tasks driving the podcast lifecycle across its expensive phases.
One task per heavy async phase: draft the transcript (LLM) and render the audio
(TTS). The brief is deterministic and proposed inline at create time, so it has
no task. Each task is enqueued by the API after it performs the guarded status
transition, and each pushes its result onto the row for the frontend to observe.
"""
from __future__ import annotations
from .draft import draft_transcript_task
from .render import render_audio_task
__all__ = [
"draft_transcript_task",
"render_audio_task",
]

View file

@ -0,0 +1,99 @@
"""Transcript-drafting task: DRAFTING -> RENDERING.
The expensive, LLM-heavy step, so it runs under ``billable_call``. The API has
already moved the row to DRAFTING and stored the approved brief; this task
drafts the long-form transcript and chains straight into the render: the brief
gate is the only approval in the lifecycle.
"""
from __future__ import annotations
import logging
from app.celery_app import celery_app
from app.config import config as app_config
from app.podcasts.generation.transcript.graph import graph as transcript_graph
from app.podcasts.generation.transcript.state import TranscriptState
from app.podcasts.persistence import PodcastRepository
from app.podcasts.service import PodcastService, read_spec
from app.services.billable_calls import (
BillingSettlementError,
QuotaInsufficientError,
_resolve_agent_billing_for_search_space,
billable_call,
)
from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
from .render import render_audio_task
from .runtime import billable_session, mark_failed
logger = logging.getLogger(__name__)
@celery_app.task(name="podcast.draft_transcript", bind=True)
def draft_transcript_task(self, podcast_id: int, search_space_id: int) -> dict:
    """Celery entry point for transcript drafting.

    Runs ``_draft_transcript`` on the worker's async runner. Any exception is
    logged and recorded on the row through ``mark_failed``, and a
    ``{"status": "failed"}`` dict is returned instead of re-raising.
    """

    def _start_draft():
        return _draft_transcript(podcast_id, search_space_id)

    try:
        outcome = run_async_celery_task(_start_draft)
    except Exception as exc:  # noqa: BLE001 - record and report, never crash worker
        logger.error("Podcast %s drafting failed: %s", podcast_id, exc)
        reason = str(exc)
        run_async_celery_task(lambda: mark_failed(podcast_id, reason))
        return {"status": "failed", "podcast_id": podcast_id}
    return outcome
async def _draft_transcript(podcast_id: int, search_space_id: int) -> dict:
    """Draft the transcript for one podcast and hand it off to rendering.

    Loads the row and its stored brief, runs the transcript graph inside
    ``billable_call``, stores the result through ``attach_transcript``, commits,
    and then enqueues ``render_audio_task``.

    Returns:
        ``{"status": "rendering", ...}`` on success, or ``{"status": "failed",
        ...}`` with ``reason`` set to ``"quota"`` or ``"billing"`` when the
        billing layer rejects or cannot settle the call.

    Raises:
        ValueError: the podcast does not exist or has no stored brief.
    """
    async with get_celery_session_maker()() as session:
        repo = PodcastRepository(session)
        service = PodcastService(session)
        podcast = await repo.get(podcast_id)
        if podcast is None:
            raise ValueError(f"podcast {podcast_id} not found")
        spec = read_spec(podcast)
        if spec is None:
            raise ValueError(f"podcast {podcast_id} has no approved brief")
        # Resolve who pays, and at which tier/model, for this search space.
        owner_id, tier, base_model = await _resolve_agent_billing_for_search_space(
            session, search_space_id, thread_id=podcast.thread_id
        )
        # A missing source_content is passed to the graph as an empty string.
        state = TranscriptState(
            db_session=session, source_content=podcast.source_content or ""
        )
        config = {
            "configurable": {
                "search_space_id": search_space_id,
                "spec": spec,
                "focus": spec.focus,
            }
        }
        try:
            # The graph invocation runs inside the billing context manager.
            async with billable_call(
                user_id=owner_id,
                search_space_id=search_space_id,
                billing_tier=tier,
                base_model=base_model,
                quota_reserve_micros_override=app_config.QUOTA_DEFAULT_PODCAST_RESERVE_MICROS,
                usage_type="podcast_generation",
                call_details={"podcast_id": podcast_id, "title": podcast.title},
                billable_session_factory=billable_session,
            ):
                result = await transcript_graph.ainvoke(state, config=config)
        except QuotaInsufficientError:
            # Billing failures are recorded on the row with a fixed reason and
            # reported as a normal result rather than re-raised.
            await service.fail(podcast, "premium quota exhausted")
            await session.commit()
            return {"status": "failed", "podcast_id": podcast_id, "reason": "quota"}
        except BillingSettlementError:
            await service.fail(podcast, "billing settlement failed")
            await session.commit()
            return {"status": "failed", "podcast_id": podcast_id, "reason": "billing"}
        await service.attach_transcript(podcast, result["transcript"])
        await session.commit()
    # Enqueue only after the transaction is committed, so the render worker can
    # never pick up a row whose transcript isn't visible yet.
    # NOTE(review): the original indentation was lost in extraction; this
    # enqueue is assumed to sit after the session block. Confirm against the file.
    render_audio_task.delay(podcast_id)
    return {"status": "rendering", "podcast_id": podcast_id}

View file

@ -0,0 +1,87 @@
"""Audio-rendering task: RENDERING -> READY.
Synthesises and merges the approved transcript, stores the MP3 in the object
store, and marks the podcast ready. The working directory is stable per podcast
so a re-render (e.g. after a voice change) reuses the segment cache.
"""
from __future__ import annotations
import logging
import tempfile
from pathlib import Path
from app.celery_app import celery_app
from app.podcasts.persistence import PodcastRepository
from app.podcasts.rendering import PodcastRenderer
from app.podcasts.service import (
InvalidTransition,
PodcastService,
read_spec,
read_transcript,
)
from app.podcasts.storage import purge_audio_object, store_audio
from app.podcasts.tts import get_text_to_speech
from app.podcasts.voices import get_voice_catalog
from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
from .runtime import mark_failed
logger = logging.getLogger(__name__)
_WORKDIR_BASE = Path(tempfile.gettempdir()) / "surfsense_podcasts"
@celery_app.task(name="podcast.render_audio", bind=True)
def render_audio_task(self, podcast_id: int) -> dict:
    """Celery entry point for audio rendering.

    Runs ``_render_audio`` on the worker's async runner. Any exception is
    logged and recorded on the row through ``mark_failed``, and a
    ``{"status": "failed"}`` dict is returned instead of re-raising.
    """

    def _start_render():
        return _render_audio(podcast_id)

    try:
        outcome = run_async_celery_task(_start_render)
    except Exception as exc:  # noqa: BLE001 - record and report, never crash worker
        logger.error("Podcast %s render failed: %s", podcast_id, exc)
        reason = str(exc)
        run_async_celery_task(lambda: mark_failed(podcast_id, reason))
        return {"status": "failed", "podcast_id": podcast_id}
    return outcome
async def _render_audio(podcast_id: int) -> dict:
    """Render one podcast's transcript to audio and mark the row ready.

    Loads the row with its brief and transcript, renders them in a
    per-podcast working directory, stores the result under a new key, then
    records it through ``attach_audio`` and commits.

    Returns:
        ``{"status": "ready", ...}`` on success, or ``{"status": "superseded",
        ...}`` when the row no longer accepts the transition to READY.

    Raises:
        ValueError: the podcast does not exist, or lacks a brief or transcript.
    """
    async with get_celery_session_maker()() as session:
        repo = PodcastRepository(session)
        podcast = await repo.get(podcast_id)
        if podcast is None:
            raise ValueError(f"podcast {podcast_id} not found")
        spec = read_spec(podcast)
        transcript = read_transcript(podcast)
        if spec is None or transcript is None:
            raise ValueError(f"podcast {podcast_id} is missing brief or transcript")
        renderer = PodcastRenderer(
            tts=get_text_to_speech(), catalog=get_voice_catalog()
        )
        # The working directory depends only on the podcast id, so repeated
        # renders of the same podcast share it.
        workdir = _WORKDIR_BASE / str(podcast_id)
        workdir.mkdir(parents=True, exist_ok=True)
        rendered = await renderer.render(
            spec=spec, transcript=transcript, workdir=workdir
        )
        # Remember the key being replaced before attach_audio overwrites it.
        superseded_key = podcast.storage_key
        backend_name, key = await store_audio(
            search_space_id=podcast.search_space_id,
            podcast_id=podcast_id,
            data=rendered.data,
        )
        try:
            await PodcastService(session).attach_audio(
                podcast, storage_backend=backend_name, storage_key=key
            )
            await session.commit()
        except InvalidTransition:
            # A user back-out won the race (e.g. the regeneration was
            # reverted): drop the stale render and leave the row alone.
            await purge_audio_object(key)
            return {"status": "superseded", "podcast_id": podcast_id}
    # Purge only after the new audio is committed, so a failed re-render never
    # destroys the episode the user can still play.
    # NOTE(review): the original indentation was lost in extraction; this purge
    # is assumed to sit after the session block. Confirm against the file.
    await purge_audio_object(superseded_key)
    return {"status": "ready", "podcast_id": podcast_id}

View file

@ -0,0 +1,40 @@
"""Shared plumbing for the podcast Celery tasks.
Each task runs its async body via :func:`run_async_celery_task` and, on any
failure, records the reason on the row through the lifecycle service. Marking
failed is best-effort: a podcast that already reached a terminal state is left
untouched rather than forced.
"""
from __future__ import annotations
import logging
from contextlib import asynccontextmanager
from app.podcasts.persistence import PodcastRepository
from app.podcasts.service import PodcastError, PodcastService
from app.tasks.celery_tasks import get_celery_session_maker
logger = logging.getLogger(__name__)
@asynccontextmanager
async def billable_session():
    """Yield a session from the Celery session maker, for ``billable_call``."""
    session_maker = get_celery_session_maker()
    async with session_maker() as session:
        yield session
async def mark_failed(podcast_id: int, error: str) -> None:
    """Best-effort: record ``error`` on the podcast and move it to FAILED.

    A missing podcast is ignored. A ``PodcastError`` from the lifecycle service
    is logged and swallowed, leaving the row untouched.
    """
    session_maker = get_celery_session_maker()
    async with session_maker() as session:
        podcast = await PodcastRepository(session).get(podcast_id)
        if podcast is not None:
            try:
                await PodcastService(session).fail(podcast, error)
                await session.commit()
            except PodcastError:
                # Already terminal (e.g. cancelled): nothing to record.
                logger.info(
                    "Podcast %s already terminal; not marking failed", podcast_id
                )

View file

@ -0,0 +1,22 @@
"""Text-to-speech: a per-segment synthesis port with provider adapters.
Callers depend on :class:`TextToSpeech` and obtain the configured provider from
:func:`get_text_to_speech`; the concrete Kokoro/LiteLLM adapters stay private.
"""
from __future__ import annotations
from .audio import SynthesizedAudio
from .errors import TextToSpeechError
from .factory import get_text_to_speech
from .port import TextToSpeech
from .request import SynthesisRequest, VoiceRef
__all__ = [
"SynthesisRequest",
"SynthesizedAudio",
"TextToSpeech",
"TextToSpeechError",
"VoiceRef",
"get_text_to_speech",
]

View file

@ -0,0 +1,3 @@
"""Per-provider TextToSpeech implementations."""
from __future__ import annotations

View file

@ -0,0 +1,111 @@
"""Local Kokoro adapter: on-box synthesis, no network or per-segment cost.
Kokoro selects its language model by a single-letter ``lang_code``, so this
adapter maps the brief's BCP-47 tag to that code and caches one pipeline per
code (pipeline construction loads weights and is expensive). Pipelines run in a
thread pool because Kokoro is synchronous; the renderer caps how many segments
synthesise at once.
"""
from __future__ import annotations
import asyncio
import io
from typing import TYPE_CHECKING
from ..audio import SynthesizedAudio
from ..errors import TextToSpeechError
from ..port import TextToSpeech
from ..request import SynthesisRequest
if TYPE_CHECKING:
from kokoro import KPipeline
# Kokoro emits 24 kHz mono PCM regardless of voice.
_SAMPLE_RATE = 24000
# BCP-47 primary subtag -> Kokoro language code. English defaults to American;
# the en-GB region override below switches it to British.
_LANG_CODE_BY_PRIMARY = {
"en": "a",
"es": "e",
"fr": "f",
"hi": "h",
"it": "i",
"ja": "j",
"pt": "p",
"zh": "z",
}
class KokoroTextToSpeech(TextToSpeech):
    """Synthesises segments with locally hosted Kokoro pipelines.

    One pipeline is kept per Kokoro language code and built on first use.
    Synthesis itself runs on the event loop's default executor so the loop
    stays responsive while the model runs.
    """

    def __init__(self) -> None:
        # Kokoro lang_code -> loaded pipeline, filled lazily by _pipeline_for.
        self._pipelines: dict[str, KPipeline] = {}

    @property
    def container(self) -> str:
        """Container of the bytes returned by ``synthesize`` (always WAV)."""
        return "wav"

    async def synthesize(self, request: SynthesisRequest) -> SynthesizedAudio:
        """Voice ``request.text`` and return it as WAV-encoded audio.

        Raises:
            TextToSpeechError: the voice is not a string, the language has no
                Kokoro model, the pipeline raised, or it produced no audio.
        """
        if not isinstance(request.voice, str):
            raise TextToSpeechError(
                "Kokoro voices are named by string, not a mapping"
            )
        pipeline = self._pipeline_for(request.language)

        def _synthesise_segments() -> list:
            # Calling the pipeline only hands back a lazy result generator; the
            # model runs while that generator is iterated. Drain it here so the
            # heavy work happens on the worker thread, not on the event loop.
            results = pipeline(
                request.text,
                voice=request.voice,
                speed=request.speed,
                split_pattern=r"\n+",
            )
            return [audio for _gs, _ps, audio in results]

        loop = asyncio.get_running_loop()
        try:
            segments = await loop.run_in_executor(None, _synthesise_segments)
        except Exception as exc:  # noqa: BLE001 - normalise provider errors
            raise TextToSpeechError(f"Kokoro synthesis failed: {exc}") from exc
        if not segments:
            raise TextToSpeechError("Kokoro produced no audio for the text")
        return SynthesizedAudio(
            data=_encode_wav(segments, _SAMPLE_RATE),
            container="wav",
            sample_rate=_SAMPLE_RATE,
        )

    def _pipeline_for(self, language: str) -> KPipeline:
        """Return the cached pipeline for ``language``, building it if needed."""
        lang_code = _lang_code(language)
        pipeline = self._pipelines.get(lang_code)
        if pipeline is None:
            # Imported lazily so the module loads without Kokoro installed.
            from kokoro import KPipeline

            pipeline = KPipeline(lang_code=lang_code)
            self._pipelines[lang_code] = pipeline
        return pipeline
def _lang_code(language: str) -> str:
    """Map a language tag to Kokoro's single-letter language code.

    ``en-GB*`` and ``en-UK`` select British English (``"b"``); every other tag
    is looked up by its primary subtag.

    Raises:
        TextToSpeechError: the primary subtag has no entry in the table.
    """
    tag = language.strip().lower()
    is_british = tag == "en-uk" or tag.startswith("en-gb")
    if is_british:
        return "b"
    primary, _sep, _rest = tag.partition("-")
    if primary not in _LANG_CODE_BY_PRIMARY:
        raise TextToSpeechError(f"Kokoro has no language model for {language!r}")
    return _LANG_CODE_BY_PRIMARY[primary]
def _encode_wav(segments: list, sample_rate: int) -> bytes:
    """Join the audio segments and encode them as one in-memory WAV file."""
    import numpy as np
    import soundfile as sf

    if len(segments) == 1:
        # A single segment needs no concatenation.
        waveform = segments[0]
    else:
        waveform = np.concatenate(segments)
    sink = io.BytesIO()
    sf.write(sink, waveform, sample_rate, format="WAV")
    encoded = sink.getvalue()
    return encoded

View file

@ -0,0 +1,69 @@
"""LiteLLM adapter: hosted TTS (OpenAI, Azure, Vertex AI) via one ``aspeech`` call.
LiteLLM normalises every hosted provider behind the same ``aspeech`` surface,
so a single adapter covers them all. The provider is encoded in the model
string (e.g. ``openai/tts-1``, ``vertex_ai/...``) and the voice reference is
whatever that provider expects, which the catalog already supplies.
"""
from __future__ import annotations
from ..audio import SynthesizedAudio
from ..errors import TextToSpeechError
from ..port import TextToSpeech
from ..request import SynthesisRequest
# Hosted providers return MP3-encoded bytes from ``aspeech``.
_CONTAINER = "mp3"
# A long single segment still finishes well under this; retries absorb transient
# upstream failures without failing the whole render.
_TIMEOUT_SECONDS = 600
_MAX_RETRIES = 2
class LiteLlmTextToSpeech(TextToSpeech):
    """Voices segments through a hosted TTS model reached via LiteLLM."""

    def __init__(
        self,
        *,
        model: str,
        api_base: str | None = None,
        api_key: str | None = None,
    ) -> None:
        self._model, self._api_base, self._api_key = model, api_base, api_key

    @property
    def container(self) -> str:
        """Container of the bytes returned by ``synthesize``."""
        return _CONTAINER

    async def synthesize(self, request: SynthesisRequest) -> SynthesizedAudio:
        """Call ``aspeech`` for one segment and wrap the returned bytes.

        Raises:
            TextToSpeechError: the call raised, or the response had no content.
        """
        from litellm import aspeech

        call_args = {
            "model": self._model,
            "voice": request.voice,
            "input": request.text,
            "max_retries": _MAX_RETRIES,
            "timeout": _TIMEOUT_SECONDS,
        }
        # Endpoint and credentials are only sent when they are configured.
        optional = (("api_base", self._api_base), ("api_key", self._api_key))
        for name, value in optional:
            if value:
                call_args[name] = value
        try:
            response = await aspeech(**call_args)
        except Exception as exc:  # noqa: BLE001 - normalise provider errors
            raise TextToSpeechError(
                f"{self._model} synthesis failed: {exc}"
            ) from exc
        payload = getattr(response, "content", None)
        if payload:
            return SynthesizedAudio(data=payload, container=_CONTAINER)
        raise TextToSpeechError(f"{self._model} returned no audio")

View file

@ -0,0 +1,19 @@
"""The bytes a TTS provider returns for one segment."""
from __future__ import annotations
from dataclasses import dataclass
@dataclass(frozen=True, slots=True)
class SynthesizedAudio:
"""Encoded audio for a single segment, ready to cache and concatenate.
``container`` is the file extension the bytes are encoded as (``"wav"`` or
``"mp3"``); the renderer uses it to name the on-disk segment so FFmpeg can
demux the right format during merge.
"""
data: bytes
container: str
sample_rate: int | None = None

View file

@ -0,0 +1,13 @@
"""Failures raised by the TTS layer."""
from __future__ import annotations
class TextToSpeechError(RuntimeError):
    """Raised when a provider cannot synthesise a segment.

    Covers configuration faults (an unusable voice reference) as well as
    provider faults (the upstream call errored or returned no audio), so the
    renderer can fail a segment without unwrapping provider-specific
    exceptions.
    """

View file

@ -0,0 +1,38 @@
"""Resolve the configured :class:`TextToSpeech` as a process-wide singleton."""
from __future__ import annotations
from functools import lru_cache
from .port import TextToSpeech
# Sentinel model string that selects the local Kokoro pipeline; anything else is
# treated as a LiteLLM-hosted model (``openai/...``, ``vertex_ai/...``, etc.).
KOKORO_SERVICE = "local/kokoro"
@lru_cache(maxsize=1)
def get_text_to_speech() -> TextToSpeech:
    """Return the provider selected by ``TTS_SERVICE``, built once and cached.

    Adapters are imported lazily. The instance is cached because the Kokoro
    adapter holds loaded pipelines that must be reused across segments and
    requests rather than rebuilt per call.

    Raises:
        ValueError: ``TTS_SERVICE`` is empty or unset.
    """
    from app.config import config as app_config

    service = app_config.TTS_SERVICE
    if not service:
        raise ValueError("TTS_SERVICE is not configured")
    if service != KOKORO_SERVICE:
        # Anything other than the Kokoro sentinel is a LiteLLM model string.
        from .adapters.litellm import LiteLlmTextToSpeech

        return LiteLlmTextToSpeech(
            model=service,
            api_base=app_config.TTS_SERVICE_API_BASE,
            api_key=app_config.TTS_SERVICE_API_KEY,
        )
    from .adapters.kokoro import KokoroTextToSpeech

    return KokoroTextToSpeech()

View file

@ -0,0 +1,31 @@
"""The TTS contract: turn one segment of text into encoded audio."""
from __future__ import annotations
from abc import ABC, abstractmethod
from .audio import SynthesizedAudio
from .request import SynthesisRequest
class TextToSpeech(ABC):
    """Provider contract: synthesise exactly one segment per call.

    Working per segment rather than per episode keeps each call independently
    cacheable and lets the renderer cap concurrency and retry segments in
    isolation. Stitching segments into one file is the renderer's job, not the
    provider's.
    """

    @property
    @abstractmethod
    def container(self) -> str:
        """Container / file extension of the emitted audio (e.g. ``"mp3"``)."""

    @abstractmethod
    async def synthesize(self, request: SynthesisRequest) -> SynthesizedAudio:
        """Turn ``request.text`` into encoded audio.

        Implementations raise
        :class:`~app.podcasts.tts.errors.TextToSpeechError` on any provider or
        configuration failure.
        """

View file

@ -0,0 +1,22 @@
"""What the renderer hands a TTS provider to voice a single segment."""
from __future__ import annotations
from collections.abc import Mapping
from dataclasses import dataclass
from typing import Any
# A provider-native voice reference. OpenAI/Azure/Kokoro name a voice with a
# string; Vertex passes a mapping (``languageCode`` + ``name``). The catalog
# stores whichever shape the provider expects and we pass it through untouched.
VoiceRef = str | Mapping[str, Any]
@dataclass(frozen=True, slots=True)
class SynthesisRequest:
"""One unit of speech to synthesise: the smallest cacheable render step."""
text: str
voice: VoiceRef
language: str
speed: float = 1.0

View file

@ -0,0 +1,23 @@
"""Voices: the catalog of selectable TTS voices and the active provider.
Callers obtain the catalog via :func:`get_voice_catalog` and identify the
configured provider via :func:`provider_from_service`.
"""
from __future__ import annotations
from .catalog import VoiceCatalog, get_voice_catalog
from .preview import render_voice_preview
from .provider import TtsProvider, provider_from_service
from .voice import ANY_LANGUAGE, CatalogVoice, VoiceGender
__all__ = [
"ANY_LANGUAGE",
"CatalogVoice",
"TtsProvider",
"VoiceCatalog",
"VoiceGender",
"get_voice_catalog",
"provider_from_service",
"render_voice_preview",
]

View file

@ -0,0 +1,55 @@
"""The voice catalog: look up and filter selectable voices.
A :class:`VoiceCatalog` is the single source of truth for which voices exist.
Resolution uses it to pick defaults for a brief, the API exposes it as picker
options, and the renderer uses it to turn a stored ``voice_id`` back into the
provider-native reference.
"""
from __future__ import annotations
from collections.abc import Iterable
from functools import lru_cache
from .data import AZURE_VOICES, KOKORO_VOICES, OPENAI_VOICES, VERTEX_VOICES
from .provider import TtsProvider
from .voice import CatalogVoice
class VoiceCatalog:
    """Read-only voice collection indexed by id and by provider."""

    def __init__(self, voices: Iterable[CatalogVoice]) -> None:
        """Index ``voices``; raises ``ValueError`` on a repeated ``voice_id``."""
        by_id: dict[str, CatalogVoice] = {}
        by_provider: dict[TtsProvider, list[CatalogVoice]] = {}
        for voice in voices:
            if voice.voice_id in by_id:
                raise ValueError(f"duplicate voice_id: {voice.voice_id}")
            by_id[voice.voice_id] = voice
            by_provider.setdefault(voice.provider, []).append(voice)
        self._by_id = by_id
        self._by_provider = by_provider

    def get(self, voice_id: str) -> CatalogVoice:
        """Look up a voice by id; an unknown id raises ``KeyError``."""
        return self._by_id[voice_id]

    def for_provider(self, provider: TtsProvider) -> list[CatalogVoice]:
        """Return a new list of ``provider``'s voices in insertion order."""
        return [*self._by_provider.get(provider, ())]

    def for_language(
        self, provider: TtsProvider, language: str
    ) -> list[CatalogVoice]:
        """Return ``provider``'s voices whose ``speaks(language)`` is true."""
        matching = []
        for voice in self.for_provider(provider):
            if voice.speaks(language):
                matching.append(voice)
        return matching

    def supports_language(self, provider: TtsProvider, language: str) -> bool:
        """Report whether any of ``provider``'s voices speaks ``language``."""
        for voice in self.for_provider(provider):
            if voice.speaks(language):
                return True
        return False
@lru_cache(maxsize=1)
def get_voice_catalog() -> VoiceCatalog:
    """Build the catalog from every provider's roster once, then reuse it."""
    rosters = (KOKORO_VOICES, OPENAI_VOICES, AZURE_VOICES, VERTEX_VOICES)
    every_voice = tuple(voice for roster in rosters for voice in roster)
    return VoiceCatalog(every_voice)

View file

@ -0,0 +1,67 @@
"""Audible previews so users pick voices by sound, not by name.
A preview is a short sample sentence synthesised in the voice's own language.
Samples are served through the same content-addressed cache the renderer uses,
so each voice costs at most one synthesis per cache lifetime; repeat listens
while comparing voices are free.
"""
from __future__ import annotations
import tempfile
from pathlib import Path
from app.podcasts.rendering.cache import SegmentCache
from app.podcasts.tts import SynthesisRequest, TextToSpeech
from .voice import ANY_LANGUAGE, CatalogVoice
# Previews are user-independent, so one rendered sample serves everyone.
PREVIEW_CACHE_ROOT = Path(tempfile.gettempdir()) / "surfsense_podcasts" / "previews"
_FALLBACK_LANGUAGE = "en"
# A voice previews best speaking its own language.
_SAMPLE_TEXTS = {
"en": "Hi there! This is how I sound when narrating your podcast.",
"es": "¡Hola! Así sueno cuando narro tu pódcast.",
"fr": "Bonjour ! Voici ma voix quand je raconte votre podcast.",
"hi": "नमस्ते! आपका पॉडकास्ट सुनाते समय मेरी आवाज़ ऐसी होती है।",
"it": "Ciao! Questa è la mia voce quando racconto il tuo podcast.",
"ja": "こんにちは。ポッドキャストをお届けするときの私の声です。",
"pt": "Olá! É assim que eu soo ao narrar o seu podcast.",
"zh": "你好!这就是我为你播报播客时的声音。",
}
_CONTENT_TYPES = {"mp3": "audio/mpeg", "wav": "audio/wav"}
async def render_voice_preview(
    voice: CatalogVoice, tts: TextToSpeech
) -> tuple[bytes, str]:
    """Produce ``(audio_bytes, content_type)`` for a short sample of ``voice``.

    The sample is served from the preview cache when present; otherwise it is
    synthesised once and written to the cache.
    """
    # Language-agnostic voices preview in the fallback language.
    if voice.language == ANY_LANGUAGE:
        language = _FALLBACK_LANGUAGE
    else:
        language = voice.language
    request = SynthesisRequest(
        text=_sample_text(language), voice=voice.native_ref, language=language
    )
    cache = SegmentCache(PREVIEW_CACHE_ROOT)
    key = cache.key(request)
    hit = cache.get(key, tts.container)
    if hit is None:
        audio = await tts.synthesize(request)
        cache.put(key, audio.container, audio.data)
        return audio.data, _content_type(audio.container)
    return hit.read_bytes(), _content_type(tts.container)
def _sample_text(language: str) -> str:
    """Pick the sample sentence for ``language``'s primary subtag.

    Languages without their own sample use the fallback language's sentence.
    """
    head, _sep, _rest = language.partition("-")
    primary = head.strip().lower()
    if primary in _SAMPLE_TEXTS:
        return _SAMPLE_TEXTS[primary]
    return _SAMPLE_TEXTS[_FALLBACK_LANGUAGE]
def _content_type(container: str) -> str:
    """Map an audio container to its MIME type, defaulting to octet-stream."""
    if container in _CONTENT_TYPES:
        return _CONTENT_TYPES[container]
    return "application/octet-stream"

View file

@ -0,0 +1,27 @@
"""The TTS providers we carry voices for, and how to name one from config."""
from __future__ import annotations
from enum import StrEnum
class TtsProvider(StrEnum):
"""A speech provider whose voices the catalog enumerates."""
KOKORO = "kokoro"
OPENAI = "openai"
AZURE = "azure"
VERTEX_AI = "vertex_ai"
def provider_from_service(service: str) -> TtsProvider:
    """Resolve a ``TTS_SERVICE`` string to the provider it names.

    The value is a LiteLLM-style ``provider/model`` string (``openai/tts-1``,
    ``vertex_ai/...``). Local Kokoro is the exception: it is spelled
    ``local/kokoro``, so the ``local`` prefix maps to ``TtsProvider.KOKORO``.
    Any other prefix must be a ``TtsProvider`` value, else ``ValueError``.
    """
    head, _sep, _model = service.partition("/")
    prefix = head.strip().lower()
    return TtsProvider.KOKORO if prefix == "local" else TtsProvider(prefix)

View file

@ -0,0 +1,50 @@
"""A catalog voice: a stable id paired with its provider-native reference."""
from __future__ import annotations
from dataclasses import dataclass
from enum import StrEnum
from app.podcasts.tts import VoiceRef
from .provider import TtsProvider
# A voice that speaks whatever language the input text is in (e.g. OpenAI's
# voices), matched against every requested language.
ANY_LANGUAGE = "*"
class VoiceGender(StrEnum):
"""Perceived voice gender, used to pick distinct voices per speaker."""
MALE = "male"
FEMALE = "female"
NEUTRAL = "neutral"
@dataclass(frozen=True, slots=True)
class CatalogVoice:
    """A single voice a user can pick from the catalog.

    ``voice_id`` is the provider-prefixed, stable id stored on a speaker in the
    brief (e.g. ``"kokoro:am_adam"``). ``native_ref`` is the untyped value the
    TTS adapter passes to the provider (a string for most, a mapping for
    Vertex); it is kept separate so renaming the catalog id never breaks
    synthesis.
    """

    voice_id: str
    provider: TtsProvider
    # A language tag, or ANY_LANGUAGE for voices that follow the input text.
    language: str
    display_name: str
    gender: VoiceGender
    native_ref: VoiceRef

    def speaks(self, language: str) -> bool:
        """Tell whether this voice can render ``language``.

        Language-agnostic voices match everything; otherwise only the primary
        subtags are compared, so regional variants of one language match.
        """
        if self.language != ANY_LANGUAGE:
            return _primary(self.language) == _primary(language)
        return True
def _primary(language: str) -> str:
return language.split("-", 1)[0].strip().lower()

View file

@ -4,6 +4,7 @@ from app.automations.api import router as automations_router
from app.file_storage.api import router as file_storage_router
from app.gateway import require_gateway_enabled
from app.notifications.api import router as notifications_router
from app.podcasts.api import router as podcasts_router
from .agent_action_log_route import router as agent_action_log_router
from .agent_flags_route import router as agent_flags_router
@ -50,7 +51,6 @@ from .notes_routes import router as notes_router
from .notion_add_connector_route import router as notion_add_connector_router
from .obsidian_plugin_routes import router as obsidian_plugin_router
from .onedrive_add_connector_route import router as onedrive_add_connector_router
from .podcasts_routes import router as podcasts_router
from .prompts_routes import router as prompts_router
from .public_chat_routes import router as public_chat_router
from .rbac_routes import router as rbac_router

View file

@ -1,211 +0,0 @@
"""
Podcast routes for CRUD operations and audio streaming.
These routes support the podcast generation feature in new-chat.
Frontend polls GET /podcasts/{podcast_id} to check status field.
"""
import os
from pathlib import Path
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import StreamingResponse
from sqlalchemy import select
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import (
Permission,
Podcast,
SearchSpace,
SearchSpaceMembership,
User,
get_async_session,
)
from app.schemas import PodcastRead
from app.users import current_active_user
from app.utils.rbac import check_permission
router = APIRouter()
@router.get("/podcasts", response_model=list[PodcastRead])
async def read_podcasts(
    skip: int = 0,
    limit: int = 100,
    search_space_id: int | None = None,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Return a page of podcasts visible to the caller.

    With ``search_space_id`` the caller needs PODCASTS_READ on that space;
    without it, podcasts from every space the caller is a member of are listed.
    """
    if skip < 0 or limit < 1:
        raise HTTPException(status_code=400, detail="Invalid pagination parameters")

    try:
        if search_space_id is None:
            # No space given: scope the listing by the caller's memberships.
            query = (
                select(Podcast)
                .join(SearchSpace)
                .join(SearchSpaceMembership)
                .filter(SearchSpaceMembership.user_id == user.id)
            )
        else:
            await check_permission(
                session,
                user,
                search_space_id,
                Permission.PODCASTS_READ.value,
                "You don't have permission to read podcasts in this search space",
            )
            query = select(Podcast).filter(Podcast.search_space_id == search_space_id)

        page = await session.execute(query.offset(skip).limit(limit))
        return page.scalars().all()
    except HTTPException:
        raise
    except SQLAlchemyError:
        raise HTTPException(
            status_code=500, detail="Database error occurred while fetching podcasts"
        ) from None
@router.get("/podcasts/{podcast_id}", response_model=PodcastRead)
async def read_podcast(
    podcast_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Fetch one podcast by id.

    The caller must be authenticated and hold PODCASTS_READ on the podcast's
    search space. Public access goes through
    /public/{share_token}/podcasts/{podcast_id}/stream instead.
    """
    try:
        lookup = await session.execute(select(Podcast).filter(Podcast.id == podcast_id))
        found = lookup.scalars().first()
        if not found:
            raise HTTPException(
                status_code=404,
                detail="Podcast not found",
            )

        # Existence is checked first, so a missing id is a 404 regardless of
        # the caller's permissions.
        await check_permission(
            session,
            user,
            found.search_space_id,
            Permission.PODCASTS_READ.value,
            "You don't have permission to read podcasts in this search space",
        )
        return PodcastRead.from_orm_with_entries(found)
    except HTTPException:
        raise
    except SQLAlchemyError:
        raise HTTPException(
            status_code=500, detail="Database error occurred while fetching podcast"
        ) from None
@router.delete("/podcasts/{podcast_id}", response_model=dict)
async def delete_podcast(
    podcast_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Remove a podcast row.

    The caller needs PODCASTS_DELETE on the podcast's search space.
    """
    try:
        lookup = await session.execute(select(Podcast).filter(Podcast.id == podcast_id))
        target = lookup.scalars().first()
        if not target:
            raise HTTPException(status_code=404, detail="Podcast not found")

        await check_permission(
            session,
            user,
            target.search_space_id,
            Permission.PODCASTS_DELETE.value,
            "You don't have permission to delete podcasts in this search space",
        )
        await session.delete(target)
        await session.commit()
    except HTTPException:
        raise
    except SQLAlchemyError:
        # Undo the partial unit of work before reporting the failure.
        await session.rollback()
        raise HTTPException(
            status_code=500, detail="Database error occurred while deleting podcast"
        ) from None
    else:
        return {"message": "Podcast deleted successfully"}
@router.get("/podcasts/{podcast_id}/stream")
@router.get("/podcasts/{podcast_id}/audio")
async def stream_podcast(
    podcast_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Stream a podcast audio file.

    Requires authentication with PODCASTS_READ permission.
    For public podcast access, use /public/{share_token}/podcasts/{podcast_id}/stream

    Note: Both /stream and /audio endpoints are supported for compatibility.

    Raises:
        HTTPException: 404 when the podcast or its audio file is missing; 500
            for any other failure while preparing the stream. Errors raised by
            the permission check propagate unchanged when they are
            HTTPExceptions.
    """
    try:
        result = await session.execute(select(Podcast).filter(Podcast.id == podcast_id))
        podcast = result.scalars().first()
        if not podcast:
            raise HTTPException(status_code=404, detail="Podcast not found")

        await check_permission(
            session,
            user,
            podcast.search_space_id,
            Permission.PODCASTS_READ.value,
            "You don't have permission to access podcasts in this search space",
        )

        file_path = podcast.file_location
        if not file_path or not os.path.isfile(file_path):
            raise HTTPException(status_code=404, detail="Podcast audio file not found")

        def iterfile():
            # Audio is binary: read fixed-size chunks instead of iterating by
            # newline, which yields arbitrarily sized pieces.
            with open(file_path, mode="rb") as file_like:
                while chunk := file_like.read(64 * 1024):
                    yield chunk

        # Send the filename as a quoted-string with backslashes and quotes
        # escaped, so names containing spaces, semicolons or quotes cannot
        # produce a malformed Content-Disposition header.
        filename = Path(file_path).name.replace("\\", "\\\\").replace('"', '\\"')

        return StreamingResponse(
            iterfile(),
            media_type="audio/mpeg",
            headers={
                "Accept-Ranges": "bytes",
                "Content-Disposition": f'inline; filename="{filename}"',
            },
        )
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error streaming podcast: {e!s}"
        ) from e

View file

@ -99,6 +99,17 @@ async def stream_public_podcast(
if not podcast_info:
raise HTTPException(status_code=404, detail="Podcast not found")
storage_key = podcast_info.get("storage_key")
if storage_key:
from app.file_storage.factory import get_storage_backend
return StreamingResponse(
get_storage_backend().open_stream(storage_key),
media_type="audio/mpeg",
headers={"Accept-Ranges": "bytes"},
)
# Legacy fallback for snapshots taken before the storage migration.
file_path = podcast_info.get("file_path")
if not file_path or not os.path.isfile(file_path):

View file

@ -68,7 +68,6 @@ from .new_llm_config import (
NewLLMConfigRead,
NewLLMConfigUpdate,
)
from .podcasts import PodcastBase, PodcastCreate, PodcastRead, PodcastUpdate
from .rbac_schemas import (
InviteAcceptRequest,
InviteAcceptResponse,
@ -237,10 +236,6 @@ __all__ = [
"PermissionInfo",
"PermissionsListResponse",
# Podcast schemas
"PodcastBase",
"PodcastCreate",
"PodcastRead",
"PodcastUpdate",
"RefreshTokenRequest",
"RefreshTokenResponse",
# Report schemas

View file

@ -1,66 +0,0 @@
"""Podcast schemas for API responses."""
from datetime import datetime
from enum import StrEnum
from typing import Any
from pydantic import BaseModel
class PodcastStatusEnum(StrEnum):
PENDING = "pending"
GENERATING = "generating"
READY = "ready"
FAILED = "failed"
class PodcastBase(BaseModel):
    """Fields shared by the podcast create and read schemas."""

    title: str
    # Transcript entries as plain dicts; None when no transcript is set.
    podcast_transcript: list[dict[str, Any]] | None = None
    # Location of the audio file; None when no file is set.
    file_location: str | None = None
    search_space_id: int
class PodcastCreate(PodcastBase):
    """Payload for creating a podcast; adds nothing to the base fields."""
class PodcastUpdate(BaseModel):
    """Payload for updating a podcast; every field is optional."""

    title: str | None = None
    podcast_transcript: list[dict[str, Any]] | None = None
    file_location: str | None = None
class PodcastRead(PodcastBase):
    """Podcast as returned by the API, including id, status and timestamps."""

    id: int
    status: PodcastStatusEnum = PodcastStatusEnum.READY
    created_at: datetime
    # Number of transcript entries; None when there is no transcript.
    transcript_entries: int | None = None

    class Config:
        from_attributes = True

    @classmethod
    def from_orm_with_entries(cls, obj):
        """Build a ``PodcastRead`` from ``obj`` with ``transcript_entries`` filled in.

        ``transcript_entries`` is the transcript's length, or ``None`` when the
        transcript is missing or empty.
        """
        transcript = obj.podcast_transcript
        entry_count = len(transcript) if transcript else None
        return cls(
            id=obj.id,
            title=obj.title,
            podcast_transcript=transcript,
            file_location=obj.file_location,
            search_space_id=obj.search_space_id,
            status=obj.status,
            created_at=obj.created_at,
            transcript_entries=entry_count,
        )

View file

@ -337,6 +337,9 @@ async def _get_podcast_for_snapshot(
"original_id": podcast.id,
"title": podcast.title,
"transcript": podcast.podcast_transcript,
"storage_backend": podcast.storage_backend,
"storage_key": podcast.storage_key,
# Legacy fallback for rows rendered before the storage migration.
"file_path": podcast.file_location,
}
@ -717,6 +720,8 @@ async def clone_from_snapshot(
new_podcast = Podcast(
title=podcast_info.get("title", "Cloned Podcast"),
podcast_transcript=podcast_info.get("transcript"),
storage_backend=podcast_info.get("storage_backend"),
storage_key=podcast_info.get("storage_key"),
file_location=podcast_info.get("file_path"),
status=PodcastStatus.READY,
search_space_id=target_search_space_id,

View file

@ -1,234 +0,0 @@
"""Celery tasks for podcast generation."""
import asyncio
import logging
import sys
from contextlib import asynccontextmanager
from sqlalchemy import select
from app.agents.podcaster.graph import graph as podcaster_graph
from app.agents.podcaster.state import State as PodcasterState
from app.celery_app import celery_app
from app.config import config as app_config
from app.db import Podcast, PodcastStatus
from app.services.billable_calls import (
BillingSettlementError,
QuotaInsufficientError,
_resolve_agent_billing_for_search_space,
billable_call,
)
from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
logger = logging.getLogger(__name__)
if sys.platform.startswith("win"):
try:
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
except AttributeError:
logger.warning(
"WindowsProactorEventLoopPolicy is unavailable; async subprocess support may fail."
)
# =============================================================================
# Content-based podcast generation (for new-chat)
# =============================================================================
@asynccontextmanager
async def _celery_billable_session():
    """Yield a session from the Celery session maker, for use by ``billable_call``."""
    session_maker = get_celery_session_maker()
    async with session_maker() as billing_session:
        yield billing_session
@celery_app.task(name="generate_content_podcast", bind=True)
def generate_content_podcast_task(
    self,
    podcast_id: int,
    source_content: str,
    search_space_id: int,
    user_prompt: str | None = None,
) -> dict:
    """
    Celery task to generate podcast from source content.
    Updates existing podcast record created by the tool.

    Returns:
        The result dict of the generation coroutine, or
        ``{"status": "failed", "podcast_id": ...}`` when it raised.
    """
    try:
        return run_async_celery_task(
            lambda: _generate_content_podcast(
                podcast_id,
                source_content,
                search_space_id,
                user_prompt,
            )
        )
    except Exception:
        # logger.exception keeps the traceback and the podcast id, matching the
        # handler below; the previous f-string error log dropped both.
        logger.exception("Error generating content podcast %s", podcast_id)
        try:
            # Best effort: make sure the row does not stay in a transient state.
            run_async_celery_task(lambda: _mark_podcast_failed(podcast_id))
        except Exception:
            logger.exception("Failed to mark podcast %s as failed", podcast_id)
        return {"status": "failed", "podcast_id": podcast_id}
async def _mark_podcast_failed(podcast_id: int) -> None:
    """Set the podcast's status to FAILED, if the row exists.

    Any error while doing so is logged and swallowed.
    """
    session_maker = get_celery_session_maker()
    async with session_maker() as session:
        try:
            lookup = await session.execute(
                select(Podcast).filter(Podcast.id == podcast_id)
            )
            row = lookup.scalars().first()
            if not row:
                return
            row.status = PodcastStatus.FAILED
            await session.commit()
        except Exception as e:
            logger.error(f"Failed to mark podcast as failed: {e}")
async def _generate_content_podcast(
    podcast_id: int,
    source_content: str,
    search_space_id: int,
    user_prompt: str | None = None,
) -> dict:
    """Generate content-based podcast and update existing record.

    Loads the podcast row, marks it GENERATING, resolves billing for the
    search space, runs the podcaster graph inside ``billable_call``, then
    stores the transcript and file path and marks the row READY.

    Returns:
        On success, a dict with ``status="ready"``, the podcast id, title and
        number of transcript entries. On billing-resolution, quota or
        settlement failures, a dict with ``status="failed"`` and a ``reason``.

    Raises:
        ValueError: If no podcast with ``podcast_id`` exists.
        Exception: Any other error is re-raised after the row is marked FAILED.
    """
    async with get_celery_session_maker()() as session:
        result = await session.execute(select(Podcast).filter(Podcast.id == podcast_id))
        podcast = result.scalars().first()
        if not podcast:
            raise ValueError(f"Podcast {podcast_id} not found")
        try:
            # Persist the in-progress status before the long-running work.
            podcast.status = PodcastStatus.GENERATING
            await session.commit()
            try:
                (
                    owner_user_id,
                    billing_tier,
                    base_model,
                ) = await _resolve_agent_billing_for_search_space(
                    session,
                    search_space_id,
                    thread_id=podcast.thread_id,
                )
            except ValueError as resolve_err:
                # Billing could not be resolved: fail the podcast and report it
                # through the return value instead of raising.
                logger.error(
                    "Podcast %s: cannot resolve billing for search_space=%s: %s",
                    podcast.id,
                    search_space_id,
                    resolve_err,
                )
                podcast.status = PodcastStatus.FAILED
                await session.commit()
                return {
                    "status": "failed",
                    "podcast_id": podcast.id,
                    "reason": "billing_resolution_failed",
                }
            graph_config = {
                "configurable": {
                    "podcast_title": podcast.title,
                    "search_space_id": search_space_id,
                    "user_prompt": user_prompt,
                }
            }
            initial_state = PodcasterState(
                source_content=source_content,
                db_session=session,
            )
            try:
                # The graph run happens inside the billing context manager.
                async with billable_call(
                    user_id=owner_user_id,
                    search_space_id=search_space_id,
                    billing_tier=billing_tier,
                    base_model=base_model,
                    quota_reserve_micros_override=app_config.QUOTA_DEFAULT_PODCAST_RESERVE_MICROS,
                    usage_type="podcast_generation",
                    call_details={
                        "podcast_id": podcast.id,
                        "title": podcast.title,
                        "thread_id": podcast.thread_id,
                    },
                    billable_session_factory=_celery_billable_session,
                ):
                    graph_result = await podcaster_graph.ainvoke(
                        initial_state, config=graph_config
                    )
            except QuotaInsufficientError as exc:
                # Out of credits is logged at info level and reported as a
                # failed result rather than raised.
                logger.info(
                    "Podcast %s denied: out of credits (balance=%d remaining=%d)",
                    podcast.id,
                    exc.balance_micros,
                    exc.remaining_micros,
                )
                podcast.status = PodcastStatus.FAILED
                await session.commit()
                return {
                    "status": "failed",
                    "podcast_id": podcast.id,
                    "reason": "premium_quota_exhausted",
                }
            except BillingSettlementError:
                logger.exception(
                    "Podcast %s: premium billing settlement failed",
                    podcast.id,
                )
                podcast.status = PodcastStatus.FAILED
                await session.commit()
                return {
                    "status": "failed",
                    "podcast_id": podcast.id,
                    "reason": "billing_settlement_failed",
                }
            podcast_transcript = graph_result.get("podcast_transcript", [])
            file_path = graph_result.get("final_podcast_file_path", "")
            # Transcript entries may be objects exposing speaker_id/dialog
            # attributes or dict-like values; normalise both into plain dicts
            # before storing them on the row.
            serializable_transcript = []
            for entry in podcast_transcript:
                if hasattr(entry, "speaker_id"):
                    serializable_transcript.append(
                        {"speaker_id": entry.speaker_id, "dialog": entry.dialog}
                    )
                else:
                    serializable_transcript.append(
                        {
                            "speaker_id": entry.get("speaker_id", 0),
                            "dialog": entry.get("dialog", ""),
                        }
                    )
            podcast.podcast_transcript = serializable_transcript
            podcast.file_location = file_path
            podcast.status = PodcastStatus.READY
            logger.info(
                "Podcast %s: committing READY transcript_entries=%d file=%s",
                podcast.id,
                len(serializable_transcript),
                file_path,
            )
            await session.commit()
            logger.info("Podcast %s: READY commit complete", podcast.id)
            logger.info(f"Successfully generated podcast: {podcast.id}")
            return {
                "status": "ready",
                "podcast_id": podcast.id,
                "title": podcast.title,
                "transcript_entries": len(serializable_transcript),
            }
        except Exception as e:
            # Catch-all: record the failure on the row, then let the error
            # propagate to the caller.
            # NOTE(review): if the failure came from the session itself, this
            # commit may need a rollback first - confirm.
            logger.error(f"Error in _generate_content_podcast: {e!s}")
            podcast.status = PodcastStatus.FAILED
            await session.commit()
            raise

View file

@ -15,22 +15,28 @@ def iter_completion_emission_frames(
out = ctx.tool_output
payload = out if isinstance(out, dict) else {"result": out}
yield ctx.emit_tool_output_card(payload)
if isinstance(out, dict) and out.get("status") in (
status = out.get("status") if isinstance(out, dict) else None
title = out.get("title", "Podcast") if isinstance(out, dict) else "Podcast"
if status in (
"awaiting_brief",
"awaiting_review",
"pending",
"generating",
"processing",
"drafting",
"rendering",
):
# This line is persisted with the chat while the podcast keeps moving,
# so it must stay true after the lifecycle outgrows today's status.
yield ctx.streaming_service.format_terminal_info(
f"Podcast queued: {out.get('title', 'Podcast')}",
f"Podcast created: {title}",
"success",
)
elif isinstance(out, dict) and out.get("status") in ("ready", "success"):
elif status in ("ready", "success"):
yield ctx.streaming_service.format_terminal_info(
f"Podcast generated successfully: {out.get('title', 'Podcast')}",
f"Podcast generated successfully: {title}",
"success",
)
elif isinstance(out, dict) and out.get("status") in ("failed", "error"):
error_msg = out.get("error", "Unknown error")
elif status in ("failed", "error"):
error_msg = out.get("error", "Unknown error") if isinstance(out, dict) else "Unknown error"
yield ctx.streaming_service.format_terminal_info(
f"Podcast generation failed: {error_msg}",
"error",

View file

@ -24,11 +24,11 @@ def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking
d.get("source_content", "") if isinstance(tool_input, dict) else ""
)
return ToolStartThinking(
title="Generating podcast",
title="Preparing podcast",
items=[
f"Title: {podcast_title}",
f"Content: {content_len:,} characters",
"Preparing audio generation...",
"Proposing brief (language, voices, length)...",
],
)
@ -50,17 +50,19 @@ def resolve_completed_thinking(
if isinstance(tool_output, dict)
else "Podcast"
)
if podcast_status in ("pending", "generating", "processing"):
if podcast_status in (
"awaiting_brief",
"awaiting_review",
"pending",
"drafting",
"rendering",
):
# Persisted with the chat while the podcast keeps moving, so the copy
# must stay true after the lifecycle outgrows today's status.
completed = [
f"Title: {podcast_title}",
"Podcast generation started",
"Processing in background...",
]
elif podcast_status == "already_generating":
completed = [
f"Title: {podcast_title}",
"Podcast already in progress",
"Please wait for it to complete",
"Podcast created",
"Review and progress continue on the podcast card",
]
elif podcast_status in ("failed", "error"):
error_msg = (
@ -79,4 +81,4 @@ def resolve_completed_thinking(
]
else:
completed = items
return ("Generating podcast", completed)
return ("Preparing podcast", completed)

View file

@ -52,6 +52,22 @@ AUTOMATION_RUN_COLS = [
"created_at",
]
# Enough to drive the lifecycle UI by push: status, the reviewable brief, and
# its version. The bulky source_content and transcript are deliberately excluded
# and fetched over REST when a gate opens.
PODCAST_COLS = [
"id",
"title",
"status",
"spec",
"spec_version",
"duration_seconds",
"error",
"search_space_id",
"thread_id",
"created_at",
]
ZERO_PUBLICATION: Mapping[str, Sequence[str] | None] = {
"notifications": None,
"documents": DOCUMENT_COLS,
@ -62,6 +78,7 @@ ZERO_PUBLICATION: Mapping[str, Sequence[str] | None] = {
"chat_session_state": None,
"user": USER_COLS,
"automation_runs": AUTOMATION_RUN_COLS,
"podcasts": PODCAST_COLS,
}
@ -89,7 +106,9 @@ def _expected_columns(conn: Connection, table: str) -> list[str] | None:
return None
expected = list(columns)
if table in {"documents", "user"} and _column_exists(conn, table, "_0_version"):
if table in {"documents", "user", "podcasts"} and _column_exists(
conn, table, "_0_version"
):
expected.append("_0_version")
return expected

View file

@ -0,0 +1,323 @@
"""Podcast API + task integration fixtures.
The app's DB session and current-user dependencies ride the test's transactional
`db_session`, so seeded rows and rows touched through the endpoints (or the task
bodies) share one transaction that rolls back per test. Only true externals are
faked: the Celery broker (`*_task.delay`) is captured instead of dispatched, the
object store is a tiny in-memory backend, the Celery tasks' own session maker is
bound to the test transaction, and for the render task the TTS provider and
the FFmpeg merge are stubbed. `TTS_SERVICE` is pinned so the deterministic brief
proposal can resolve voices.
"""
from __future__ import annotations
import contextlib
import uuid
from collections.abc import AsyncGenerator, AsyncIterator
from pathlib import Path
import httpx
import pytest
import pytest_asyncio
from httpx import ASGITransport
from sqlalchemy.ext.asyncio import AsyncSession
from app.app import app, limiter
from app.config import config as app_config
from app.db import SearchSpace, User, get_async_session
from app.routes.search_spaces_routes import create_default_roles_and_membership
from app.podcasts.persistence import Podcast, PodcastStatus
from app.podcasts.schemas import (
DurationTarget,
PodcastSpec,
PodcastStyle,
SpeakerRole,
SpeakerSpec,
Transcript,
TranscriptTurn,
)
from app.podcasts.service import PodcastService
from app.podcasts.tts import SynthesisRequest, SynthesizedAudio, TextToSpeech
from app.users import current_active_user
pytestmark = pytest.mark.integration
limiter.enabled = False
@pytest_asyncio.fixture
async def client(
    db_session: AsyncSession,
    db_user: User,
) -> AsyncGenerator[httpx.AsyncClient, None]:
    """HTTP client against the app, sharing the test's session and user.

    The session and current-user dependencies are overridden for the duration
    of the test; the overrides that were in place before are restored on
    teardown.
    """

    async def _session_dependency() -> AsyncGenerator[AsyncSession, None]:
        yield db_session

    async def _user_dependency() -> User:
        return db_user

    saved_overrides = app.dependency_overrides.copy()
    app.dependency_overrides[get_async_session] = _session_dependency
    app.dependency_overrides[current_active_user] = _user_dependency
    try:
        async with httpx.AsyncClient(
            transport=ASGITransport(app=app),
            base_url="http://test",
            timeout=30.0,
            follow_redirects=False,
        ) as http_client:
            yield http_client
    finally:
        # Reset to exactly the overrides that existed before this fixture ran.
        app.dependency_overrides.clear()
        app.dependency_overrides.update(saved_overrides)
@pytest.fixture(autouse=True)
def tts_service(monkeypatch) -> str:
    """Pin ``TTS_SERVICE`` to a provider with language-agnostic voices.

    This lets the brief proposal resolve voices; the pinned value is returned.
    """
    pinned = "openai/tts-1"
    monkeypatch.setattr(app_config, "TTS_SERVICE", pinned)
    return pinned
class CapturedTasks:
    """Holds the arguments each podcast Celery task was enqueued with.

    ``draft`` and ``render`` are separate lists, one entry per enqueue.
    """

    def __init__(self) -> None:
        self.draft: list[tuple] = []
        self.render: list[tuple] = []
@pytest.fixture(autouse=True)
def captured_tasks(monkeypatch) -> CapturedTasks:
    """Replace each task's ``delay`` with a recorder so no broker is contacted."""
    from app.podcasts.tasks import draft_transcript_task, render_audio_task

    recorder = CapturedTasks()

    def _record_draft(*args, **kwargs) -> None:
        recorder.draft.append((args, kwargs))

    def _record_render(*args, **kwargs) -> None:
        recorder.render.append((args, kwargs))

    monkeypatch.setattr(draft_transcript_task, "delay", _record_draft)
    monkeypatch.setattr(render_audio_task, "delay", _record_render)
    return recorder
class FakeStorageBackend:
"""In-memory object store standing in for the real audio backend."""
backend_name = "memory"
def __init__(self) -> None:
self.objects: dict[str, bytes] = {}
self.deleted: list[str] = []
async def put(self, key: str, data: bytes, content_type: str | None = None) -> None:
self.objects[key] = data
async def open_stream(self, key: str) -> AsyncIterator[bytes]:
yield self.objects.get(key, b"audio-bytes")
async def delete(self, key: str) -> None:
self.deleted.append(key)
@pytest.fixture
def fake_storage(monkeypatch) -> FakeStorageBackend:
    """Point both storage-backend lookups at one in-memory backend."""
    backend = FakeStorageBackend()
    lookup_targets = (
        "app.podcasts.storage.get_storage_backend",
        "app.file_storage.factory.get_storage_backend",
    )
    for target in lookup_targets:
        monkeypatch.setattr(target, lambda: backend)
    return backend
@pytest.fixture
def bind_task_session(db_session: AsyncSession, monkeypatch) -> AsyncSession:
    """Make the Celery task modules use the test's session.

    Task bodies call ``get_celery_session_maker()()`` instead of being handed a
    session. The patched maker returns a context manager that yields the test's
    session and does not close it on exit, so a task's ``commit()`` releases a
    savepoint and the per-test rollback still cleans up.
    """

    @contextlib.asynccontextmanager
    async def _borrowed_session() -> AsyncIterator[AsyncSession]:
        yield db_session

    def _session_maker():
        return _borrowed_session()

    def _get_session_maker():
        return _session_maker

    task_modules = (
        "app.podcasts.tasks.draft",
        "app.podcasts.tasks.render",
        "app.podcasts.tasks.runtime",
    )
    for module in task_modules:
        monkeypatch.setattr(
            f"{module}.get_celery_session_maker", _get_session_maker
        )
    return db_session
class FakeTextToSpeech(TextToSpeech):
    """TTS stand-in that returns the same fixed bytes for every segment.

    Every request is appended to ``requests`` so tests can count how many
    times synthesis was invoked.
    """

    def __init__(self) -> None:
        self.requests: list[SynthesisRequest] = []

    @property
    def container(self) -> str:
        """The container format of the fake audio."""
        return "mp3"

    async def synthesize(self, request: SynthesisRequest) -> SynthesizedAudio:
        """Record ``request`` and return the canned audio."""
        self.requests.append(request)
        canned = SynthesizedAudio(data=b"segment-audio", container="mp3")
        return canned
@pytest.fixture
def fake_tts(monkeypatch) -> FakeTextToSpeech:
    """Make the render task obtain a ``FakeTextToSpeech`` as its provider."""
    fake = FakeTextToSpeech()

    def _get_fake() -> FakeTextToSpeech:
        return fake

    monkeypatch.setattr("app.podcasts.tasks.render.get_text_to_speech", _get_fake)
    return fake
@pytest.fixture
def fake_merge(monkeypatch) -> None:
    """Replace the FFmpeg-based merge with one that writes fixed bytes."""

    async def _write_fixed_audio(segment_paths: list[Path], output_path: Path) -> None:
        # The segments are ignored; only the output file is produced.
        output_path.write_bytes(b"merged-audio")

    monkeypatch.setattr(
        "app.podcasts.rendering.renderer.concat_to_mp3", _write_fixed_audio
    )
def build_spec(
    *,
    language: str = "en",
    voice_ids: tuple[str, str] = ("openai:alloy", "openai:nova"),
) -> PodcastSpec:
    """Build a two-speaker brief (host in slot 0, guest in slot 1).

    Callers pass only the parts a test cares about; the rest uses defaults.
    """
    cast = (("Host", SpeakerRole.HOST), ("Guest", SpeakerRole.GUEST))
    speakers = [
        SpeakerSpec(slot=slot, name=name, role=role, voice_id=voice_ids[slot])
        for slot, (name, role) in enumerate(cast)
    ]
    target_length = DurationTarget(min_minutes=10, max_minutes=20)
    return PodcastSpec(
        language=language,
        style=PodcastStyle.CONVERSATIONAL,
        speakers=speakers,
        duration=target_length,
    )
def build_transcript() -> Transcript:
    """Build a two-turn transcript: speaker 0 opens, speaker 1 replies."""
    lines = (
        (0, "Welcome to the show."),
        (1, "Glad to be here."),
    )
    return Transcript(
        turns=[TranscriptTurn(speaker=speaker, text=text) for speaker, text in lines]
    )
@pytest.fixture
def make_podcast(db_session: AsyncSession):
    """Factory that creates a podcast and walks it to a requested status.

    The row is built through ``PodcastService`` on the test's session, one
    lifecycle step at a time, rather than by writing columns directly.
    """
    # Steps are applied in this order, up to and including the target status.
    ladder = (
        PodcastStatus.AWAITING_BRIEF,
        PodcastStatus.DRAFTING,
        PodcastStatus.RENDERING,
        PodcastStatus.READY,
    )

    async def _advance(service: PodcastService, podcast: Podcast, rung) -> None:
        """Apply the service call that moves ``podcast`` onto ``rung``."""
        if rung is PodcastStatus.AWAITING_BRIEF:
            await service.attach_brief(podcast, build_spec())
        elif rung is PodcastStatus.DRAFTING:
            await service.begin_drafting(podcast)
        elif rung is PodcastStatus.RENDERING:
            await service.attach_transcript(podcast, build_transcript())
        elif rung is PodcastStatus.READY:
            await service.attach_audio(
                podcast,
                storage_backend="memory",
                storage_key="podcasts/audio.mp3",
                duration_seconds=123,
            )

    async def _make(
        *,
        search_space_id: int,
        status: PodcastStatus = PodcastStatus.AWAITING_BRIEF,
        title: str = "Test Podcast",
        thread_id: int | None = None,
    ) -> Podcast:
        service = PodcastService(db_session)
        podcast = await service.create(
            title=title, search_space_id=search_space_id, thread_id=thread_id
        )
        # A PENDING podcast is just the freshly created row.
        if status is not PodcastStatus.PENDING:
            last_rung = ladder.index(status)
            for rung in ladder[: last_rung + 1]:
                await _advance(service, podcast, rung)
        await db_session.flush()
        return podcast

    return _make
@pytest.fixture
def act_as():
    """Return a callable that changes which user later requests run as.

    The ``client`` fixture installs db_user and restores the prior overrides on
    teardown, so the change made here does not outlive the test.
    """

    def _switch_user(user: User) -> None:
        def _current_user() -> User:
            return user

        app.dependency_overrides[current_active_user] = _current_user

    return _switch_user
@pytest_asyncio.fixture
async def db_other_user(db_session: AsyncSession) -> User:
    """Persist a second user who has no membership in ``db_search_space``."""
    stranger = User(
        id=uuid.uuid4(),
        email="stranger@surfsense.net",
        hashed_password="hashed",
        is_active=True,
        is_superuser=False,
        is_verified=True,
    )
    db_session.add(stranger)
    await db_session.flush()
    return stranger
@pytest_asyncio.fixture
async def foreign_podcast(
    db_session: AsyncSession, db_other_user: User, make_podcast
) -> Podcast:
    """Create a podcast inside a search space that belongs to the other user.

    db_user has no membership in that space, so the podcast is out of reach.
    """
    stranger_space = SearchSpace(name="Stranger Space", user_id=db_other_user.id)
    db_session.add(stranger_space)
    await db_session.flush()

    await create_default_roles_and_membership(
        db_session, stranger_space.id, db_other_user.id
    )
    await db_session.flush()

    return await make_podcast(search_space_id=stranger_space.id, title="Foreign")

View file

@ -0,0 +1,80 @@
"""The brief review gate: edit the spec, then approve to start drafting.
Covers what the user can do while ``awaiting_brief`` (edit the brief under
optimistic concurrency, and approve it) and the HTTP status codes the service's
guards map to when an edit races or comes too late.
"""
from __future__ import annotations
import pytest
pytestmark = pytest.mark.integration
BASE = "/api/v1/podcasts"
async def _create(client, search_space_id: int) -> dict:
    """POST a new podcast and return the response body, asserting a 201."""
    payload = {
        "title": "Episode",
        "search_space_id": search_space_id,
        "source_content": "Source content.",
    }
    created = await client.post(BASE, json=payload)
    assert created.status_code == 201
    return created.json()
async def test_approve_brief_starts_drafting_and_enqueues_draft(
    client, db_search_space, captured_tasks
):
    """Approving the brief moves to ``drafting`` and enqueues only the draft task."""
    podcast = await _create(client, db_search_space.id)
    podcast_id = podcast["id"]

    approval = await client.post(f"{BASE}/{podcast_id}/brief/approve")

    assert approval.status_code == 200
    assert approval.json()["status"] == "drafting"
    expected_draft_call = ((podcast_id, db_search_space.id), {})
    assert captured_tasks.draft == [expected_draft_call]
    assert captured_tasks.render == []
async def test_update_spec_bumps_version_and_persists(client, db_search_space):
    """Editing the brief stores the change, bumps the version, keeps the gate open."""
    podcast = await _create(client, db_search_space.id)
    original_version = podcast["spec_version"]
    edited_spec = podcast["spec"]
    edited_spec["focus"] = "A sharper angle"

    patched = await client.patch(
        f"{BASE}/{podcast['id']}/spec",
        json={"spec": edited_spec, "expected_version": original_version},
    )

    assert patched.status_code == 200
    updated = patched.json()
    assert updated["spec_version"] == original_version + 1
    assert updated["spec"]["focus"] == "A sharper angle"
    assert updated["status"] == "awaiting_brief"
async def test_update_spec_with_stale_version_conflicts(client, db_search_space):
    """An edit whose ``expected_version`` does not match is answered with 409."""
    podcast = await _create(client, db_search_space.id)
    stale_edit = {"spec": podcast["spec"], "expected_version": 999}

    rejected = await client.patch(f"{BASE}/{podcast['id']}/spec", json=stale_edit)

    assert rejected.status_code == 409
async def test_update_spec_after_approval_is_rejected(client, db_search_space):
    """Once the brief is approved, a further spec edit is answered with 409."""
    podcast = await _create(client, db_search_space.id)

    # Check the setup step: if approval itself failed, the 409 assertion below
    # would fail for an unrelated and misleading reason.
    approval = await client.post(f"{BASE}/{podcast['id']}/brief/approve")
    assert approval.status_code == 200

    resp = await client.patch(
        f"{BASE}/{podcast['id']}/spec",
        json={"spec": podcast["spec"], "expected_version": podcast["spec_version"]},
    )
    assert resp.status_code == 409

View file

@ -0,0 +1,59 @@
"""Cancelling a podcast: allowed while in flight, refused once an episode exists.
Cancellation is the escape hatch for a podcast that has produced nothing yet.
Once a finished episode exists (including during a regeneration, whose audio
survives until a new render commits), cancel is refused (409): reverting the
regeneration is the way back, and no user action may destroy playable audio.
"""
import pytest
from app.podcasts.persistence import PodcastStatus
pytestmark = pytest.mark.integration
BASE = "/api/v1/podcasts"
async def test_cancel_from_a_live_state_succeeds(
    client, db_search_space, make_podcast
):
    """Cancelling a podcast waiting at the brief gate returns 200 and ``cancelled``."""
    in_flight = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.AWAITING_BRIEF
    )

    response = await client.post(f"{BASE}/{in_flight.id}/cancel")

    assert response.status_code == 200
    assert response.json()["status"] == "cancelled"
async def test_cancel_from_a_terminal_state_conflicts(
    client, db_search_space, make_podcast
):
    """Cancelling a READY podcast is answered 409."""
    finished = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.READY
    )

    response = await client.post(f"{BASE}/{finished.id}/cancel")

    assert response.status_code == 409
async def test_cancel_of_a_regeneration_is_rejected(
    client, db_search_space, make_podcast
):
    """Cancel during a regeneration is a 409, and the revert still works afterwards."""
    # Cancelling here would destroy a playable episode; reverting the
    # regeneration is the way back.
    episode = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.READY
    )
    await client.post(f"{BASE}/{episode.id}/transcript/regenerate")

    cancel_response = await client.post(f"{BASE}/{episode.id}/cancel")
    assert cancel_response.status_code == 409

    # The regeneration is still revertable afterwards.
    revert_response = await client.post(f"{BASE}/{episode.id}/regenerate/revert")
    assert revert_response.status_code == 200
    assert revert_response.json()["status"] == "ready"

View file

@ -0,0 +1,51 @@
"""Creating a podcast proposes a brief and opens the review gate.
Driven through the real POST endpoint (auth + DB on one transaction): the row is
created, a brief is proposed inline from defaults, and the podcast lands in
``awaiting_brief`` with a complete spec and nothing generated yet.
"""
from __future__ import annotations
import pytest
pytestmark = pytest.mark.integration
BASE = "/api/v1/podcasts"
async def test_create_proposes_brief_and_opens_gate(client, db_search_space):
    """POSTing a new podcast returns 201 with a default brief and nothing generated."""
    payload = {
        "title": "My Episode",
        "search_space_id": db_search_space.id,
        "source_content": "A long piece of source content about a topic.",
    }

    response = await client.post(BASE, json=payload)

    assert response.status_code == 201
    created = response.json()
    assert created["title"] == "My Episode"
    assert created["status"] == "awaiting_brief"
    assert created["spec_version"] == 1
    # The proposed brief is present with its defaults.
    assert created["spec"] is not None
    assert created["spec"]["language"] == "en"
    assert len(created["spec"]["speakers"]) == 2
    # No transcript or audio exists yet.
    assert created["transcript"] is None
    assert created["has_audio"] is False
async def test_create_honors_requested_speaker_count(client, db_search_space):
    """A ``speaker_count`` of 3 in the request yields a brief with three speakers."""
    payload = {
        "title": "Solo",
        "search_space_id": db_search_space.id,
        "source_content": "Content.",
        "speaker_count": 3,
    }

    response = await client.post(BASE, json=payload)

    assert response.status_code == 201
    proposed_speakers = response.json()["spec"]["speakers"]
    assert len(proposed_speakers) == 3

View file

@ -0,0 +1,117 @@
"""The transcript-drafting task against a real database.
Drafting is the expensive LLM step, so it runs under ``billable_call``. The
behavior that protects users' money: when billing succeeds, the drafted
transcript is stored and rendering starts immediately (DRAFTING -> RENDERING,
render task enqueued — the brief gate is the only approval); when billing denies
or settlement fails, the podcast ends FAILED with no transcript left behind. The
DB, service, and transcript persistence run for real; only the true externals
are faked — billing (the metering boundary) and the generation graph (the LLM).
"""
from __future__ import annotations
from contextlib import asynccontextmanager
from types import SimpleNamespace
from uuid import uuid4
import pytest
from app.podcasts.persistence import PodcastStatus
from app.podcasts.service import read_transcript
from app.podcasts.tasks import draft
from app.services.billable_calls import (
BillingSettlementError,
QuotaInsufficientError,
)
from .conftest import build_transcript
pytestmark = pytest.mark.integration
def _wire_billing(monkeypatch, *, billable_call, transcript=None) -> None:
    """Swap the ``draft`` module's billing and LLM collaborators for test doubles.

    Three attributes on ``draft`` are patched: the billing resolver (returns a
    fresh UUID, ``"free"`` and ``"openrouter/model"``), ``billable_call`` (the
    double supplied by the caller), and ``transcript_graph`` (an object whose
    ``ainvoke`` returns ``{"transcript": transcript}``).
    """

    async def _fake_resolver(_session, _search_space_id, *, thread_id=None):
        return uuid4(), "free", "openrouter/model"

    async def _fake_ainvoke(_state, config=None):
        return {"transcript": transcript}

    doubles = {
        "_resolve_agent_billing_for_search_space": _fake_resolver,
        "billable_call": billable_call,
        "transcript_graph": SimpleNamespace(ainvoke=_fake_ainvoke),
    }
    for attribute, double in doubles.items():
        monkeypatch.setattr(draft, attribute, double)
async def test_successful_draft_stores_transcript_and_starts_rendering(
    monkeypatch, db_search_space, make_podcast, bind_task_session, captured_tasks
):
    """With billing succeeding, the draft is stored and a render task is queued."""
    drafting = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.DRAFTING
    )

    @asynccontextmanager
    async def _billing_ok(**_kwargs):
        yield SimpleNamespace()

    _wire_billing(
        monkeypatch, billable_call=_billing_ok, transcript=build_transcript()
    )

    outcome = await draft._draft_transcript(drafting.id, db_search_space.id)

    assert outcome["status"] == "rendering"
    assert drafting.status == PodcastStatus.RENDERING
    assert read_transcript(drafting) is not None
    assert captured_tasks.render == [((drafting.id,), {})]
async def test_quota_denial_fails_the_podcast_without_a_transcript(
    monkeypatch, db_search_space, make_podcast, bind_task_session
):
    """A quota error on entering billing ends the podcast FAILED with no transcript."""
    drafting = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.DRAFTING
    )

    @asynccontextmanager
    async def _quota_exhausted(**_kwargs):
        raise QuotaInsufficientError(
            usage_type="podcast_generation",
            used_micros=5_000_000,
            limit_micros=5_000_000,
            remaining_micros=0,
        )
        yield  # pragma: no cover - unreachable, satisfies the CM protocol

    _wire_billing(monkeypatch, billable_call=_quota_exhausted)

    outcome = await draft._draft_transcript(drafting.id, db_search_space.id)

    assert outcome["reason"] == "quota"
    assert drafting.status == PodcastStatus.FAILED
    assert read_transcript(drafting) is None
async def test_billing_settlement_failure_fails_the_podcast(
    monkeypatch, db_search_space, make_podcast, bind_task_session
):
    """A settlement error raised when billing exits ends the podcast FAILED."""
    drafting = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.DRAFTING
    )

    @asynccontextmanager
    async def _fails_on_exit(**_kwargs):
        yield SimpleNamespace()
        # Raised after the wrapped body finished, i.e. while settling the call.
        raise BillingSettlementError(
            usage_type="podcast_generation",
            user_id=uuid4(),
            cause=RuntimeError("finalize failed"),
        )

    _wire_billing(
        monkeypatch, billable_call=_fails_on_exit, transcript=build_transcript()
    )

    outcome = await draft._draft_transcript(drafting.id, db_search_space.id)

    assert outcome["reason"] == "billing"
    assert drafting.status == PodcastStatus.FAILED

View file

@ -0,0 +1,64 @@
"""Public (unauthenticated) podcast streaming from a chat snapshot.
A shared chat snapshot carries each podcast's stored-audio key; the public route
streams those bytes from the object store via ``share_token`` with no auth. A
podcast that isn't in the snapshot is a 404.
"""
import pytest
from app.db import NewChatThread, PublicChatSnapshot, User
pytestmark = pytest.mark.integration
async def _snapshot(db_session, *, search_space_id, user: User, token: str, podcasts):
    """Persist a shared chat thread and a public snapshot that lists ``podcasts``.

    The snapshot is stored under ``share_token=token`` with ``podcasts`` placed
    at ``snapshot_data["podcasts"]``. Nothing is returned.
    """
    shared_thread = NewChatThread(
        title="Shared", search_space_id=search_space_id, created_by_id=user.id
    )
    db_session.add(shared_thread)
    # Flushed before the snapshot below reads ``shared_thread.id``.
    await db_session.flush()

    public_snapshot = PublicChatSnapshot(
        thread_id=shared_thread.id,
        share_token=token,
        content_hash=f"hash-{token}",
        message_ids=[],
        snapshot_data={"podcasts": podcasts},
    )
    db_session.add(public_snapshot)
    await db_session.flush()
async def test_public_stream_serves_audio_via_storage_key(
    client, db_session, db_search_space, db_user, fake_storage
):
    """The public route streams the bytes stored under the snapshot's storage key."""
    snapshot_entry = {"original_id": 555, "storage_key": "podcasts/x.mp3"}
    await _snapshot(
        db_session,
        search_space_id=db_search_space.id,
        user=db_user,
        token="tok-audio",
        podcasts=[snapshot_entry],
    )
    fake_storage.objects["podcasts/x.mp3"] = b"public-audio"

    response = await client.get("/api/v1/public/tok-audio/podcasts/555/stream")

    assert response.status_code == 200
    assert response.headers["content-type"] == "audio/mpeg"
    assert response.content == b"public-audio"
async def test_public_stream_404_when_podcast_absent_from_snapshot(
    client, db_session, db_search_space, db_user
):
    """Requesting a podcast id the snapshot does not list is answered 404."""
    await _snapshot(
        db_session,
        search_space_id=db_search_space.id,
        user=db_user,
        token="tok-empty",
        podcasts=[],
    )

    response = await client.get("/api/v1/public/tok-empty/podcasts/999/stream")

    assert response.status_code == 404

View file

@ -0,0 +1,202 @@
"""Regeneration: the listen-then-redo loop after the brief gate.
A user who dislikes the finished audio sends the episode back to the brief
gate: the saved brief reopens for tweaks (voices, length, focus) and drafting
only restarts on a fresh approval. The whole redo can also be reverted at any
point before the new render commits, falling back to the still-stored episode.
These pin the READY -> AWAITING_BRIEF -> DRAFTING round trip, the revert
fallback, and the 409s for acting from states that have nothing to redo or
revert.
"""
from __future__ import annotations
import pytest
from app.podcasts.persistence import Podcast, PodcastStatus
from app.podcasts.service import PodcastService
from .conftest import build_transcript
pytestmark = pytest.mark.integration
BASE = "/api/v1/podcasts"
async def test_regenerate_from_ready_reopens_the_brief_gate(
    client, db_search_space, make_podcast, captured_tasks
):
    """Regenerating a READY episode returns it to ``awaiting_brief`` without queuing work."""
    episode = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.READY
    )

    response = await client.post(f"{BASE}/{episode.id}/transcript/regenerate")

    assert response.status_code == 200
    reopened = response.json()
    assert reopened["status"] == "awaiting_brief"
    # The prior brief is kept as the starting point for the new take.
    assert reopened["spec"] is not None
    # Nothing drafts until the user approves the reopened brief.
    assert captured_tasks.draft == []
    assert captured_tasks.render == []
async def test_approving_the_reopened_brief_starts_a_fresh_draft(
    client, db_search_space, make_podcast, captured_tasks
):
    """Approving after a regenerate reports ``drafting`` and queues one draft task."""
    episode = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.READY
    )
    await client.post(f"{BASE}/{episode.id}/transcript/regenerate")

    response = await client.post(f"{BASE}/{episode.id}/brief/approve")

    assert response.status_code == 200
    assert response.json()["status"] == "drafting"
    expected_call = ((episode.id, db_search_space.id), {})
    assert captured_tasks.draft == [expected_call]
async def test_regenerate_from_brief_gate_is_rejected(
    client, db_search_space, make_podcast, captured_tasks
):
    """Regenerating a podcast still at its first brief gate is answered 409."""
    # Nothing has been drafted yet, so there is nothing to regenerate.
    at_gate = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.AWAITING_BRIEF
    )

    response = await client.post(f"{BASE}/{at_gate.id}/transcript/regenerate")

    assert response.status_code == 409
    assert captured_tasks.draft == []
async def test_regenerate_from_cancelled_is_rejected(
    client, db_search_space, make_podcast, captured_tasks
):
    """Regenerating after a cancel request is answered 409 and queues no draft."""
    at_gate = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.AWAITING_BRIEF
    )
    await client.post(f"{BASE}/{at_gate.id}/cancel")

    response = await client.post(f"{BASE}/{at_gate.id}/transcript/regenerate")

    assert response.status_code == 409
    assert captured_tasks.draft == []
async def test_reverting_a_regeneration_restores_the_ready_episode(
    client, db_search_space, make_podcast, captured_tasks
):
    """Reverting straight after a regenerate returns ``ready`` with audio and no tasks."""
    episode = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.READY
    )
    await client.post(f"{BASE}/{episode.id}/transcript/regenerate")

    response = await client.post(f"{BASE}/{episode.id}/regenerate/revert")

    assert response.status_code == 200
    restored = response.json()
    assert restored["status"] == "ready"
    # The episode the user could already play is untouched.
    assert restored["has_audio"] is True
    assert captured_tasks.draft == []
    assert captured_tasks.render == []
async def test_reverting_mid_draft_keeps_the_episode(
    client, db_search_space, make_podcast
):
    """A revert issued after the reopened brief was approved still returns ``ready``."""
    # Changing one's mind is allowed even after the reopened brief was
    # approved: the episode survives until a new render replaces it.
    episode = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.READY
    )
    await client.post(f"{BASE}/{episode.id}/transcript/regenerate")
    await client.post(f"{BASE}/{episode.id}/brief/approve")

    response = await client.post(f"{BASE}/{episode.id}/regenerate/revert")

    assert response.status_code == 200
    assert response.json()["status"] == "ready"
async def test_reverting_mid_render_keeps_the_episode(
    client, db_session, db_search_space, make_podcast
):
    """A revert issued after a new transcript was attached still returns ``ready``."""
    episode = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.READY
    )
    # Walk the redo forward through the service: regenerate, draft, attach.
    lifecycle = PodcastService(db_session)
    await lifecycle.regenerate(episode)
    await lifecycle.begin_drafting(episode)
    await lifecycle.attach_transcript(episode, build_transcript())

    response = await client.post(f"{BASE}/{episode.id}/regenerate/revert")

    assert response.status_code == 200
    assert response.json()["status"] == "ready"
async def test_reverted_episode_can_be_regenerated_again(
    client, db_search_space, make_podcast
):
    """After regenerate then revert, a second regenerate is accepted again."""
    # Reverting must not strand the episode: the user can change their mind
    # again immediately.
    episode = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.READY
    )
    await client.post(f"{BASE}/{episode.id}/transcript/regenerate")
    await client.post(f"{BASE}/{episode.id}/regenerate/revert")

    response = await client.post(f"{BASE}/{episode.id}/transcript/regenerate")

    assert response.status_code == 200
    assert response.json()["status"] == "awaiting_brief"
async def test_revert_on_a_fresh_brief_gate_is_rejected(
    client, db_search_space, make_podcast
):
    """Reverting at a first-time brief gate is a 409 carrying a non-empty detail."""
    # A first-time brief has no regeneration to revert.
    at_gate = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.AWAITING_BRIEF
    )

    response = await client.post(f"{BASE}/{at_gate.id}/regenerate/revert")

    assert response.status_code == 409
    assert response.json()["detail"]
async def test_revert_when_nothing_was_regenerated_is_rejected(
    client, db_search_space, make_podcast
):
    """Reverting a READY episode that was never regenerated is answered 409."""
    episode = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.READY
    )

    response = await client.post(f"{BASE}/{episode.id}/regenerate/revert")

    assert response.status_code == 409
async def test_regenerate_without_a_brief_is_rejected(
    client, db_session, db_search_space, captured_tasks
):
    """Regenerating a READY row created without a spec is a 422 and queues no draft."""
    # Legacy episodes finished before briefs existed; reopening a gate with
    # nothing to review would strand them there.
    legacy = Podcast(
        title="Legacy Episode",
        search_space_id=db_search_space.id,
        status=PodcastStatus.READY,
        spec_version=1,
        file_location="/var/old/podcast.mp3",
    )
    db_session.add(legacy)
    await db_session.flush()

    response = await client.post(f"{BASE}/{legacy.id}/transcript/regenerate")

    assert response.status_code == 422
    assert captured_tasks.draft == []

View file

@ -0,0 +1,100 @@
"""The audio-rendering task against a real database.
From RENDERING, the task synthesises and merges the approved transcript, stores
the bytes, and marks the podcast READY with the storage location recorded. The
DB, service, renderer orchestration, and storage wrapper run for real; the true
externals are faked — the TTS provider, the FFmpeg merge, and the object store.
"""
from __future__ import annotations
import pytest
from app.podcasts.persistence import PodcastStatus
from app.podcasts.service import PodcastService
from app.podcasts.tasks import render
from .conftest import build_transcript
pytestmark = pytest.mark.integration
async def test_render_marks_ready_and_stores_audio(
    db_search_space, make_podcast, bind_task_session, fake_tts, fake_merge, fake_storage
):
    """Rendering from RENDERING ends READY with the merged bytes in the fake store."""
    rendering = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.RENDERING
    )

    outcome = await render._render_audio(rendering.id)

    assert outcome["status"] == "ready"
    assert rendering.status == PodcastStatus.READY
    assert rendering.storage_backend == "memory"
    assert rendering.storage_key
    stored_bytes = fake_storage.objects[rendering.storage_key]
    assert stored_bytes == b"merged-audio"
async def test_rerender_replaces_audio_and_purges_the_old_object(
    db_session,
    db_search_space,
    make_podcast,
    bind_task_session,
    fake_tts,
    fake_merge,
    fake_storage,
):
    """A re-render stores audio under a new key and deletes the previous object."""
    # A regenerated episode keeps exactly one stored object: the new render
    # must not leak the superseded audio in the object store.
    episode = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.READY
    )
    previous_key = episode.storage_key
    fake_storage.objects[previous_key] = b"old-audio"

    lifecycle = PodcastService(db_session)
    await lifecycle.regenerate(episode)
    await lifecycle.begin_drafting(episode)
    await lifecycle.attach_transcript(episode, build_transcript())

    outcome = await render._render_audio(episode.id)

    assert outcome["status"] == "ready"
    assert episode.status == PodcastStatus.READY
    assert episode.storage_key != previous_key
    assert fake_storage.objects[episode.storage_key] == b"merged-audio"
    assert previous_key in fake_storage.deleted
async def test_render_losing_to_a_user_revert_keeps_the_episode_and_leaks_nothing(
    db_session,
    db_search_space,
    make_podcast,
    bind_task_session,
    fake_tts,
    fake_merge,
    fake_storage,
):
    """A render that runs after a revert reports ``superseded`` and keeps the old audio."""
    # The user reverts the regeneration while the render is in flight: the
    # stale render must neither resurrect the redo nor leak the object it
    # already stored.
    episode = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.READY
    )
    previous_key = episode.storage_key
    fake_storage.objects[previous_key] = b"old-audio"

    lifecycle = PodcastService(db_session)
    await lifecycle.regenerate(episode)
    await lifecycle.begin_drafting(episode)
    await lifecycle.attach_transcript(episode, build_transcript())
    await lifecycle.revert_regeneration(episode)

    outcome = await render._render_audio(episode.id)

    assert outcome["status"] == "superseded"
    assert episode.status == PodcastStatus.READY
    assert episode.storage_key == previous_key
    assert previous_key not in fake_storage.deleted
    # Every object other than the original must have been deleted again.
    other_keys = [key for key in fake_storage.objects if key != previous_key]
    undeleted = [key for key in other_keys if key not in fake_storage.deleted]
    assert not undeleted

View file

@ -0,0 +1,53 @@
"""Podcasts are scoped to search-space membership.
A user can only create or read podcasts in spaces they belong to, and an
unscoped listing returns only the caller's own podcasts — never another
member's.
"""
import pytest
pytestmark = pytest.mark.integration
BASE = "/api/v1/podcasts"
async def test_reading_a_podcast_in_a_nonmember_space_is_forbidden(
    client, db_search_space, make_podcast, act_as, db_other_user
):
    """Reading a podcast while acting as the other user is answered 403."""
    existing = await make_podcast(search_space_id=db_search_space.id)
    act_as(db_other_user)

    response = await client.get(f"{BASE}/{existing.id}")

    assert response.status_code == 403
async def test_creating_in_a_nonmember_space_is_forbidden(
    client, db_search_space, act_as, db_other_user
):
    """Creating a podcast in the space while acting as the other user is answered 403."""
    act_as(db_other_user)
    payload = {
        "title": "X",
        "search_space_id": db_search_space.id,
        "source_content": "content",
    }

    response = await client.post(BASE, json=payload)

    assert response.status_code == 403
async def test_listing_returns_only_the_callers_podcasts(
    client, db_search_space, make_podcast, foreign_podcast
):
    """The unscoped listing includes the caller's podcast and omits the foreign one."""
    own = await make_podcast(search_space_id=db_search_space.id, title="Mine")

    response = await client.get(BASE)

    assert response.status_code == 200
    listed_ids = {entry["id"] for entry in response.json()}
    assert own.id in listed_ids
    assert foreign_podcast.id not in listed_ids

View file

@ -0,0 +1,41 @@
"""Streaming a podcast's rendered audio over HTTP.
A ready podcast streams its bytes from the storage backend; a podcast with no
stored audio returns 404. Storage is an in-memory backend (the object store is a
system boundary).
"""
from __future__ import annotations
import pytest
from app.podcasts.persistence import PodcastStatus
pytestmark = pytest.mark.integration
BASE = "/api/v1/podcasts"
async def test_stream_serves_stored_audio(
    client, db_search_space, make_podcast, fake_storage
):
    """A READY podcast streams the bytes stored under its own storage key."""
    podcast = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.READY
    )
    # Seed the store under the key recorded on the row, as the render tests
    # do, instead of a literal that has to mirror the fixture's default.
    fake_storage.objects[podcast.storage_key] = b"the-audio"

    resp = await client.get(f"{BASE}/{podcast.id}/stream")

    assert resp.status_code == 200
    assert resp.headers["content-type"] == "audio/mpeg"
    assert resp.content == b"the-audio"
async def test_stream_404_when_no_audio(client, db_search_space, make_podcast):
    """Streaming a podcast that is still DRAFTING is answered 404."""
    drafting = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.DRAFTING
    )

    response = await client.get(f"{BASE}/{drafting.id}/stream")

    assert response.status_code == 404

View file

@ -0,0 +1,45 @@
"""The task failure safety net (``mark_failed``) against a real database.
When a task body raises, ``mark_failed`` records the reason on the row. Its
contract has two halves worth securing: a still-running podcast moves to FAILED
with the reason, while one that already reached a terminal state is left exactly
as it was rather than forced. A missing row is a no-op, never a crash.
"""
from __future__ import annotations
import pytest
from app.podcasts.persistence import PodcastStatus
from app.podcasts.tasks import runtime
pytestmark = pytest.mark.integration
async def test_marking_failed_records_the_reason_on_a_running_podcast(
    db_search_space, make_podcast, bind_task_session
):
    """``mark_failed`` moves a DRAFTING podcast to FAILED and stores the reason."""
    reason = "tts provider unavailable"
    running = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.DRAFTING
    )

    await runtime.mark_failed(running.id, reason)

    assert running.status == PodcastStatus.FAILED
    assert running.error == reason
async def test_marking_failed_leaves_an_already_terminal_podcast_untouched(
    db_search_space, make_podcast, bind_task_session
):
    """``mark_failed`` on a READY podcast leaves its status READY."""
    finished = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.READY
    )

    await runtime.mark_failed(finished.id, "too late")

    assert finished.status == PodcastStatus.READY
async def test_marking_a_missing_podcast_failed_is_a_no_op(bind_task_session):
    """``mark_failed`` with an id that matches no row completes without raising."""
    missing_id = 987654321
    await runtime.mark_failed(missing_id, "gone")  # must not raise

View file

@ -0,0 +1,79 @@
"""Audible voice previews for the brief gate's voice picker.
A user choosing voices should hear them, not guess from names. The endpoint
synthesises a short sample for a catalog voice and caches it on disk so each
voice is paid for at most once per process lifetime. Unknown voices and voices
of an inactive provider are 404; no configured TTS is 503.
"""
from __future__ import annotations
import pytest
from app.config import config as app_config
from .conftest import FakeTextToSpeech
pytestmark = pytest.mark.integration
BASE = "/api/v1/podcasts"
@pytest.fixture
def preview_tts(monkeypatch, tmp_path) -> FakeTextToSpeech:
    """Point preview synthesis at a fake provider and a per-test cache directory.

    Returns the fake so a test can inspect the requests it received.
    """
    fake_provider = FakeTextToSpeech()
    patches = (
        ("app.podcasts.api.routes.get_text_to_speech", lambda: fake_provider),
        ("app.podcasts.voices.preview.PREVIEW_CACHE_ROOT", tmp_path),
    )
    for target, replacement in patches:
        monkeypatch.setattr(target, replacement)
    return fake_provider
async def test_preview_returns_playable_audio_for_a_catalog_voice(
    client, preview_tts
):
    """Previewing ``openai:alloy`` returns 200 with the fake provider's MPEG bytes."""
    response = await client.get(f"{BASE}/voices/openai:alloy/preview")

    assert response.status_code == 200
    assert response.headers["content-type"] == "audio/mpeg"
    assert response.content == b"segment-audio"
async def test_preview_is_synthesised_once_then_served_from_cache(
    client, preview_tts
):
    """Two preview requests for one voice reach the provider only once."""
    preview_url = f"{BASE}/voices/openai:alloy/preview"

    initial = await client.get(preview_url)
    repeat = await client.get(preview_url)

    assert initial.status_code == repeat.status_code
    assert repeat.status_code == 200
    assert repeat.content == initial.content
    assert len(preview_tts.requests) == 1
async def test_preview_unknown_voice_is_404(client, preview_tts):
    """Previewing ``openai:nope`` is a 404 and the provider receives no request."""
    response = await client.get(f"{BASE}/voices/openai:nope/preview")

    assert response.status_code == 404
    assert preview_tts.requests == []
async def test_preview_voice_of_inactive_provider_is_404(client, preview_tts):
    """Previewing a Kokoro voice is a 404 and the provider receives no request."""
    # The active provider is OpenAI (pinned in conftest); a Kokoro voice exists
    # in the catalog but cannot be heard through the configured provider.
    response = await client.get(f"{BASE}/voices/kokoro:af_heart/preview")

    assert response.status_code == 404
    assert preview_tts.requests == []
async def test_preview_without_tts_provider_is_503(
    client, preview_tts, monkeypatch
):
    """With ``TTS_SERVICE`` set to ``None``, a preview request is answered 503."""
    monkeypatch.setattr(app_config, "TTS_SERVICE", None)

    response = await client.get(f"{BASE}/voices/openai:alloy/preview")

    assert response.status_code == 503

View file

@ -0,0 +1,31 @@
"""GET /podcasts/voices: the active provider's catalog, or 503 if unconfigured.
The brief UI needs the voices the configured TTS provider offers; with no
provider configured there is nothing to choose from, which is a 503 rather than
an empty list.
"""
import pytest
from app.config import config as app_config
pytestmark = pytest.mark.integration
BASE = "/api/v1/podcasts"
async def test_voices_returns_the_active_providers_catalog(client):
    """The voices endpoint returns a non-empty list whose entries carry the core fields."""
    response = await client.get(f"{BASE}/voices")

    assert response.status_code == 200
    catalog = response.json()
    assert catalog  # openai/tts-1 offers voices
    required_fields = {"voice_id", "display_name", "language", "gender"}
    assert required_fields <= catalog[0].keys()
async def test_voices_503_when_no_tts_configured(client, monkeypatch):
    """With ``TTS_SERVICE`` set to an empty string, the voices endpoint answers 503."""
    monkeypatch.setattr(app_config, "TTS_SERVICE", "")

    response = await client.get(f"{BASE}/voices")

    assert response.status_code == 503

View file

@ -31,7 +31,8 @@ def _disable_otel(monkeypatch: pytest.MonkeyPatch):
("process_file_upload_with_document", "process"),
("process_circleback_meeting", "process"),
("generate_video_presentation", "generate"),
("generate_content_podcast", "generate"),
("podcast.draft_transcript", "podcast.draft"),
("podcast.render_audio", "podcast.render"),
("cleanup_stale_indexing_notifications", "cleanup"),
("reconcile_pending_stripe_credit_purchases", "reconcile"),
("check_periodic_schedules", "check"),

View file

@ -0,0 +1,72 @@
"""Shared builders for podcast unit tests.
These tests exercise pure logic through public interfaces with no test doubles:
the brief and transcript factories build valid aggregates so each test states
only the fields it cares about. Stateful, persistence-backed paths (the lifecycle
service, the Celery task bodies) are covered by the integration suite against a
real database.
"""
from __future__ import annotations
import pytest
from app.podcasts.schemas import (
DurationTarget,
PodcastSpec,
PodcastStyle,
SpeakerRole,
SpeakerSpec,
Transcript,
TranscriptTurn,
)
@pytest.fixture
def make_spec():
    """Return a factory that builds a :class:`PodcastSpec` from keyword overrides.

    When ``speakers`` is omitted the factory supplies a two-person cast: a host
    in slot 0 and a guest in slot 1, both with Kokoro voice ids.
    """

    def _build(
        *,
        language: str = "en",
        style: PodcastStyle = PodcastStyle.CONVERSATIONAL,
        speakers: list[SpeakerSpec] | None = None,
        min_minutes: int = 10,
        max_minutes: int = 20,
        focus: str | None = None,
    ) -> PodcastSpec:
        if speakers is None:
            host = SpeakerSpec(
                slot=0, name="Host", role=SpeakerRole.HOST, voice_id="kokoro:am_adam"
            )
            guest = SpeakerSpec(
                slot=1,
                name="Guest",
                role=SpeakerRole.GUEST,
                voice_id="kokoro:af_bella",
            )
            speakers = [host, guest]
        target = DurationTarget(min_minutes=min_minutes, max_minutes=max_minutes)
        return PodcastSpec(
            language=language,
            style=style,
            speakers=speakers,
            duration=target,
            focus=focus,
        )

    return _build
@pytest.fixture
def make_transcript():
    """Return a factory that builds a :class:`Transcript` from ``(slot, text)`` pairs.

    Called without arguments, the factory produces a two-turn exchange between
    slots 0 and 1.
    """

    def _build(turns: list[tuple[int, str]] | None = None) -> Transcript:
        pairs = turns
        if pairs is None:
            pairs = [(0, "Welcome to the show."), (1, "Glad to be here.")]
        built_turns = []
        for slot, text in pairs:
            built_turns.append(TranscriptTurn(speaker=slot, text=text))
        return Transcript(turns=built_turns)

    return _build

View file

@ -0,0 +1,94 @@
"""The API read model the frontend renders from.
``PodcastDetail.of`` maps a stored podcast row to the detail view and action
responses: it exposes the deserialized brief and transcript and a simple
``has_audio`` flag the client can't derive from the published Zero columns. Each
test builds a row in one lifecycle shape and asserts the mapping reflects it.
"""
from __future__ import annotations
from datetime import UTC, datetime
import pytest
from app.podcasts.api.schemas import PodcastDetail
from app.podcasts.persistence import Podcast, PodcastStatus
pytestmark = pytest.mark.unit
def _podcast(*, status: PodcastStatus = PodcastStatus.PENDING, **columns) -> Podcast:
    """Build an unsaved row with ``id`` and ``created_at`` filled in by hand.

    Extra ``columns`` are passed straight to the :class:`Podcast` constructor.
    """
    row = Podcast(
        title="Episode",
        search_space_id=3,
        status=status,
        spec_version=1,
        **columns,
    )
    # Set the two values directly; the row is never saved in these tests.
    row.id = 1
    row.created_at = datetime.now(UTC)
    return row
def test_a_fresh_podcast_exposes_no_brief_transcript_or_audio():
    """A PENDING row with no extra columns maps to an empty detail view."""
    fresh_row = _podcast()

    view = PodcastDetail.of(fresh_row)

    assert view.status == PodcastStatus.PENDING
    assert view.spec is None
    assert view.transcript is None
    assert view.has_audio is False
def test_an_awaiting_brief_podcast_exposes_the_deserialized_brief(make_spec):
    """A JSON-dumped spec stored on the row is exposed as an object on the detail."""
    stored_spec = make_spec(language="fr").model_dump(mode="json")
    row = _podcast(status=PodcastStatus.AWAITING_BRIEF, spec=stored_spec)

    view = PodcastDetail.of(row)

    assert view.spec is not None
    assert view.spec.language == "fr"
def test_a_legacy_episode_still_exposes_its_transcript_and_audio():
    """A row with ``speaker_id``/``dialog`` turns and a file path still maps cleanly."""
    # Pre-rework rows stored [{speaker_id, dialog}] and a local file path;
    # they must keep flowing through the new read model, not fail validation.
    legacy_turns = [
        {"speaker_id": 0, "dialog": "Welcome back."},
        {"speaker_id": 1, "dialog": "Glad to be here."},
    ]
    row = _podcast(
        status=PodcastStatus.READY,
        podcast_transcript=legacy_turns,
        file_location="/var/old/podcast.mp3",
    )

    view = PodcastDetail.of(row)

    assert view.has_audio is True
    assert view.transcript is not None
    mapped_turns = [(turn.speaker, turn.text) for turn in view.transcript.turns]
    assert mapped_turns == [
        (0, "Welcome back."),
        (1, "Glad to be here."),
    ]
def test_a_ready_podcast_reports_available_audio(make_spec, make_transcript):
    """A READY row with a storage key maps to a detail with audio and no error."""
    row = _podcast(
        status=PodcastStatus.READY,
        spec=make_spec().model_dump(mode="json"),
        podcast_transcript=make_transcript().model_dump(mode="json"),
        storage_backend="local",
        storage_key="k",
        duration_seconds=120,
    )

    view = PodcastDetail.of(row)

    assert view.status == PodcastStatus.READY
    assert view.has_audio is True
    assert view.duration_seconds == 120
    assert view.transcript is not None
    assert view.error is None

View file

@ -0,0 +1,90 @@
"""The renderer refuses an inconsistent spec/transcript before spending work.
Full synthesis-and-merge needs FFmpeg and a real provider, so it belongs to an
integration test. What is pure and worth securing here is the renderer's
contract that it validates the transcript against the brief up front: a turn
naming an unknown speaker, or a speaker naming an unknown voice, fails loudly
rather than producing silent or wrong audio. The TTS provider is an external
port, faked here and never expected to be called on these paths.
"""
from __future__ import annotations
from pathlib import Path
import pytest
from app.podcasts.rendering import PodcastRenderer, RenderError
from app.podcasts.schemas import (
DurationTarget,
PodcastSpec,
SpeakerRole,
SpeakerSpec,
Transcript,
TranscriptTurn,
)
from app.podcasts.tts import SynthesizedAudio
from app.podcasts.voices import CatalogVoice, TtsProvider, VoiceCatalog, VoiceGender
pytestmark = pytest.mark.unit
class _UnusedTTS:
    """A TTS port double that fails the test if it is ever asked to speak.

    These behaviors must short-circuit before synthesis, so any call here is a
    regression.
    """

    @property
    def container(self) -> str:
        """Return the container format this double reports (``"mp3"``)."""
        return "mp3"

    async def synthesize(self, _request) -> SynthesizedAudio:  # pragma: no cover - must not run
        """Always raise ``AssertionError``; no audio is ever produced.

        The return annotation only mirrors the shape of the real port. The
        former ``return`` after the ``raise`` could never execute and is gone.
        """
        raise AssertionError("synthesis should not be attempted")
def _catalog_with(voice_id: str) -> VoiceCatalog:
    """Build a catalog containing exactly one Kokoro voice with id ``voice_id``."""
    only_voice = CatalogVoice(
        voice_id=voice_id,
        provider=TtsProvider.KOKORO,
        language="en-US",
        display_name=voice_id,
        gender=VoiceGender.MALE,
        native_ref="am_adam",
    )
    return VoiceCatalog([only_voice])
def _spec(voice_id: str) -> PodcastSpec:
    """Build an English brief with a single host in slot 0 speaking with ``voice_id``."""
    host = SpeakerSpec(slot=0, name="Host", role=SpeakerRole.HOST, voice_id=voice_id)
    length = DurationTarget(min_minutes=5, max_minutes=10)
    return PodcastSpec(language="en", speakers=[host], duration=length)
async def test_render_rejects_a_turn_for_an_unknown_speaker(tmp_path):
    """A turn for slot 5, when the brief defines only slot 0, raises ``RenderError``."""
    known_voice = "kokoro:am_adam"
    renderer = PodcastRenderer(tts=_UnusedTTS(), catalog=_catalog_with(known_voice))
    orphan_turn = TranscriptTurn(speaker=5, text="Who am I?")
    transcript = Transcript(turns=[orphan_turn])

    with pytest.raises(RenderError):
        await renderer.render(
            spec=_spec(known_voice), transcript=transcript, workdir=Path(tmp_path)
        )
async def test_render_rejects_a_speaker_whose_voice_is_not_in_the_catalog(tmp_path):
    """A brief naming ``kokoro:ghost``, absent from the catalog, raises ``RenderError``."""
    renderer = PodcastRenderer(
        tts=_UnusedTTS(), catalog=_catalog_with("kokoro:am_adam")
    )
    greeting = TranscriptTurn(speaker=0, text="Hello.")
    transcript = Transcript(turns=[greeting])

    with pytest.raises(RenderError):
        await renderer.render(
            spec=_spec("kokoro:ghost"), transcript=transcript, workdir=Path(tmp_path)
        )

Some files were not shown because too many files have changed in this diff Show more