refactor(podcasts): drop transcript gate, add regenerate-from-ready and voice previews

This commit is contained in:
CREDO23 2026-06-11 10:42:13 +02:00
parent ccd8209d12
commit 11a6b178a0
22 changed files with 591 additions and 347 deletions

View file

@ -1,7 +1,7 @@
"""HTTP surface for the podcast lifecycle.
Status is observed by the frontend through Zero, so these routes are about
actions (create, edit the brief, approve/regenerate, cancel) and audio delivery.
actions (create, edit/approve the brief, regenerate, cancel) and audio delivery.
Each mutating route performs the guarded transition via the service, commits,
then enqueues the matching Celery task; lifecycle errors map to 409/422.
"""
@ -11,7 +11,7 @@ from __future__ import annotations
import os
from pathlib import Path
from fastapi import APIRouter, Depends, HTTPException
from fastapi import APIRouter, Depends, HTTPException, Response
from fastapi.responses import StreamingResponse
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
@ -33,11 +33,13 @@ from app.podcasts.service import (
SpecConflict,
)
from app.podcasts.storage import open_audio_stream, purge_audio
from app.podcasts.tasks import (
draft_transcript_task,
render_audio_task,
from app.podcasts.tasks import draft_transcript_task
from app.podcasts.tts import get_text_to_speech
from app.podcasts.voices import (
get_voice_catalog,
provider_from_service,
render_voice_preview,
)
from app.podcasts.voices import get_voice_catalog, provider_from_service
from app.users import current_active_user
from app.utils.rbac import check_permission
@ -110,6 +112,29 @@ async def list_voices(language: str | None = None):
]
@router.get("/podcasts/voices/{voice_id}/preview")
async def preview_voice(
    voice_id: str,
    user: User = Depends(current_active_user),
):
    """Serve a short spoken sample of one catalog voice.

    Responds 503 when no TTS service is configured, and 404 when the voice id
    is not in the catalog or the voice belongs to a provider other than the
    configured one. Otherwise the sample bytes are returned with the media
    type reported by ``render_voice_preview``.
    """
    tts_service = app_config.TTS_SERVICE
    if not tts_service:
        raise HTTPException(status_code=503, detail="No TTS provider configured")

    active_provider = provider_from_service(tts_service)

    try:
        voice = get_voice_catalog().get(voice_id)
    except KeyError:
        # Suppress the KeyError context: the 404 is the whole story for clients.
        raise HTTPException(status_code=404, detail="Unknown voice") from None

    if voice.provider is not active_provider:
        raise HTTPException(
            status_code=404, detail="Voice not offered by the active TTS provider"
        )

    audio_bytes, media_type = await render_voice_preview(voice, get_text_to_speech())
    return Response(content=audio_bytes, media_type=media_type)
@router.post("/podcasts", response_model=PodcastDetail, status_code=201)
async def create_podcast(
body: CreatePodcastRequest,
@ -180,21 +205,6 @@ async def approve_brief(
return PodcastDetail.of(podcast)
@router.post("/podcasts/{podcast_id}/transcript/approve", response_model=PodcastDetail)
async def approve_transcript(
podcast_id: int,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""Approve the transcript and start rendering audio.

Loads the podcast under ``Permission.PODCASTS_UPDATE``, applies the service's
``approve`` transition inside ``_lifecycle_errors()``, commits, then enqueues
``render_audio_task`` and returns the podcast as a ``PodcastDetail``.
"""
podcast = await _load(session, user, podcast_id, Permission.PODCASTS_UPDATE)
async with _lifecycle_errors():
await PodcastService(session).approve(podcast)
await session.commit()
# Enqueued after the commit; presumably so the worker sees the new status.
render_audio_task.delay(podcast.id)
return PodcastDetail.of(podcast)
@router.post(
"/podcasts/{podcast_id}/transcript/regenerate", response_model=PodcastDetail
)
@ -203,7 +213,7 @@ async def regenerate_transcript(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""Reject the transcript and draft a fresh one."""
"""Send a finished episode back to drafting for a fresh take."""
podcast = await _load(session, user, podcast_id, Permission.PODCASTS_UPDATE)
async with _lifecycle_errors():
await PodcastService(session).regenerate(podcast)

View file

@ -2,9 +2,12 @@
The status drives a guarded state machine. A podcast is proposed (``PENDING``),
gets a reviewable brief (``AWAITING_BRIEF``), is drafted into a transcript
(``DRAFTING`` -> ``AWAITING_REVIEW``), then rendered to audio (``RENDERING`` ->
``READY``). ``FAILED`` and ``CANCELLED`` are terminal. The Python enum is kept
in lockstep with the ``podcast_status`` Postgres type via its paired migration.
(``DRAFTING``), then rendered to audio (``RENDERING`` -> ``READY``). ``FAILED``
and ``CANCELLED`` are terminal; a ``READY`` episode can be sent back to
drafting for regeneration. ``AWAITING_REVIEW`` is retained for legacy rows but
never entered anymore — the brief is the only approval gate. The Python enum is
kept in lockstep with the ``podcast_status`` Postgres type via its paired
migration.
"""
from __future__ import annotations
@ -33,5 +36,5 @@ class PodcastStatus(StrEnum):
return self in _GATES
_TERMINAL = frozenset({PodcastStatus.READY, PodcastStatus.FAILED, PodcastStatus.CANCELLED})
_TERMINAL = frozenset({PodcastStatus.FAILED, PodcastStatus.CANCELLED})
_GATES = frozenset({PodcastStatus.AWAITING_BRIEF, PodcastStatus.AWAITING_REVIEW})

View file

@ -25,20 +25,18 @@ _ALLOWED: dict[PodcastStatus, frozenset[PodcastStatus]] = {
{PodcastStatus.DRAFTING, PodcastStatus.FAILED, PodcastStatus.CANCELLED}
),
PodcastStatus.DRAFTING: frozenset(
{PodcastStatus.AWAITING_REVIEW, PodcastStatus.FAILED, PodcastStatus.CANCELLED}
{PodcastStatus.RENDERING, PodcastStatus.FAILED, PodcastStatus.CANCELLED}
),
# Never entered anymore (the transcript gate was dropped); kept with exits
# so legacy rows aren't stranded.
PodcastStatus.AWAITING_REVIEW: frozenset(
{
PodcastStatus.RENDERING, # approve
PodcastStatus.DRAFTING, # regenerate
PodcastStatus.FAILED,
PodcastStatus.CANCELLED,
}
{PodcastStatus.DRAFTING, PodcastStatus.FAILED, PodcastStatus.CANCELLED}
),
PodcastStatus.RENDERING: frozenset(
{PodcastStatus.READY, PodcastStatus.FAILED, PodcastStatus.CANCELLED}
),
PodcastStatus.READY: frozenset(),
# Not terminal: regeneration is decided by listening to the finished episode.
PodcastStatus.READY: frozenset({PodcastStatus.DRAFTING}),
PodcastStatus.FAILED: frozenset(),
PodcastStatus.CANCELLED: frozenset(),
}
@ -121,22 +119,22 @@ class PodcastService:
async def attach_transcript(
self, podcast: Podcast, transcript: Transcript
) -> Podcast:
"""Record the drafted transcript and open the go/no-go gate."""
self._transition(podcast, PodcastStatus.AWAITING_REVIEW)
"""Record the drafted transcript and move straight to rendering."""
self._transition(podcast, PodcastStatus.RENDERING)
podcast.podcast_transcript = transcript.model_dump(mode="json")
await self._session.flush()
return podcast
async def approve(self, podcast: Podcast) -> Podcast:
"""Accept the transcript and start rendering.

Raises ``PreconditionFailed`` when the podcast has no stored transcript.
Otherwise moves the row to ``RENDERING`` through the guarded transition,
flushes the session, and returns the same podcast.
"""
if not podcast.podcast_transcript:
raise PreconditionFailed("cannot render without a transcript")
self._transition(podcast, PodcastStatus.RENDERING)
await self._session.flush()
return podcast
# Guards regenerate beyond the transition table: from AWAITING_BRIEF the
# DRAFTING target is also legal, but there it means brief approval.
_REGENERABLE = frozenset({PodcastStatus.READY, PodcastStatus.AWAITING_REVIEW})
async def regenerate(self, podcast: Podcast) -> Podcast:
"""Reject the transcript and draft a new one."""
"""Send the episode back to drafting for a fresh transcript and render."""
if _status(podcast) not in self._REGENERABLE:
raise InvalidTransition(
f"nothing to regenerate from {_status(podcast).value}"
)
self._transition(podcast, PodcastStatus.DRAFTING)
await self._session.flush()
return podcast

View file

@ -44,5 +44,10 @@ def open_audio_stream(podcast: Podcast) -> AsyncIterator[bytes]:
async def purge_audio(podcast: Podcast) -> None:
"""Delete a podcast's stored audio if present; a missing object is fine."""
if podcast.storage_key:
await get_storage_backend().delete(podcast.storage_key)
await purge_audio_object(podcast.storage_key)
async def purge_audio_object(key: str | None) -> None:
"""Delete a stored audio object by key, e.g. the one a re-render replaced."""
if key:
await get_storage_backend().delete(key)

View file

@ -1,8 +1,9 @@
"""Transcript-drafting task: DRAFTING -> AWAITING_REVIEW.
"""Transcript-drafting task: DRAFTING -> RENDERING.
The expensive, LLM-heavy step, so it runs under ``billable_call``. The API has
already moved the row to DRAFTING and stored the approved brief; this task
drafts the long-form transcript and opens the go/no-go gate.
drafts the long-form transcript and chains straight into the render — the brief
gate is the only approval in the lifecycle.
"""
from __future__ import annotations
@ -23,6 +24,7 @@ from app.services.billable_calls import (
)
from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
from .render import render_audio_task
from .runtime import billable_session, mark_failed
logger = logging.getLogger(__name__)
@ -90,4 +92,8 @@ async def _draft_transcript(podcast_id: int, search_space_id: int) -> dict:
await service.attach_transcript(podcast, result["transcript"])
await session.commit()
return {"status": "awaiting_review", "podcast_id": podcast_id}
# Enqueue only after the transaction is committed, so the render worker can
# never pick up a row whose transcript isn't visible yet.
render_audio_task.delay(podcast_id)
return {"status": "rendering", "podcast_id": podcast_id}

View file

@ -15,7 +15,7 @@ from app.celery_app import celery_app
from app.podcasts.persistence import PodcastRepository
from app.podcasts.rendering import PodcastRenderer
from app.podcasts.service import PodcastService, read_spec, read_transcript
from app.podcasts.storage import store_audio
from app.podcasts.storage import purge_audio_object, store_audio
from app.podcasts.tts import get_text_to_speech
from app.podcasts.voices import get_voice_catalog
from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
@ -58,6 +58,8 @@ async def _render_audio(podcast_id: int) -> dict:
spec=spec, transcript=transcript, workdir=workdir
)
superseded_key = podcast.storage_key
backend_name, key = await store_audio(
search_space_id=podcast.search_space_id,
podcast_id=podcast_id,
@ -67,4 +69,8 @@ async def _render_audio(podcast_id: int) -> dict:
podcast, storage_backend=backend_name, storage_key=key
)
await session.commit()
return {"status": "ready", "podcast_id": podcast_id}
# Purge only after the new audio is committed, so a failed re-render never
# destroys the episode the user can still play.
await purge_audio_object(superseded_key)
return {"status": "ready", "podcast_id": podcast_id}

View file

@ -7,6 +7,7 @@ configured provider via :func:`provider_from_service`.
from __future__ import annotations
from .catalog import VoiceCatalog, get_voice_catalog
from .preview import render_voice_preview
from .provider import TtsProvider, provider_from_service
from .voice import ANY_LANGUAGE, CatalogVoice, VoiceGender
@ -18,4 +19,5 @@ __all__ = [
"VoiceGender",
"get_voice_catalog",
"provider_from_service",
"render_voice_preview",
]

View file

@ -0,0 +1,67 @@
"""Audible previews so users pick voices by sound, not by name.
A preview is a short sample sentence synthesised in the voice's own language.
Samples are served through the same content-addressed cache the renderer uses,
so each voice costs at most one synthesis per cache lifetime — repeat listens
while comparing voices are free.
"""
from __future__ import annotations
import tempfile
from pathlib import Path
from app.podcasts.rendering.cache import SegmentCache
from app.podcasts.tts import SynthesisRequest, TextToSpeech
from .voice import ANY_LANGUAGE, CatalogVoice
# Previews are user-independent, so one rendered sample serves everyone.
PREVIEW_CACHE_ROOT = Path(tempfile.gettempdir()) / "surfsense_podcasts" / "previews"
_FALLBACK_LANGUAGE = "en"
# A voice previews best speaking its own language.
_SAMPLE_TEXTS = {
"en": "Hi there! This is how I sound when narrating your podcast.",
"es": "¡Hola! Así sueno cuando narro tu pódcast.",
"fr": "Bonjour ! Voici ma voix quand je raconte votre podcast.",
"hi": "नमस्ते! आपका पॉडकास्ट सुनाते समय मेरी आवाज़ ऐसी होती है।",
"it": "Ciao! Questa è la mia voce quando racconto il tuo podcast.",
"ja": "こんにちは。ポッドキャストをお届けするときの私の声です。",
"pt": "Olá! É assim que eu soo ao narrar o seu podcast.",
"zh": "你好!这就是我为你播报播客时的声音。",
}
_CONTENT_TYPES = {"mp3": "audio/mpeg", "wav": "audio/wav"}
async def render_voice_preview(
    voice: CatalogVoice, tts: TextToSpeech
) -> tuple[bytes, str]:
    """Produce ``(audio_bytes, content_type)`` for a sample spoken by ``voice``.

    The sample is looked up in the preview cache first; only on a miss is the
    provider asked to synthesise it, after which the result is stored under
    the same key.
    """
    # A language-agnostic voice has no language of its own, so it previews in
    # the fallback language.
    if voice.language == ANY_LANGUAGE:
        language = _FALLBACK_LANGUAGE
    else:
        language = voice.language

    request = SynthesisRequest(
        text=_sample_text(language), voice=voice.native_ref, language=language
    )

    cache = SegmentCache(PREVIEW_CACHE_ROOT)
    key = cache.key(request)
    hit = cache.get(key, tts.container)
    if hit is None:
        audio = await tts.synthesize(request)
        cache.put(key, audio.container, audio.data)
        return audio.data, _content_type(audio.container)

    return hit.read_bytes(), _content_type(tts.container)
def _sample_text(language: str) -> str:
    """Pick the sample sentence for ``language``, falling back to English.

    Only the primary subtag is used for the lookup, so regional variants share
    their base language's sentence. Both hyphenated (``pt-BR``) and
    underscore-separated (``pt_BR``) tags are accepted; an unknown or empty
    language yields the fallback sentence.
    """
    # Normalise "_" to "-" first so POSIX-style locale names resolve to their
    # base language instead of silently falling back to English.
    primary = language.replace("_", "-").split("-", 1)[0].strip().lower()
    return _SAMPLE_TEXTS.get(primary, _SAMPLE_TEXTS[_FALLBACK_LANGUAGE])
def _content_type(container: str) -> str:
    """Map an audio container name to its MIME type.

    Containers missing from ``_CONTENT_TYPES`` are served as a generic binary
    stream.
    """
    try:
        return _CONTENT_TYPES[container]
    except KeyError:
        return "application/octet-stream"

View file

@ -166,13 +166,20 @@ def bind_task_session(db_session: AsyncSession, monkeypatch) -> AsyncSession:
class FakeTextToSpeech(TextToSpeech):
"""In-memory TTS provider: every segment yields fixed bytes (the boundary)."""
"""In-memory TTS provider: every segment yields fixed bytes (the boundary).
Records each request so tests can assert how often synthesis was paid for.
"""
def __init__(self) -> None:
self.requests: list[SynthesisRequest] = []
@property
def container(self) -> str:
return "mp3"
async def synthesize(self, request: SynthesisRequest) -> SynthesizedAudio:
self.requests.append(request)
return SynthesizedAudio(data=b"segment-audio", container="mp3")
@ -233,7 +240,6 @@ def make_podcast(db_session: AsyncSession):
_LADDER = [
PodcastStatus.AWAITING_BRIEF,
PodcastStatus.DRAFTING,
PodcastStatus.AWAITING_REVIEW,
PodcastStatus.RENDERING,
PodcastStatus.READY,
]
@ -259,10 +265,8 @@ def make_podcast(db_session: AsyncSession):
await service.attach_brief(podcast, build_spec())
elif target is PodcastStatus.DRAFTING:
await service.begin_drafting(podcast)
elif target is PodcastStatus.AWAITING_REVIEW:
await service.attach_transcript(podcast, build_transcript())
elif target is PodcastStatus.RENDERING:
await service.approve(podcast)
await service.attach_transcript(podcast, build_transcript())
elif target is PodcastStatus.READY:
await service.attach_audio(
podcast,

View file

@ -1,11 +1,12 @@
"""The transcript-drafting task against a real database.
Drafting is the expensive LLM step, so it runs under ``billable_call``. The
behavior that protects users' money: when billing succeeds, a drafted transcript
opens the review gate (DRAFTING -> AWAITING_REVIEW); when billing denies or
settlement fails, the podcast ends FAILED with no transcript left behind. The DB,
service, and transcript persistence run for real; only the true externals are
faked — billing (the metering boundary) and the generation graph (the LLM).
behavior that protects users' money: when billing succeeds, the drafted
transcript is stored and rendering starts immediately (DRAFTING -> RENDERING,
render task enqueued — the brief gate is the only approval); when billing denies
or settlement fails, the podcast ends FAILED with no transcript left behind. The
DB, service, and transcript persistence run for real; only the true externals
are faked — billing (the metering boundary) and the generation graph (the LLM).
"""
from __future__ import annotations
@ -43,8 +44,8 @@ def _wire_billing(monkeypatch, *, billable_call, transcript=None) -> None:
monkeypatch.setattr(draft, "transcript_graph", SimpleNamespace(ainvoke=_ainvoke))
async def test_successful_billing_opens_review_gate_with_transcript(
monkeypatch, db_search_space, make_podcast, bind_task_session
async def test_successful_draft_stores_transcript_and_starts_rendering(
monkeypatch, db_search_space, make_podcast, bind_task_session, captured_tasks
):
podcast = await make_podcast(
search_space_id=db_search_space.id, status=PodcastStatus.DRAFTING
@ -58,9 +59,10 @@ async def test_successful_billing_opens_review_gate_with_transcript(
result = await draft._draft_transcript(podcast.id, db_search_space.id)
assert result["status"] == "awaiting_review"
assert podcast.status == PodcastStatus.AWAITING_REVIEW
assert result["status"] == "rendering"
assert podcast.status == PodcastStatus.RENDERING
assert read_transcript(podcast) is not None
assert captured_tasks.render == [((podcast.id,), {})]
async def test_quota_denial_fails_the_podcast_without_a_transcript(

View file

@ -0,0 +1,60 @@
"""Regeneration: the listen-then-redo loop after the brief gate.
The brief is the only approval; drafting flows straight into rendering. A user
who dislikes the finished audio sends the episode back with regenerate. These
pin the READY -> DRAFTING round trip (with the draft task enqueued) and the 409
for regenerating from states that have nothing to redo.
"""
from __future__ import annotations
import pytest
from app.podcasts.persistence import PodcastStatus
pytestmark = pytest.mark.integration
BASE = "/api/v1/podcasts"
async def test_regenerate_from_ready_returns_to_drafting_and_enqueues_draft(
    client, db_search_space, make_podcast, captured_tasks
):
    """A READY episode goes back to drafting and only the draft task is queued."""
    space_id = db_search_space.id
    episode = await make_podcast(search_space_id=space_id, status=PodcastStatus.READY)

    response = await client.post(f"{BASE}/{episode.id}/transcript/regenerate")

    assert response.status_code == 200
    assert response.json()["status"] == "drafting"
    assert captured_tasks.draft == [((episode.id, space_id), {})]
    assert captured_tasks.render == []
async def test_regenerate_from_brief_gate_is_rejected(
    client, db_search_space, make_podcast, captured_tasks
):
    """At the brief gate nothing has been drafted, so regenerate is a 409."""
    episode = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.AWAITING_BRIEF
    )

    response = await client.post(f"{BASE}/{episode.id}/transcript/regenerate")

    assert response.status_code == 409
    assert captured_tasks.draft == []
async def test_regenerate_from_cancelled_is_rejected(
    client, db_search_space, make_podcast, captured_tasks
):
    """Regenerating a cancelled podcast is refused with 409 and enqueues nothing."""
    podcast = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.AWAITING_BRIEF
    )
    cancelled = await client.post(f"{BASE}/{podcast.id}/cancel")
    # Without this check a failed cancel would leave the row at the brief gate,
    # which also answers 409 — the test would pass without ever exercising the
    # cancelled state.
    assert 200 <= cancelled.status_code < 300, cancelled.text

    resp = await client.post(f"{BASE}/{podcast.id}/transcript/regenerate")

    assert resp.status_code == 409
    assert captured_tasks.draft == []

View file

@ -11,8 +11,11 @@ from __future__ import annotations
import pytest
from app.podcasts.persistence import PodcastStatus
from app.podcasts.service import PodcastService
from app.podcasts.tasks import render
from .conftest import build_transcript
pytestmark = pytest.mark.integration
@ -30,3 +33,33 @@ async def test_render_marks_ready_and_stores_audio(
assert podcast.storage_backend == "memory"
assert podcast.storage_key
assert fake_storage.objects[podcast.storage_key] == b"merged-audio"
async def test_rerender_replaces_audio_and_purges_the_old_object(
    db_session,
    db_search_space,
    make_podcast,
    bind_task_session,
    fake_tts,
    fake_merge,
    fake_storage,
):
    """Re-rendering leaves exactly one stored object behind.

    The superseded audio must be deleted from the object store once the new
    render is in place, otherwise every regeneration would leak a file.
    """
    episode = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.READY
    )
    superseded_key = episode.storage_key
    fake_storage.objects[superseded_key] = b"old-audio"

    # Walk the episode back through drafting so it is ready to render again.
    lifecycle = PodcastService(db_session)
    await lifecycle.regenerate(episode)
    await lifecycle.attach_transcript(episode, build_transcript())

    outcome = await render._render_audio(episode.id)

    assert outcome["status"] == "ready"
    assert episode.status == PodcastStatus.READY
    assert episode.storage_key != superseded_key
    assert fake_storage.objects[episode.storage_key] == b"merged-audio"
    assert superseded_key in fake_storage.deleted

View file

@ -33,7 +33,7 @@ async def test_stream_serves_stored_audio(
async def test_stream_404_when_no_audio(client, db_search_space, make_podcast):
podcast = await make_podcast(
search_space_id=db_search_space.id, status=PodcastStatus.AWAITING_REVIEW
search_space_id=db_search_space.id, status=PodcastStatus.DRAFTING
)
resp = await client.get(f"{BASE}/{podcast.id}/stream")

View file

@ -1,81 +0,0 @@
"""The transcript go/no-go gate: approve to render, or regenerate to redraft.
From ``awaiting_review`` the user either approves (start rendering) or regenerates
(redraft). These pin the resulting state, the Celery task each enqueues, and the
HTTP codes for acting from the wrong state (409) or without a transcript (422).
"""
from __future__ import annotations
import pytest
from app.podcasts.persistence import Podcast, PodcastStatus
pytestmark = pytest.mark.integration
BASE = "/api/v1/podcasts"
async def test_approve_transcript_starts_rendering_and_enqueues_render(
    client, db_search_space, make_podcast, captured_tasks
):
    """Approving at the transcript gate moves to rendering and queues the render."""
    episode = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.AWAITING_REVIEW
    )

    response = await client.post(f"{BASE}/{episode.id}/transcript/approve")

    assert response.status_code == 200
    assert response.json()["status"] == "rendering"
    assert captured_tasks.render == [((episode.id,), {})]
    assert captured_tasks.draft == []
async def test_regenerate_returns_to_drafting_and_enqueues_draft(
    client, db_search_space, make_podcast, captured_tasks
):
    """Regenerating at the transcript gate redrafts and queues only the draft task."""
    space_id = db_search_space.id
    episode = await make_podcast(
        search_space_id=space_id, status=PodcastStatus.AWAITING_REVIEW
    )

    response = await client.post(f"{BASE}/{episode.id}/transcript/regenerate")

    assert response.status_code == 200
    assert response.json()["status"] == "drafting"
    assert captured_tasks.draft == [((episode.id, space_id), {})]
    assert captured_tasks.render == []
async def test_approve_transcript_from_terminal_state_is_rejected(
    client, db_search_space, make_podcast, captured_tasks
):
    """Approving a READY podcast is a 409.

    The row still carries its transcript, so the transcript precondition
    passes and the disallowed transition into rendering is what surfaces.
    """
    episode = await make_podcast(
        search_space_id=db_search_space.id, status=PodcastStatus.READY
    )

    response = await client.post(f"{BASE}/{episode.id}/transcript/approve")

    assert response.status_code == 409
    assert captured_tasks.render == []
async def test_approve_without_transcript_is_unprocessable(
    client, db_session, db_search_space, captured_tasks
):
    """An awaiting_review row with no transcript maps to 422 on approve.

    The row is built by hand because the normal lifecycle never produces this
    state; it exists only to exercise the precondition-to-422 mapping.
    """
    orphan = Podcast(
        title="No transcript",
        search_space_id=db_search_space.id,
        status=PodcastStatus.AWAITING_REVIEW,
        spec_version=1,
    )
    db_session.add(orphan)
    await db_session.flush()

    response = await client.post(f"{BASE}/{orphan.id}/transcript/approve")

    assert response.status_code == 422
    assert captured_tasks.render == []

View file

@ -0,0 +1,79 @@
"""Audible voice previews for the brief gate's voice picker.
A user choosing voices should hear them, not guess from names. The endpoint
synthesises a short sample for a catalog voice and caches it on disk so each
voice is paid for at most once per cache lifetime. Unknown voices and voices
of an inactive provider are 404; no configured TTS is 503.
"""
from __future__ import annotations
import pytest
from app.config import config as app_config
from .conftest import FakeTextToSpeech
pytestmark = pytest.mark.integration
BASE = "/api/v1/podcasts"
@pytest.fixture
def preview_tts(monkeypatch, tmp_path) -> FakeTextToSpeech:
    """Swap in a fake TTS provider and point the preview cache at ``tmp_path``.

    Returns the fake so tests can inspect the synthesis requests it recorded.
    """
    fake = FakeTextToSpeech()
    monkeypatch.setattr("app.podcasts.api.routes.get_text_to_speech", lambda: fake)
    monkeypatch.setattr("app.podcasts.voices.preview.PREVIEW_CACHE_ROOT", tmp_path)
    return fake
async def test_preview_returns_playable_audio_for_a_catalog_voice(
    client, preview_tts
):
    """A known voice of the active provider yields the synthesised bytes as MP3."""
    response = await client.get(f"{BASE}/voices/openai:alloy/preview")

    assert response.status_code == 200
    assert response.headers["content-type"] == "audio/mpeg"
    assert response.content == b"segment-audio"
async def test_preview_is_synthesised_once_then_served_from_cache(
    client, preview_tts
):
    """Two requests for the same voice cost a single synthesis call."""
    url = f"{BASE}/voices/openai:alloy/preview"

    first = await client.get(url)
    second = await client.get(url)

    assert first.status_code == second.status_code == 200
    assert second.content == first.content
    assert len(preview_tts.requests) == 1
async def test_preview_unknown_voice_is_404(client, preview_tts):
    """A voice id missing from the catalog is a 404 and triggers no synthesis."""
    response = await client.get(f"{BASE}/voices/openai:nope/preview")

    assert response.status_code == 404
    assert preview_tts.requests == []
async def test_preview_voice_of_inactive_provider_is_404(client, preview_tts):
    """A catalog voice from a provider that is not the active one is a 404.

    The active provider is OpenAI (pinned in conftest); the Kokoro voice exists
    in the catalog but cannot be heard through the configured provider.
    """
    response = await client.get(f"{BASE}/voices/kokoro:af_heart/preview")

    assert response.status_code == 404
    assert preview_tts.requests == []
async def test_preview_without_tts_provider_is_503(
    client, preview_tts, monkeypatch
):
    """With no TTS service configured the preview is a 503 and nothing is synthesised."""
    monkeypatch.setattr(app_config, "TTS_SERVICE", None)

    resp = await client.get(f"{BASE}/voices/openai:alloy/preview")

    assert resp.status_code == 503
    # Same guarantee the 404 cases pin: a rejected request must not pay for
    # synthesis.
    assert preview_tts.requests == []

View file

@ -26,6 +26,7 @@ import {
import type { LivePodcast } from "@/hooks/use-podcast-live";
import { podcastsApiService } from "@/lib/apis/podcasts-api.service";
import { AppError } from "@/lib/error";
import { VoicePreviewButton } from "./voice-preview-button";
// A "*" voice speaks whatever language the text is in (mirrors the backend
// catalog's ANY_LANGUAGE sentinel).
@ -274,23 +275,26 @@ export function BriefReview({ podcast, spec, onApproved }: BriefReviewProps) {
</SelectContent>
</Select>
</div>
<div className="flex w-44 flex-col gap-1.5">
<div className="flex w-52 flex-col gap-1.5">
<Label className="text-xs">Voice</Label>
<Select
value={speaker.voice_id}
onValueChange={(value) => updateSpeaker(speaker.slot, { voice_id: value })}
>
<SelectTrigger>
<SelectValue placeholder={voices === null ? "Loading…" : "Voice"} />
</SelectTrigger>
<SelectContent>
{voiceItems(voicesForLanguage, speaker.voice_id).map((voice) => (
<SelectItem key={voice.voice_id} value={voice.voice_id}>
{voice.display_name} ({voice.gender})
</SelectItem>
))}
</SelectContent>
</Select>
<div className="flex items-center gap-1">
<Select
value={speaker.voice_id}
onValueChange={(value) => updateSpeaker(speaker.slot, { voice_id: value })}
>
<SelectTrigger>
<SelectValue placeholder={voices === null ? "Loading…" : "Voice"} />
</SelectTrigger>
<SelectContent>
{voiceItems(voicesForLanguage, speaker.voice_id).map((voice) => (
<SelectItem key={voice.voice_id} value={voice.voice_id}>
{voice.display_name} ({voice.gender})
</SelectItem>
))}
</SelectContent>
</Select>
<VoicePreviewButton voiceId={speaker.voice_id} />
</div>
</div>
<Button
type="button"

View file

@ -1,12 +1,15 @@
"use client";
import type { ToolCallMessagePartProps } from "@assistant-ui/react";
import { Loader2, RotateCcw } from "lucide-react";
import { usePathname } from "next/navigation";
import { useState } from "react";
import { toast } from "sonner";
import { TextShimmerLoader } from "@/components/prompt-kit/loader";
import { Button } from "@/components/ui/button";
import type { PodcastSpec } from "@/contracts/types/podcast.types";
import { usePodcastLive } from "@/hooks/use-podcast-live";
import { type LivePodcast, usePodcastLive } from "@/hooks/use-podcast-live";
import { podcastsApiService } from "@/lib/apis/podcasts-api.service";
import { PodcastErrorState, PodcastPlayer } from "./player";
import { PodcastReviewSheet } from "./review-sheet";
import type { GeneratePodcastArgs, GeneratePodcastResult } from "./schema";
@ -69,6 +72,66 @@ function ReviewGateCard({
);
}
/**
 * Regenerate control for a podcast. Regenerating discards the current audio,
 * so the first click only reveals an inline confirmation; the request itself
 * is sent from the confirmation's destructive button.
 */
function RegenerateButton({ podcast }: { podcast: LivePodcast }) {
	const [confirming, setConfirming] = useState(false);
	const [isSubmitting, setIsSubmitting] = useState(false);

	const askToConfirm = () => setConfirming(true);
	const dismiss = () => setConfirming(false);

	// Failures surface as a toast; either way the confirm step collapses afterwards.
	async function submitRegenerate() {
		setIsSubmitting(true);
		try {
			await podcastsApiService.regenerate(podcast.id);
		} catch (error) {
			const message = error instanceof Error ? error.message : "Failed to regenerate the podcast";
			toast.error(message);
		} finally {
			setIsSubmitting(false);
			setConfirming(false);
		}
	}

	if (confirming) {
		return (
			<div className="flex items-center gap-2">
				<span className="text-xs text-muted-foreground">Replace this episode with a new take?</span>
				<Button type="button" variant="ghost" size="sm" onClick={dismiss} disabled={isSubmitting}>
					Keep it
				</Button>
				<Button
					type="button"
					variant="destructive"
					size="sm"
					onClick={submitRegenerate}
					disabled={isSubmitting}
				>
					{isSubmitting && <Loader2 className="size-3.5 animate-spin" />}
					Regenerate
				</Button>
			</div>
		);
	}

	return (
		<Button
			type="button"
			variant="ghost"
			size="sm"
			className="text-muted-foreground"
			onClick={askToConfirm}
		>
			<RotateCcw className="size-3.5" /> Regenerate
		</Button>
	);
}
/** Status-driven card for an authenticated viewer, fed by Zero push. */
function LivePodcastCard({
podcastId,
@ -102,30 +165,47 @@ function LivePodcastCard({
case "rendering":
return <WorkingState title={title} label="Rendering audio" />;
case "awaiting_brief":
case "awaiting_review": {
const isBriefGate = podcast.status === "awaiting_brief";
return (
<>
<ReviewGateCard
title={title}
heading={
isBriefGate ? "Brief ready for your review" : "Transcript ready for your review"
}
heading="Brief ready for your review"
summary={briefSummary(podcast.spec)}
buttonLabel={isBriefGate ? "Review brief" : "Review transcript"}
buttonLabel="Review brief"
onReview={() => setReviewOpen(true)}
/>
<PodcastReviewSheet podcast={podcast} open={reviewOpen} onOpenChange={setReviewOpen} />
</>
);
}
case "awaiting_review":
// Legacy rows parked at the removed transcript gate; the only way
// forward is a fresh draft.
return (
<div className="my-4 max-w-lg overflow-hidden rounded-2xl border bg-muted/30 select-none">
<div className="px-5 pt-5 pb-4">
<p className="text-sm font-semibold text-foreground line-clamp-2">{title}</p>
<p className="text-xs text-muted-foreground mt-0.5">
This podcast was drafted before audio rendering became automatic.
</p>
</div>
<div className="mx-5 h-px bg-border/50" />
<div className="flex justify-end px-5 py-3">
<RegenerateButton podcast={podcast} />
</div>
</div>
);
case "ready":
return (
<PodcastPlayer
podcastId={podcast.id}
title={title}
durationMs={podcast.durationSeconds ? podcast.durationSeconds * 1000 : undefined}
/>
<div>
<PodcastPlayer
podcastId={podcast.id}
title={title}
durationMs={podcast.durationSeconds ? podcast.durationSeconds * 1000 : undefined}
/>
<div className="-mt-2 mb-4 flex max-w-lg justify-end">
<RegenerateButton podcast={podcast} />
</div>
</div>
);
case "failed":
return <PodcastErrorState title={title} error={podcast.error || "Generation failed"} />;

View file

@ -9,7 +9,6 @@ import {
} from "@/components/ui/sheet";
import type { LivePodcast } from "@/hooks/use-podcast-live";
import { BriefReview } from "./brief-review";
import { TranscriptReview } from "./transcript-review";
interface PodcastReviewSheetProps {
podcast: LivePodcast;
@ -18,49 +17,34 @@ interface PodcastReviewSheetProps {
}
/**
* The podcast panel: hosts whichever gate the lifecycle is waiting on. The
* pushed status decides the content, so the same sheet serves both gates and
* simply closes once the podcast moves on.
* The podcast panel: hosts the brief gate, the only approval in the lifecycle —
* after it the episode generates unattended.
*/
export function PodcastReviewSheet({ podcast, open, onOpenChange }: PodcastReviewSheetProps) {
const close = () => onOpenChange(false);
const gate =
podcast.status === "awaiting_brief" && podcast.spec ? (
<>
<SheetHeader>
<SheetTitle>Review podcast brief</SheetTitle>
<SheetDescription>
Confirm the language, voices, and length before the transcript is drafted.
</SheetDescription>
</SheetHeader>
<div className="overflow-y-auto px-4 pb-4">
<BriefReview podcast={podcast} spec={podcast.spec} onApproved={close} />
</div>
</>
) : podcast.status === "awaiting_review" ? (
<>
<SheetHeader>
<SheetTitle>Review transcript</SheetTitle>
<SheetDescription>
Approve the script to render the audio, or regenerate a fresh draft.
</SheetDescription>
</SheetHeader>
<div className="min-h-0 flex-1 px-4 pb-4">
<TranscriptReview podcast={podcast} onDecided={close} />
</div>
</>
) : (
<SheetHeader>
<SheetTitle>{podcast.title}</SheetTitle>
<SheetDescription>Nothing is awaiting review right now.</SheetDescription>
</SheetHeader>
);
return (
<Sheet open={open} onOpenChange={onOpenChange}>
<SheetContent side="right" className="flex w-full flex-col sm:max-w-xl">
{gate}
{podcast.status === "awaiting_brief" && podcast.spec ? (
<>
<SheetHeader>
<SheetTitle>Review podcast brief</SheetTitle>
<SheetDescription>
Confirm the language, voices, and length the episode generates unattended after
this.
</SheetDescription>
</SheetHeader>
<div className="overflow-y-auto px-4 pb-4">
<BriefReview podcast={podcast} spec={podcast.spec} onApproved={close} />
</div>
</>
) : (
<SheetHeader>
<SheetTitle>{podcast.title}</SheetTitle>
<SheetDescription>Nothing is awaiting review right now.</SheetDescription>
</SheetHeader>
)}
</SheetContent>
</Sheet>
);

View file

@ -1,118 +0,0 @@
"use client";
import { Loader2 } from "lucide-react";
import { useEffect, useState } from "react";
import { toast } from "sonner";
import { TextShimmerLoader } from "@/components/prompt-kit/loader";
import { Button } from "@/components/ui/button";
import type { PodcastDetail } from "@/contracts/types/podcast.types";
import type { LivePodcast } from "@/hooks/use-podcast-live";
import { podcastsApiService } from "@/lib/apis/podcasts-api.service";
import { speakerLabel } from "./schema";
interface TranscriptReviewProps {
podcast: LivePodcast;
onDecided: () => void;
}
/**
 * Gate 2 of the podcast lifecycle: presents the drafted script, read-only, so
 * the user can approve it, ask for a fresh draft, or cancel the podcast before
 * the expensive render. `onDecided` is invoked once the chosen action succeeds.
 */
export function TranscriptReview({ podcast, onDecided }: TranscriptReviewProps) {
  const [detail, setDetail] = useState<PodcastDetail | null>(null);
  const [loadError, setLoadError] = useState<string | null>(null);
  const [pendingAction, setPendingAction] = useState<"approve" | "regenerate" | "cancel" | null>(
    null
  );

  // (Re)load the podcast detail whenever the podcast id changes.
  useEffect(() => {
    // Flipped by the cleanup so a response that arrives late is discarded.
    let stale = false;
    setDetail(null);
    setLoadError(null);
    podcastsApiService
      .getDetail(podcast.id)
      .then((loaded) => {
        if (stale) return;
        setDetail(loaded);
      })
      .catch((reason) => {
        if (stale) return;
        setLoadError(reason instanceof Error ? reason.message : "Failed to load the transcript");
      });
    return () => {
      stale = true;
    };
  }, [podcast.id]);

  // Runs one of the three decisions, tracking which one is in flight so the
  // matching button can show a spinner while every button is disabled.
  const act = async (
    action: "approve" | "regenerate" | "cancel",
    perform: () => Promise<unknown>
  ) => {
    setPendingAction(action);
    try {
      await perform();
      onDecided();
    } catch (reason) {
      toast.error(reason instanceof Error ? reason.message : "Action failed");
    } finally {
      setPendingAction(null);
    }
  };

  if (loadError) {
    return <p className="text-sm text-destructive">{loadError}</p>;
  }
  if (!detail) {
    return <TextShimmerLoader text="Loading transcript" size="sm" />;
  }

  const turns = detail.transcript?.turns ?? [];
  const busy = pendingAction !== null;
  const hasTurns = turns.length !== 0;
  // Spinner shown only inside the button whose action is currently running.
  const spinnerFor = (action: "approve" | "regenerate" | "cancel") =>
    pendingAction === action ? <Loader2 className="size-4 animate-spin" /> : null;

  return (
    <div className="flex h-full flex-col gap-4">
      <div className="flex-1 space-y-3 overflow-y-auto rounded-lg border bg-muted/30 p-4 select-text">
        {turns.map((turn, idx) => (
          <div key={`${idx}-${turn.speaker}`} className="text-sm">
            <span className="font-medium text-primary">
              {speakerLabel(detail.spec, turn.speaker)}:
            </span>{" "}
            <span className="text-muted-foreground">{turn.text}</span>
          </div>
        ))}
        {hasTurns ? null : (
          <p className="text-sm text-muted-foreground">No transcript available.</p>
        )}
      </div>
      <div className="flex justify-end gap-2">
        <Button
          type="button"
          variant="ghost"
          disabled={busy}
          onClick={() => act("cancel", () => podcastsApiService.cancel(podcast.id))}
        >
          {spinnerFor("cancel")}
          Cancel podcast
        </Button>
        <Button
          type="button"
          variant="outline"
          disabled={busy}
          onClick={() =>
            act("regenerate", () => podcastsApiService.regenerateTranscript(podcast.id))
          }
        >
          {spinnerFor("regenerate")}
          Regenerate
        </Button>
        <Button
          type="button"
          disabled={busy || !hasTurns}
          onClick={() => act("approve", () => podcastsApiService.approveTranscript(podcast.id))}
        >
          {spinnerFor("approve")}
          Approve &amp; render audio
        </Button>
      </div>
    </div>
  );
}

View file

@ -0,0 +1,98 @@
"use client";
import { Loader2, Play, Square } from "lucide-react";
import { useEffect, useRef, useState } from "react";
import { toast } from "sonner";
import { Button } from "@/components/ui/button";
import { podcastsApiService } from "@/lib/apis/podcasts-api.service";
// Comparing voices means replaying the same samples, so the object-URL promise
// for each voice is cached here, keyed by voice id, and reused by every button
// for the lifetime of the page. Entries whose fetch failed are evicted by
// getSampleUrl so a later click can retry.
const sampleUrls = new Map<string, Promise<string>>();
// Overlapping samples are useless for comparison, so only one plays at a time.
// `activeAudio` is the element that currently owns playback (null when nothing
// is playing); it is shared by all VoicePreviewButton instances.
let activeAudio: HTMLAudioElement | null = null;
// Stops `activeAudio` and resets the button that started it; installed by that
// button when playback begins and cleared again when playback stops or ends.
let stopActive: (() => void) | null = null;
/**
 * Resolves to an object URL for the audio sample of `voiceId`.
 *
 * The promise itself is cached in `sampleUrls`, so concurrent and repeated
 * requests for the same voice share a single fetch.
 */
function getSampleUrl(voiceId: string): Promise<string> {
  const cached = sampleUrls.get(voiceId);
  if (cached) {
    return cached;
  }

  const pending = podcastsApiService
    .previewVoice(voiceId)
    .then((blob) => URL.createObjectURL(blob));
  // A failed fetch must not poison the cache for retries.
  pending.catch(() => sampleUrls.delete(voiceId));
  sampleUrls.set(voiceId, pending);
  return pending;
}
/**
 * Plays a short sample of `voiceId` so users pick voices by sound.
 *
 * Playback is coordinated through the module-level `activeAudio` / `stopActive`
 * pair so that at most one sample is audible across all buttons. The button
 * cycles through "idle" -> "loading" -> "playing" and returns to "idle" when
 * the sample ends, is stopped, fails, or the `voiceId` prop changes.
 */
export function VoicePreviewButton({ voiceId }: { voiceId: string }) {
  const [state, setState] = useState<"idle" | "loading" | "playing">("idle");
  const mountedRef = useRef(true);
  // Identifies the latest play request. It is bumped when the voice changes or
  // the button unmounts, so a sample still loading for the old voice is dropped
  // instead of being played under the new one.
  const requestRef = useRef(0);
  // The element this button started, so cleanup never stops a sample that
  // belongs to another button showing the same voice.
  const ownAudioRef = useRef<HTMLAudioElement | null>(null);

  useEffect(() => {
    mountedRef.current = true;
    // Whatever was loading or playing belonged to the previous voice; without
    // this reset the button would stay stuck on "playing"/"loading" because the
    // cleanup below runs while `mountedRef` is already false.
    setState("idle");
    return () => {
      mountedRef.current = false;
      requestRef.current += 1;
      if (stopActive && activeAudio !== null && activeAudio === ownAudioRef.current) {
        stopActive();
      }
      ownAudioRef.current = null;
    };
  }, [voiceId]);

  const stop = () => {
    if (stopActive) stopActive();
  };

  const play = async () => {
    stop();
    requestRef.current += 1;
    const request = requestRef.current;
    setState("loading");

    // Held in an object so the closures below and the catch block share it.
    const playback: { audio: HTMLAudioElement | null; interrupted: boolean } = {
      audio: null,
      interrupted: false,
    };

    try {
      const url = await getSampleUrl(voiceId);
      if (!mountedRef.current || request !== requestRef.current) return;

      // Another button may have started playing while this sample was loading;
      // stop it first so two samples never overlap.
      stop();

      const audio = new Audio(url);
      playback.audio = audio;
      audio.dataset.voiceId = voiceId;
      ownAudioRef.current = audio;
      activeAudio = audio;
      stopActive = () => {
        // Pausing while play() is still pending rejects its promise; flag the
        // stop as deliberate so it is not reported as a playback failure.
        playback.interrupted = true;
        audio.pause();
        activeAudio = null;
        stopActive = null;
        if (mountedRef.current) setState("idle");
      };
      audio.onended = () => {
        if (activeAudio === audio) {
          activeAudio = null;
          stopActive = null;
        }
        if (mountedRef.current) setState("idle");
      };

      await audio.play();
      if (mountedRef.current && activeAudio === audio) setState("playing");
    } catch (error) {
      // A deliberate stop already reset the shared state and this button.
      if (playback.interrupted) return;
      // Do not leave a dead element registered as the active sample.
      if (playback.audio !== null && activeAudio === playback.audio) {
        activeAudio = null;
        stopActive = null;
      }
      if (mountedRef.current && request === requestRef.current) setState("idle");
      toast.error(error instanceof Error ? error.message : "Couldn't play the voice sample");
    }
  };

  const isPlaying = state === "playing";

  return (
    <Button
      type="button"
      variant="ghost"
      size="icon"
      aria-label={isPlaying ? "Stop voice sample" : "Play voice sample"}
      disabled={state === "loading"}
      onClick={isPlaying ? stop : play}
    >
      {state === "loading" ? (
        <Loader2 className="size-4 animate-spin" />
      ) : isPlaying ? (
        <Square className="size-4" />
      ) : (
        <Play className="size-4" />
      )}
    </Button>
  );
}

View file

@ -16,18 +16,18 @@ export const podcastStatus = z.enum([
]);
export type PodcastStatus = z.infer<typeof podcastStatus>;
/** States waiting on user input before the lifecycle can proceed. */
export const GATE_STATUSES: ReadonlySet<PodcastStatus> = new Set([
"awaiting_brief",
"awaiting_review",
]);
/**
* States waiting on user input before the lifecycle can proceed. The brief is
* the only approval gate; `awaiting_review` survives in the enum for legacy
* rows but is never entered anymore.
*/
export const GATE_STATUSES: ReadonlySet<PodcastStatus> = new Set(["awaiting_brief"]);
/** States from which no further transition is possible. */
export const TERMINAL_STATUSES: ReadonlySet<PodcastStatus> = new Set([
"ready",
"failed",
"cancelled",
]);
/**
* States from which no further transition is possible. A `ready` episode is
* not terminal: it can be sent back to drafting for regeneration.
*/
export const TERMINAL_STATUSES: ReadonlySet<PodcastStatus> = new Set(["failed", "cancelled"]);
// =============================================================================
// Brief (spec) — mirror app/podcasts/schemas/spec.py

View file

@ -37,11 +37,8 @@ class PodcastsApiService {
return baseApiService.post(`${BASE}/${podcastId}/brief/approve`, podcastDetail);
};
approveTranscript = async (podcastId: number) => {
return baseApiService.post(`${BASE}/${podcastId}/transcript/approve`, podcastDetail);
};
regenerateTranscript = async (podcastId: number) => {
// Destructive: the current transcript and audio are replaced by a fresh take.
regenerate = async (podcastId: number) => {
return baseApiService.post(`${BASE}/${podcastId}/transcript/regenerate`, podcastDetail);
};
@ -53,6 +50,11 @@ class PodcastsApiService {
const qs = language ? `?${new URLSearchParams({ language })}` : "";
return baseApiService.get(`${BASE}/voices${qs}`, voiceOptionList);
};
/**
 * Fetches a short audio sample of a voice via `baseApiService.getBlob`.
 *
 * The voice id is URL-encoded because it is interpolated into the request path.
 * NOTE(review): the sample is said to be cached server-side per voice — confirm
 * against the backend preview route.
 *
 * @param voiceId Identifier of the voice to preview.
 * @returns Whatever `getBlob` resolves to for the preview endpoint (callers
 *   hand it to `URL.createObjectURL`).
 */
previewVoice = async (voiceId: string) => {
return baseApiService.getBlob(`${BASE}/voices/${encodeURIComponent(voiceId)}/preview`);
};
}
export const podcastsApiService = new PodcastsApiService();