mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-23 19:05:16 +02:00
feat: init video presentation agent
This commit is contained in:
parent
40d949b7d5
commit
b28f135a96
37 changed files with 3567 additions and 24 deletions
|
|
@ -132,6 +132,17 @@ _TOOL_INSTRUCTIONS["generate_podcast"] = """
|
|||
- After calling this tool, inform the user that podcast generation has started and they will see the player when it's ready (takes 3-5 minutes).
|
||||
"""
|
||||
|
||||
_TOOL_INSTRUCTIONS["generate_video_presentation"] = """
|
||||
- generate_video_presentation: Generate a video presentation from provided content.
|
||||
- Use this when the user asks to create a video, presentation, slides, or slide deck.
|
||||
- Trigger phrases: "give me a presentation", "create slides", "generate a video", "make a slide deck", "turn this into a presentation"
|
||||
- Args:
|
||||
- source_content: The text content to turn into a presentation. The more detailed, the better.
|
||||
- video_title: Optional title (default: "SurfSense Presentation")
|
||||
- user_prompt: Optional style instructions (e.g., "Make it technical and detailed")
|
||||
- After calling this tool, inform the user that generation has started and they will see the presentation when it's ready.
|
||||
"""
|
||||
|
||||
_TOOL_INSTRUCTIONS["generate_report"] = """
|
||||
- generate_report: Generate or revise a structured Markdown report artifact.
|
||||
- WHEN TO CALL THIS TOOL — the message must contain a creation or modification VERB directed at producing a deliverable:
|
||||
|
|
@ -438,6 +449,16 @@ _TOOL_EXAMPLES["generate_podcast"] = """
|
|||
- Then: `generate_podcast(source_content="Key insights about quantum computing from the knowledge base:\\n\\n[Comprehensive summary of all relevant search results with key facts, concepts, and findings]", podcast_title="Quantum Computing Explained")`
|
||||
"""
|
||||
|
||||
_TOOL_EXAMPLES["generate_video_presentation"] = """
|
||||
- User: "Give me a presentation about AI trends based on what we discussed"
|
||||
- First search for relevant content, then call: `generate_video_presentation(source_content="Based on our conversation and search results: [detailed summary of chat + search findings]", video_title="AI Trends Presentation")`
|
||||
- User: "Create slides summarizing this conversation"
|
||||
- Call: `generate_video_presentation(source_content="Complete conversation summary:\\n\\nUser asked about [topic 1]:\\n[Your detailed response]\\n\\nUser then asked about [topic 2]:\\n[Your detailed response]\\n\\n[Continue for all exchanges in the conversation]", video_title="Conversation Summary")`
|
||||
- User: "Make a video presentation about quantum computing"
|
||||
- First search: `search_knowledge_base(query="quantum computing")`
|
||||
- Then: `generate_video_presentation(source_content="Key insights about quantum computing from the knowledge base:\\n\\n[Comprehensive summary of all relevant search results with key facts, concepts, and findings]", video_title="Quantum Computing Explained")`
|
||||
"""
|
||||
|
||||
_TOOL_EXAMPLES["generate_report"] = """
|
||||
- User: "Generate a report about AI trends"
|
||||
- Call: `generate_report(topic="AI Trends Report", source_strategy="kb_search", search_queries=["AI trends recent developments", "artificial intelligence industry trends", "AI market growth and predictions"], report_style="detailed")`
|
||||
|
|
@ -499,6 +520,7 @@ _ALL_TOOL_NAMES_ORDERED = [
|
|||
"search_knowledge_base",
|
||||
"web_search",
|
||||
"generate_podcast",
|
||||
"generate_video_presentation",
|
||||
"generate_report",
|
||||
"link_preview",
|
||||
"display_image",
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ Available tools:
|
|||
- search_knowledge_base: Search the user's personal knowledge base
|
||||
- search_surfsense_docs: Search Surfsense documentation for usage help
|
||||
- generate_podcast: Generate audio podcasts from content
|
||||
- generate_video_presentation: Generate video presentations with slides and narration
|
||||
- generate_image: Generate images from text descriptions using AI models
|
||||
- link_preview: Fetch rich previews for URLs
|
||||
- display_image: Display images in chat
|
||||
|
|
@ -39,6 +40,7 @@ from .registry import (
|
|||
from .scrape_webpage import create_scrape_webpage_tool
|
||||
from .search_surfsense_docs import create_search_surfsense_docs_tool
|
||||
from .user_memory import create_recall_memory_tool, create_save_memory_tool
|
||||
from .video_presentation import create_generate_video_presentation_tool
|
||||
|
||||
__all__ = [
|
||||
# Registry
|
||||
|
|
@ -51,6 +53,7 @@ __all__ = [
|
|||
"create_display_image_tool",
|
||||
"create_generate_image_tool",
|
||||
"create_generate_podcast_tool",
|
||||
"create_generate_video_presentation_tool",
|
||||
"create_link_preview_tool",
|
||||
"create_recall_memory_tool",
|
||||
"create_save_memory_tool",
|
||||
|
|
|
|||
|
|
@ -73,6 +73,7 @@ from .shared_memory import (
|
|||
create_save_shared_memory_tool,
|
||||
)
|
||||
from .user_memory import create_recall_memory_tool, create_save_memory_tool
|
||||
from .video_presentation import create_generate_video_presentation_tool
|
||||
from .web_search import create_web_search_tool
|
||||
|
||||
# =============================================================================
|
||||
|
|
@ -136,6 +137,17 @@ BUILTIN_TOOLS: list[ToolDefinition] = [
|
|||
),
|
||||
requires=["search_space_id", "db_session", "thread_id"],
|
||||
),
|
||||
# Video presentation generation tool
|
||||
ToolDefinition(
|
||||
name="generate_video_presentation",
|
||||
description="Generate a video presentation with slides and narration from provided content",
|
||||
factory=lambda deps: create_generate_video_presentation_tool(
|
||||
search_space_id=deps["search_space_id"],
|
||||
db_session=deps["db_session"],
|
||||
thread_id=deps["thread_id"],
|
||||
),
|
||||
requires=["search_space_id", "db_session", "thread_id"],
|
||||
),
|
||||
# Report generation tool (inline, short-lived sessions for DB ops)
|
||||
# Supports internal KB search via source_strategy so the agent doesn't
|
||||
# need to call search_knowledge_base separately before generating.
|
||||
|
|
|
|||
|
|
@ -0,0 +1,171 @@
|
|||
"""
|
||||
Video presentation generation tool for the SurfSense agent.
|
||||
|
||||
This module provides a factory function for creating the generate_video_presentation
|
||||
tool that submits a Celery task for background video presentation generation.
|
||||
The frontend polls for completion and auto-updates when the presentation is ready.
|
||||
|
||||
Duplicate request prevention:
|
||||
- Only one video presentation can be generated at a time per search space
|
||||
- Uses Redis to track active video presentation tasks
|
||||
- Validates the Redis marker against actual DB status to avoid stale locks
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
|
||||
import redis
|
||||
from langchain_core.tools import tool
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.db import VideoPresentation, VideoPresentationStatus
|
||||
|
||||
REDIS_URL = config.REDIS_APP_URL
|
||||
_redis_client: redis.Redis | None = None
|
||||
|
||||
|
||||
def get_redis_client() -> redis.Redis:
    """Return the shared Redis client used for video presentation task tracking.

    The client is built from ``REDIS_URL`` with ``decode_responses=True`` on the
    first call and cached in the module-level ``_redis_client`` so later calls
    reuse the same instance.
    """
    global _redis_client
    if _redis_client is not None:
        return _redis_client
    _redis_client = redis.from_url(REDIS_URL, decode_responses=True)
    return _redis_client
|
||||
|
||||
|
||||
def _redis_key(search_space_id: int) -> str:
|
||||
return f"video_presentation:generating:{search_space_id}"
|
||||
|
||||
|
||||
def get_generating_video_presentation_id(search_space_id: int) -> int | None:
|
||||
"""Get the video presentation ID currently being generated for this search space."""
|
||||
try:
|
||||
client = get_redis_client()
|
||||
value = client.get(_redis_key(search_space_id))
|
||||
return int(value) if value else None
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def clear_generating_video_presentation(search_space_id: int) -> None:
    """Clear the generating marker (used when we detect a stale lock).

    Args:
        search_space_id: Search space whose Redis marker should be deleted.

    Failures are logged and otherwise ignored: clearing is best-effort, and the
    marker is written with an expiry so it cannot outlive its TTL.
    """
    try:
        get_redis_client().delete(_redis_key(search_space_id))
    except Exception as e:
        print(
            f"[generate_video_presentation] Warning: Could not clear generating video presentation in Redis: {e}"
        )
|
||||
|
||||
|
||||
def set_generating_video_presentation(
    search_space_id: int, video_presentation_id: int, ttl_seconds: int = 1800
) -> None:
    """Mark a video presentation as currently generating for this search space.

    Args:
        search_space_id: Search space the marker is scoped to.
        video_presentation_id: ID of the presentation being generated; stored
            as a string under the search space's Redis key.
        ttl_seconds: Lifetime of the marker in seconds. Defaults to 1800
            (30 minutes), the value that was previously hard-coded, so an
            abandoned marker expires on its own.

    Failures are logged as a warning and otherwise ignored (best-effort).
    """
    try:
        client = get_redis_client()
        client.setex(
            _redis_key(search_space_id), ttl_seconds, str(video_presentation_id)
        )
    except Exception as e:
        print(
            f"[generate_video_presentation] Warning: Could not set generating video presentation in Redis: {e}"
        )
|
||||
|
||||
|
||||
def create_generate_video_presentation_tool(
    search_space_id: int,
    db_session: AsyncSession,
    thread_id: int | None = None,
):
    """
    Factory function to create the generate_video_presentation tool with injected dependencies.

    Pre-creates video presentation record with pending status so the ID is available
    immediately for frontend polling.

    Args:
        search_space_id: Search space the presentation belongs to; also scopes
            the Redis "generating" marker.
        db_session: Async SQLAlchemy session used to look up and create
            ``VideoPresentation`` rows.
        thread_id: Optional chat thread ID stored on the created record.

    Returns:
        The ``generate_video_presentation`` tool bound to the given dependencies.
    """

    @tool
    async def generate_video_presentation(
        source_content: str,
        video_title: str = "SurfSense Presentation",
        user_prompt: str | None = None,
    ) -> dict[str, Any]:
        """Generate a video presentation from the provided content.

        Use this tool when the user asks to create a video, presentation, slides, or slide deck.

        Args:
            source_content: The text content to turn into a presentation.
            video_title: Title for the presentation (default: "SurfSense Presentation")
            user_prompt: Optional style/tone instructions.
        """
        try:
            generating_id = get_generating_video_presentation_id(search_space_id)
            if generating_id:
                result = await db_session.execute(
                    select(VideoPresentation).filter(
                        VideoPresentation.id == generating_id
                    )
                )
                existing = result.scalars().first()

                # A locked record that is still PENDING has been queued but not
                # yet picked up; it is as much "in flight" as a GENERATING one.
                # Treating it as stale would let a second request start a
                # duplicate job during the queue window. The Redis marker's TTL
                # still bounds how long an abandoned PENDING record can block.
                active_statuses = (
                    VideoPresentationStatus.PENDING,
                    VideoPresentationStatus.GENERATING,
                )
                if existing and existing.status in active_statuses:
                    print(
                        f"[generate_video_presentation] Blocked duplicate — "
                        f"presentation {generating_id} is actively generating"
                    )
                    return {
                        "status": VideoPresentationStatus.GENERATING.value,
                        "video_presentation_id": generating_id,
                        "title": video_title,
                        "message": "A video presentation is already being generated. Please wait for it to complete.",
                    }

                # The marker points at a finished, failed, or missing record.
                print(
                    f"[generate_video_presentation] Stale Redis lock for presentation {generating_id} "
                    f"(status={existing.status if existing else 'not found'}). Clearing and proceeding."
                )
                clear_generating_video_presentation(search_space_id)

            # Persist the record first so its ID can be returned to the caller
            # and handed to the background task.
            video_pres = VideoPresentation(
                title=video_title,
                status=VideoPresentationStatus.PENDING,
                search_space_id=search_space_id,
                thread_id=thread_id,
            )
            db_session.add(video_pres)
            await db_session.commit()
            await db_session.refresh(video_pres)

            # Imported at call time rather than module load
            # (NOTE(review): presumably to avoid a circular import — confirm).
            from app.tasks.celery_tasks.video_presentation_tasks import (
                generate_video_presentation_task,
            )

            task = generate_video_presentation_task.delay(
                video_presentation_id=video_pres.id,
                source_content=source_content,
                search_space_id=search_space_id,
                user_prompt=user_prompt,
            )

            # Set the marker only after the task was submitted, so a failed
            # submission never leaves a lock pointing at a job that will not run.
            set_generating_video_presentation(search_space_id, video_pres.id)

            print(
                f"[generate_video_presentation] Created video presentation {video_pres.id}, task: {task.id}"
            )

            return {
                "status": VideoPresentationStatus.PENDING.value,
                "video_presentation_id": video_pres.id,
                "title": video_title,
                "message": "Video presentation generation started. This may take a few minutes.",
            }

        except Exception as e:
            # Tool boundary: report the failure as a structured result instead
            # of raising.
            error_message = str(e)
            print(f"[generate_video_presentation] Error: {error_message}")
            return {
                "status": VideoPresentationStatus.FAILED.value,
                "error": error_message,
                "title": video_title,
                "video_presentation_id": None,
            }

    return generate_video_presentation
|
||||
10
surfsense_backend/app/agents/video_presentation/__init__.py
Normal file
10
surfsense_backend/app/agents/video_presentation/__init__.py
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
"""Video Presentation LangGraph Agent.
|
||||
|
||||
This module defines a graph for generating video presentations
|
||||
from source content, similar to the podcaster agent but producing
|
||||
slide-based video presentations with TTS narration.
|
||||
"""
|
||||
|
||||
from .graph import graph
|
||||
|
||||
__all__ = ["graph"]
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
"""Define the configurable parameters for the video presentation agent."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, fields
|
||||
|
||||
from langchain_core.runnables import RunnableConfig
|
||||
|
||||
|
||||
@dataclass(kw_only=True)
|
||||
class Configuration:
|
||||
"""The configuration for the video presentation agent."""
|
||||
|
||||
video_title: str
|
||||
search_space_id: int
|
||||
user_prompt: str | None = None
|
||||
|
||||
@classmethod
|
||||
def from_runnable_config(
|
||||
cls, config: RunnableConfig | None = None
|
||||
) -> Configuration:
|
||||
"""Create a Configuration instance from a RunnableConfig object."""
|
||||
configurable = (config.get("configurable") or {}) if config else {}
|
||||
_fields = {f.name for f in fields(cls) if f.init}
|
||||
return cls(**{k: v for k, v in configurable.items() if k in _fields})
|
||||
30
surfsense_backend/app/agents/video_presentation/graph.py
Normal file
30
surfsense_backend/app/agents/video_presentation/graph.py
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
from langgraph.graph import StateGraph
|
||||
|
||||
from .configuration import Configuration
|
||||
from .nodes import (
|
||||
create_presentation_slides,
|
||||
create_slide_audio,
|
||||
generate_slide_scene_codes,
|
||||
)
|
||||
from .state import State
|
||||
|
||||
|
||||
def build_graph():
    """Assemble and compile the video presentation pipeline.

    The graph is strictly linear: slide creation, then slide audio, then
    scene code generation, wired between the ``__start__`` and ``__end__``
    nodes. The compiled graph is named "Surfsense Video Presentation".
    """
    stages = (
        ("create_presentation_slides", create_presentation_slides),
        ("create_slide_audio", create_slide_audio),
        ("generate_slide_scene_codes", generate_slide_scene_codes),
    )

    workflow = StateGraph(State, config_schema=Configuration)
    for stage_name, stage_fn in stages:
        workflow.add_node(stage_name, stage_fn)

    # Chain every stage to the next one, bracketed by the start/end markers.
    route = ["__start__", *(stage_name for stage_name, _ in stages), "__end__"]
    for source, target in zip(route, route[1:]):
        workflow.add_edge(source, target)

    compiled = workflow.compile()
    compiled.name = "Surfsense Video Presentation"
    return compiled
|
||||
|
||||
|
||||
graph = build_graph()
|
||||
552
surfsense_backend/app/agents/video_presentation/nodes.py
Normal file
552
surfsense_backend/app/agents/video_presentation/nodes.py
Normal file
|
|
@ -0,0 +1,552 @@
|
|||
import asyncio
|
||||
import contextlib
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import shutil
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from ffmpeg.asyncio import FFmpeg
|
||||
from langchain_core.messages import HumanMessage, SystemMessage
|
||||
from langchain_core.runnables import RunnableConfig
|
||||
from litellm import aspeech
|
||||
|
||||
from app.config import config as app_config
|
||||
from app.services.kokoro_tts_service import get_kokoro_tts_service
|
||||
from app.services.llm_service import get_agent_llm
|
||||
|
||||
from .configuration import Configuration
|
||||
from .prompts import (
|
||||
DEFAULT_DURATION_IN_FRAMES,
|
||||
FPS,
|
||||
REFINE_SCENE_SYSTEM_PROMPT,
|
||||
REMOTION_SCENE_SYSTEM_PROMPT,
|
||||
THEME_PRESETS,
|
||||
build_scene_generation_user_prompt,
|
||||
build_theme_assignment_user_prompt,
|
||||
get_slide_generation_prompt,
|
||||
get_theme_assignment_system_prompt,
|
||||
pick_theme_and_mode_fallback,
|
||||
)
|
||||
from .state import (
|
||||
PresentationSlides,
|
||||
SlideAudioResult,
|
||||
SlideContent,
|
||||
SlideSceneCode,
|
||||
State,
|
||||
)
|
||||
from .utils import get_voice_for_provider
|
||||
|
||||
MAX_REFINE_ATTEMPTS = 3
|
||||
|
||||
|
||||
async def create_presentation_slides(
    state: State, config: RunnableConfig
) -> dict[str, Any]:
    """Parse source content into structured presentation slides using LLM.

    The LLM reply is first parsed as a whole JSON document. If that fails, the
    text between the first ``{`` and the last ``}`` is parsed instead, which
    tolerates replies that wrap the JSON in prose or markdown.

    Args:
        state: Graph state; ``db_session`` and ``source_content`` are read.
        config: Runnable config carrying ``search_space_id`` and the optional
            ``user_prompt`` under ``configurable``.

    Returns:
        ``{"slides": [...]}`` with the validated slides.

    Raises:
        RuntimeError: No LLM is configured for the search space.
        ValueError: The reply could not be parsed or validated, even with the
            fallback (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    configuration = Configuration.from_runnable_config(config)
    search_space_id = configuration.search_space_id
    user_prompt = configuration.user_prompt

    llm = await get_agent_llm(state.db_session, search_space_id)
    if not llm:
        error_message = f"No LLM configured for search space {search_space_id}"
        print(error_message)
        raise RuntimeError(error_message)

    prompt = get_slide_generation_prompt(user_prompt)

    messages = [
        SystemMessage(content=prompt),
        HumanMessage(
            content=f"<source_content>{state.source_content}</source_content>"
        ),
    ]

    llm_response = await llm.ainvoke(messages)
    content = llm_response.content

    try:
        presentation = PresentationSlides.model_validate(json.loads(content))
    except (json.JSONDecodeError, ValueError) as e:
        print(f"Direct JSON parsing failed, trying fallback approach: {e!s}")

        try:
            # Fallback: take the outermost {...} span of the reply.
            json_start = content.find("{")
            json_end = content.rfind("}") + 1
            if json_start >= 0 and json_end > json_start:
                parsed_data = json.loads(content[json_start:json_end])
                presentation = PresentationSlides.model_validate(parsed_data)
                print("Successfully parsed presentation slides using fallback approach")
            else:
                error_message = f"Could not find valid JSON in LLM response. Raw response: {content}"
                print(error_message)
                raise ValueError(error_message)

        except (json.JSONDecodeError, ValueError) as e2:
            print(f"Error parsing LLM response: {e2!s}")
            print(f"Raw response: {content}")
            # Re-raise unchanged so callers see the original exception type.
            raise

    return {"slides": presentation.slides}
|
||||
|
||||
|
||||
async def create_slide_audio(state: State, config: RunnableConfig) -> dict[str, Any]:
    """Generate TTS audio for each slide.

    Each entry of a slide's ``speaker_transcripts`` is synthesized as its own
    chunk file under ``temp_audio``. A single chunk is moved to the output
    directory as-is; several chunks are concatenated with ffmpeg. Slides
    without transcripts get no audio file and the default duration. All slides
    are processed concurrently.

    Args:
        state: Graph state; ``state.slides`` is read.
        config: Runnable config (not read by this node).

    Returns:
        ``{"slide_audio_results": [...]}`` with one ``SlideAudioResult`` per
        slide, sorted by slide number.

    Raises:
        Exception: Any TTS, ffmpeg, or filesystem error for a slide is logged
            and re-raised.
    """
    session_id = str(uuid.uuid4())
    temp_dir = Path("temp_audio")
    temp_dir.mkdir(exist_ok=True)
    output_dir = Path("video_presentation_audio")
    output_dir.mkdir(exist_ok=True)

    slides = state.slides or []
    voice = get_voice_for_provider(app_config.TTS_SERVICE, speaker_id=0)
    ext = "wav" if app_config.TTS_SERVICE == "local/kokoro" else "mp3"

    async def _generate_tts_chunk(text: str, chunk_path: str) -> str:
        """Generate a single TTS chunk and write it to *chunk_path*."""
        if app_config.TTS_SERVICE == "local/kokoro":
            kokoro_service = await get_kokoro_tts_service(lang_code="a")
            await kokoro_service.generate_speech(
                text=text,
                voice=voice,
                speed=1.0,
                output_path=chunk_path,
            )
        else:
            kwargs: dict[str, Any] = {
                "model": app_config.TTS_SERVICE,
                "api_key": app_config.TTS_SERVICE_API_KEY,
                "voice": voice,
                "input": text,
                "max_retries": 2,
                "timeout": 600,
            }
            if app_config.TTS_SERVICE_API_BASE:
                kwargs["api_base"] = app_config.TTS_SERVICE_API_BASE

            response = await aspeech(**kwargs)
            with open(chunk_path, "wb") as f:
                f.write(response.content)

        return chunk_path

    async def _concat_with_ffmpeg(chunk_paths: list[str], output_file: str) -> None:
        """Concatenate multiple audio chunks into one file using async ffmpeg."""
        ffmpeg = FFmpeg().option("y")
        for chunk in chunk_paths:
            ffmpeg = ffmpeg.input(chunk)

        # Feed the first stream of every input into the concat filter.
        chunk_count = len(chunk_paths)
        input_labels = "".join(f"[{i}:0]" for i in range(chunk_count))
        filter_str = input_labels + f"concat=n={chunk_count}:v=0:a=1[outa]"
        ffmpeg = ffmpeg.option("filter_complex", filter_str)
        ffmpeg = ffmpeg.output(output_file, map="[outa]")
        await ffmpeg.execute()

    async def generate_audio_for_slide(slide: SlideContent) -> SlideAudioResult:
        """Synthesize, assemble, and measure the audio for one slide."""
        transcripts = slide.speaker_transcripts
        if not transcripts:
            print(
                f"Slide {slide.slide_number}: no speaker_transcripts, "
                f"using default duration ({DEFAULT_DURATION_IN_FRAMES} frames)"
            )
            return SlideAudioResult(
                slide_number=slide.slide_number,
                audio_file="",
                duration_seconds=DEFAULT_DURATION_IN_FRAMES / FPS,
                duration_in_frames=DEFAULT_DURATION_IN_FRAMES,
            )

        output_file = str(output_dir / f"{session_id}_slide_{slide.slide_number}.{ext}")

        chunk_paths: list[str] = []
        try:
            for i, text in enumerate(transcripts):
                chunk_path = str(
                    temp_dir
                    / f"{session_id}_slide_{slide.slide_number}_chunk_{i}.{ext}"
                )
                print(
                    f"  Slide {slide.slide_number} chunk {i + 1}/"
                    f"{len(transcripts)}: "
                    f'"{text[:60]}..."'
                )
                # Record the path before synthesis so a partially written
                # chunk is still removed by the cleanup below if TTS fails.
                chunk_paths.append(chunk_path)
                await _generate_tts_chunk(text, chunk_path)

            if len(chunk_paths) == 1:
                shutil.move(chunk_paths[0], output_file)
            else:
                print(
                    f"  Concatenating {len(chunk_paths)} chunks for slide "
                    f"{slide.slide_number} with ffmpeg"
                )
                await _concat_with_ffmpeg(chunk_paths, output_file)

            duration_seconds = await _get_audio_duration(output_file)
            duration_in_frames = math.ceil(duration_seconds * FPS)

            return SlideAudioResult(
                slide_number=slide.slide_number,
                audio_file=output_file,
                duration_seconds=duration_seconds,
                # Never schedule a slide for less than the default frame count.
                duration_in_frames=max(duration_in_frames, DEFAULT_DURATION_IN_FRAMES),
            )

        except Exception as e:
            print(f"Error generating audio for slide {slide.slide_number}: {e!s}")
            raise
        finally:
            # Chunks that were moved or never created are simply skipped.
            for p in chunk_paths:
                with contextlib.suppress(OSError):
                    os.remove(p)

    tasks = [generate_audio_for_slide(slide) for slide in slides]
    audio_results = await asyncio.gather(*tasks)

    audio_results_sorted = sorted(audio_results, key=lambda r: r.slide_number)

    print(
        f"Generated audio for {len(audio_results_sorted)} slides "
        f"(total duration: {sum(r.duration_seconds for r in audio_results_sorted):.1f}s)"
    )

    return {"slide_audio_results": audio_results_sorted}
|
||||
|
||||
|
||||
async def _get_audio_duration(file_path: str) -> float:
|
||||
"""Get audio duration in seconds using ffprobe (via python-ffmpeg).
|
||||
|
||||
Falls back to file-size estimation if ffprobe fails.
|
||||
"""
|
||||
try:
|
||||
import subprocess
|
||||
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
"ffprobe",
|
||||
"-v",
|
||||
"error",
|
||||
"-show_entries",
|
||||
"format=duration",
|
||||
"-of",
|
||||
"default=noprint_wrappers=1:nokey=1",
|
||||
file_path,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=10)
|
||||
if proc.returncode == 0 and stdout.strip():
|
||||
return float(stdout.strip())
|
||||
except Exception as e:
|
||||
print(f"ffprobe failed for {file_path}: {e!s}, using file-size estimation")
|
||||
|
||||
try:
|
||||
file_size = os.path.getsize(file_path)
|
||||
if file_path.endswith(".wav"):
|
||||
return file_size / (16000 * 2)
|
||||
else:
|
||||
return file_size / 16000
|
||||
except Exception:
|
||||
return DEFAULT_DURATION_IN_FRAMES / FPS
|
||||
|
||||
|
||||
async def _assign_themes_with_llm(
    llm, slides: list[SlideContent]
) -> dict[int, tuple[str, str]]:
    """Ask the LLM to assign a theme+mode to each slide in one call.

    Args:
        llm: Chat model exposing ``ainvoke``.
        slides: Slides to assign themes to.

    Returns:
        A dict mapping slide_number → (theme, mode). Entries the LLM omitted or
        returned in an invalid form are filled by
        ``pick_theme_and_mode_fallback``; if the call or parsing fails
        entirely, every slide uses that fallback.
    """
    total = len(slides)

    def _fallback(slide: SlideContent) -> tuple[str, str]:
        # Fallback is indexed from zero, slide numbers from one.
        return pick_theme_and_mode_fallback(slide.slide_number - 1, total)

    slide_summaries = [
        {
            "slide_number": s.slide_number,
            "title": s.title,
            "subtitle": s.subtitle or "",
            "background_explanation": s.background_explanation or "",
        }
        for s in slides
    ]

    system = get_theme_assignment_system_prompt()
    user = build_theme_assignment_user_prompt(slide_summaries)

    try:
        response = await llm.ainvoke(
            [
                SystemMessage(content=system),
                HumanMessage(content=user),
            ]
        )

        text = response.content.strip()
        if text.startswith("```"):
            # Drop markdown fence lines around the JSON payload.
            text = "\n".join(
                line for line in text.split("\n") if not line.strip().startswith("```")
            ).strip()

        assignments = json.loads(text)
        valid_themes = set(THEME_PRESETS)
        known_numbers = {s.slide_number for s in slides}
        result: dict[int, tuple[str, str]] = {}
        for entry in assignments:
            sn = entry.get("slide_number")
            # Models sometimes emit the number as a string ("3"); normalize it
            # so the key matches the integer slide numbers used for lookup.
            if isinstance(sn, str) and sn.strip().isdigit():
                sn = int(sn)
            theme = entry.get("theme", "").upper()
            mode = entry.get("mode", "dark").lower()
            # Only accept assignments for slides that actually exist.
            if sn in known_numbers and theme in valid_themes and mode in ("dark", "light"):
                result[sn] = (theme, mode)

        if len(result) == total:
            print(
                "LLM theme assignment: "
                + ", ".join(f"S{sn}={t}/{m}" for sn, (t, m) in sorted(result.items()))
            )
            return result

        print(
            f"LLM returned {len(result)}/{total} valid assignments, "
            "filling gaps with fallback"
        )
        for s in slides:
            if s.slide_number not in result:
                result[s.slide_number] = _fallback(s)
        return result

    except Exception as e:
        # Theme choice is cosmetic: any failure degrades to the fallback
        # instead of aborting the pipeline.
        print(f"LLM theme assignment failed ({e!s}), using fallback")
        return {s.slide_number: _fallback(s) for s in slides}
|
||||
|
||||
|
||||
async def generate_slide_scene_codes(
    state: State, config: RunnableConfig
) -> dict[str, Any]:
    """Produce Remotion scene code for every slide, one LLM call per slide.

    A theme and mode are assigned to all slides up front in a single LLM
    call. Each slide is then turned into scene code in order, using the frame
    count from its audio result (or the default when it has none), and the
    code is passed through the syntax check / refine step before being kept.

    Returns:
        ``{"slide_scene_codes": [...]}`` in slide order.

    Raises:
        RuntimeError: No LLM is configured for the search space, or a slide's
            code could not be repaired by the refine step.
    """
    search_space_id = Configuration.from_runnable_config(config).search_space_id

    llm = await get_agent_llm(state.db_session, search_space_id)
    if not llm:
        raise RuntimeError(f"No LLM configured for search space {search_space_id}")

    slides = state.slides or []
    total_slides = len(slides)

    # Index audio results by slide number for per-slide lookup.
    audio_by_slide: dict[int, SlideAudioResult] = {}
    for audio_result in state.slide_audio_results or []:
        audio_by_slide[audio_result.slide_number] = audio_result

    theme_assignments = await _assign_themes_with_llm(llm, slides)

    scene_codes: list[SlideSceneCode] = []
    for slide in slides:
        number = slide.slide_number

        audio = audio_by_slide.get(number)
        if audio:
            duration = audio.duration_in_frames
        else:
            duration = DEFAULT_DURATION_IN_FRAMES

        fallback_choice = pick_theme_and_mode_fallback(number - 1, total_slides)
        theme, mode = theme_assignments.get(number, fallback_choice)

        scene_request = build_scene_generation_user_prompt(
            slide_number=number,
            total_slides=total_slides,
            title=slide.title,
            subtitle=slide.subtitle,
            content_in_markdown=slide.content_in_markdown,
            background_explanation=slide.background_explanation,
            duration_in_frames=duration,
            theme=theme,
            mode=mode,
        )

        print(
            f"Generating scene code for slide {number}/{total_slides}: "
            f'"{slide.title}" ({duration} frames)'
        )

        llm_response = await llm.ainvoke(
            [
                SystemMessage(content=REMOTION_SCENE_SYSTEM_PROMPT),
                HumanMessage(content=scene_request),
            ]
        )
        code, scene_title = _extract_code_and_title(llm_response.content)
        code = await _refine_if_needed(llm, code, number)

        # Prefer the title the model returned; fall back to the slide's own.
        scene = SlideSceneCode(
            slide_number=number,
            code=code,
            title=scene_title or slide.title,
        )
        scene_codes.append(scene)

        print(f"Scene code ready for slide {number} ({len(code)} chars)")

    return {"slide_scene_codes": scene_codes}
|
||||
|
||||
|
||||
def _extract_code_and_title(content: str) -> tuple[str, str | None]:
|
||||
"""Extract code and optional title from LLM response.
|
||||
|
||||
The LLM may return a JSON object like the POC's structured output:
|
||||
{ "code": "...", "title": "..." }
|
||||
Or it may return raw code (with optional markdown fences).
|
||||
|
||||
Returns (code, title) where title may be None.
|
||||
"""
|
||||
text = content.strip()
|
||||
|
||||
if text.startswith("{"):
|
||||
try:
|
||||
parsed = json.loads(text)
|
||||
if isinstance(parsed, dict) and "code" in parsed:
|
||||
return parsed["code"], parsed.get("title")
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
pass
|
||||
|
||||
json_start = text.find("{")
|
||||
json_end = text.rfind("}") + 1
|
||||
if json_start >= 0 and json_end > json_start:
|
||||
try:
|
||||
parsed = json.loads(text[json_start:json_end])
|
||||
if isinstance(parsed, dict) and "code" in parsed:
|
||||
return parsed["code"], parsed.get("title")
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
pass
|
||||
|
||||
code = text
|
||||
if code.startswith("```"):
|
||||
lines = code.split("\n")
|
||||
start = 1
|
||||
end = len(lines)
|
||||
for i in range(len(lines) - 1, 0, -1):
|
||||
if lines[i].strip().startswith("```"):
|
||||
end = i
|
||||
break
|
||||
code = "\n".join(lines[start:end]).strip()
|
||||
|
||||
return code, None
|
||||
|
||||
|
||||
async def _refine_if_needed(llm, code: str, slide_number: int) -> str:
    """Validate generated scene code and ask the LLM to repair it when needed.

    Code that passes ``_basic_syntax_check`` is returned untouched. Otherwise
    the code and the reported problem are sent back to the LLM, up to
    ``MAX_REFINE_ATTEMPTS`` times, re-checking after every reply.

    Raises:
        RuntimeError: The code still fails the check after the last attempt.
    """
    problem = _basic_syntax_check(code)
    attempt = 0

    while problem is not None and attempt < MAX_REFINE_ATTEMPTS:
        attempt += 1
        print(
            f"Slide {slide_number}: syntax issue (attempt {attempt}/{MAX_REFINE_ATTEMPTS}): {problem}"
        )

        repair_request = (
            f"Here is the broken Remotion component code:\n\n{code}\n\n"
            f"Compilation error:\n{problem}\n\nFix the code."
        )
        reply = await llm.ainvoke(
            [
                SystemMessage(content=REFINE_SCENE_SYSTEM_PROMPT),
                HumanMessage(content=repair_request),
            ]
        )
        code, _ = _extract_code_and_title(reply.content)

        problem = _basic_syntax_check(code)
        if problem is None:
            print(f"Slide {slide_number}: fixed on attempt {attempt}")

    if problem is None:
        return code

    raise RuntimeError(
        f"Slide {slide_number} failed to compile after {MAX_REFINE_ATTEMPTS} "
        f"refine attempts. Last error: {problem}"
    )
|
||||
|
||||
|
||||
def _basic_syntax_check(code: str) -> str | None:
|
||||
"""Run a lightweight syntax check on the generated code.
|
||||
|
||||
Full Babel-based compilation happens on the frontend. This backend check
|
||||
catches the most common LLM code-generation mistakes so the refine loop
|
||||
can fix them before persisting.
|
||||
|
||||
Returns an error description or None if the code looks valid.
|
||||
"""
|
||||
if not code or not code.strip():
|
||||
return "Empty code"
|
||||
|
||||
if "export" not in code and "MyComposition" not in code:
|
||||
return "Missing exported component (expected 'export const MyComposition')"
|
||||
|
||||
brace_count = 0
|
||||
paren_count = 0
|
||||
bracket_count = 0
|
||||
for ch in code:
|
||||
if ch == "{":
|
||||
brace_count += 1
|
||||
elif ch == "}":
|
||||
brace_count -= 1
|
||||
elif ch == "(":
|
||||
paren_count += 1
|
||||
elif ch == ")":
|
||||
paren_count -= 1
|
||||
elif ch == "[":
|
||||
bracket_count += 1
|
||||
elif ch == "]":
|
||||
bracket_count -= 1
|
||||
|
||||
if brace_count < 0:
|
||||
return "Unmatched closing brace '}'"
|
||||
if paren_count < 0:
|
||||
return "Unmatched closing parenthesis ')'"
|
||||
if bracket_count < 0:
|
||||
return "Unmatched closing bracket ']'"
|
||||
|
||||
if brace_count != 0:
|
||||
return f"Unbalanced braces: {brace_count} unclosed"
|
||||
if paren_count != 0:
|
||||
return f"Unbalanced parentheses: {paren_count} unclosed"
|
||||
if bracket_count != 0:
|
||||
return f"Unbalanced brackets: {bracket_count} unclosed"
|
||||
|
||||
if "useCurrentFrame" not in code:
|
||||
return "Missing useCurrentFrame() — required for Remotion animations"
|
||||
|
||||
if "AbsoluteFill" not in code:
|
||||
return "Missing AbsoluteFill — required as the root layout component"
|
||||
|
||||
return None
|
||||
509
surfsense_backend/app/agents/video_presentation/prompts.py
Normal file
509
surfsense_backend/app/agents/video_presentation/prompts.py
Normal file
|
|
@ -0,0 +1,509 @@
|
|||
import datetime
|
||||
|
||||
# TODO: move these to config file

# Upper bound on the number of slides; interpolated into the slide-generation
# prompt, which tells the LLM to treat it as a hard ceiling rather than a target.
MAX_SLIDES = 5

# Frames per second; used to convert a scene's frame count into seconds in the
# scene-generation user prompt.
FPS = 30

# Default scene length in frames (300 frames = 10 s at FPS = 30).
DEFAULT_DURATION_IN_FRAMES = 300
|
||||
|
||||
# Names of the available colour-theme presets. The order matters:
# pick_theme_and_mode_fallback() indexes into this list round-robin by slide
# index. The same names are used as the keys of THEME_DESCRIPTIONS.
THEME_PRESETS: list[str] = [
    "TERRA",
    "OCEAN",
    "SUNSET",
    "EMERALD",
    "ECLIPSE",
    "ROSE",
    "FROST",
    "NEBULA",
    "AURORA",
    "CORAL",
    "MIDNIGHT",
    "AMBER",
    "LAVENDER",
    "STEEL",
    "CITRUS",
    "CHERRY",
]
|
||||
|
||||
# One-line mood/colour summary per theme preset, keyed by preset name.
# get_theme_assignment_system_prompt() renders these as "- NAME: description"
# bullets so the theme-assignment LLM can match a slide's mood to a preset.
THEME_DESCRIPTIONS: dict[str, str] = {
    "TERRA": "Warm earthy tones — terracotta, olive. Heritage, tradition, organic warmth.",
    "OCEAN": "Cool oceanic depth — teal, coral accents. Calm, marine, fluid elegance.",
    "SUNSET": "Vibrant warm energy — orange, purple. Passion, creativity, bold expression.",
    "EMERALD": "Fresh natural life — green, mint. Growth, health, sustainability.",
    "ECLIPSE": "Dramatic luxury — black, gold. Premium, power, prestige.",
    "ROSE": "Soft elegance — dusty pink, mauve. Beauty, care, refined femininity.",
    "FROST": "Crisp clarity — ice blue, silver. Tech, data, precision analytics.",
    "NEBULA": "Cosmic mystery — magenta, deep purple. AI, innovation, cutting-edge future.",
    "AURORA": "Ethereal northern lights — green-teal, violet. Mystical, transformative, wonder.",
    "CORAL": "Tropical warmth — coral, turquoise. Inviting, lively, community.",
    "MIDNIGHT": "Deep sophistication — navy, silver. Contemplative, trust, authority.",
    "AMBER": "Rich honey warmth — amber, brown. Comfort, wisdom, organic richness.",
    "LAVENDER": "Gentle dreaminess — purple, lilac. Calm, imaginative, serene.",
    "STEEL": "Industrial strength — gray, steel blue. Modern professional, reliability.",
    "CITRUS": "Bright optimism — yellow, lime. Energy, joy, fresh starts.",
    "CHERRY": "Bold impact — deep red, dark. Power, urgency, passionate conviction.",
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LLM-based theme assignment (replaces keyword-based pick_theme_and_mode)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
THEME_ASSIGNMENT_SYSTEM_PROMPT = """You are a visual design director assigning color themes to presentation slides.
|
||||
Given a list of slides, assign each slide a theme preset and color mode (dark or light).
|
||||
|
||||
Available themes (name — description):
|
||||
{theme_list}
|
||||
|
||||
Rules:
|
||||
1. Pick the theme that best matches each slide's mood, content, and visual direction.
|
||||
2. Maximize visual variety — avoid repeating the same theme on consecutive slides.
|
||||
3. Mix dark and light modes across the presentation for contrast and rhythm.
|
||||
4. Opening slides often benefit from a bold dark theme; closing/summary slides can go either way.
|
||||
5. The "background_explanation" field is the primary signal — it describes the intended mood and color direction.
|
||||
|
||||
Return ONLY a JSON array (no markdown fences, no explanation):
|
||||
[
|
||||
{{"slide_number": 1, "theme": "THEME_NAME", "mode": "dark"}},
|
||||
{{"slide_number": 2, "theme": "THEME_NAME", "mode": "light"}}
|
||||
]
|
||||
""".strip()
|
||||
|
||||
|
||||
def build_theme_assignment_user_prompt(
    slides: list[dict[str, str]],
) -> str:
    """Render the user message that asks the LLM to assign slide themes.

    Args:
        slides: One dict per slide. ``slide_number`` and ``title`` are
            required keys; ``subtitle`` falls back to an empty string and
            ``background_explanation`` (the mood) falls back to ``"neutral"``.

    Returns:
        A newline-joined prompt: an instruction line, a blank line, then one
        descriptive line per slide in the order given.

    Raises:
        KeyError: If a slide dict lacks ``slide_number`` or ``title``.
    """

    def describe(slide: dict[str, str]) -> str:
        # One prompt line summarising a single slide.
        number = slide["slide_number"]
        title = slide["title"]
        subtitle = slide.get("subtitle", "")
        mood = slide.get("background_explanation", "neutral")
        return (
            f'Slide {number}: "{title}" '
            f'(subtitle: "{subtitle}") — '
            f'Mood: "{mood}"'
        )

    preamble = ["Assign a theme and mode to each of these slides:", ""]
    return "\n".join(preamble + [describe(slide) for slide in slides])
|
||||
|
||||
|
||||
def get_theme_assignment_system_prompt() -> str:
    """Build the finished theme-assignment system prompt.

    Every entry of ``THEME_DESCRIPTIONS`` becomes a ``- NAME: description``
    bullet (in dict order); the bullets are joined with newlines and
    substituted for the ``{theme_list}`` placeholder of
    ``THEME_ASSIGNMENT_SYSTEM_PROMPT``.

    Returns:
        The system prompt text with the theme list filled in.
    """
    bullets = [
        f"- {preset}: {summary}" for preset, summary in THEME_DESCRIPTIONS.items()
    ]
    rendered_list = "\n".join(bullets)
    return THEME_ASSIGNMENT_SYSTEM_PROMPT.format(theme_list=rendered_list)
|
||||
|
||||
|
||||
def pick_theme_and_mode_fallback(
    slide_index: int, total_slides: int
) -> tuple[str, str]:
    """Pick a theme and mode deterministically when LLM assignment fails.

    Themes are taken round-robin from ``THEME_PRESETS`` by ``slide_index``.
    The mode is ``"dark"`` for even indices and ``"light"`` for odd ones,
    except that a one-slide presentation is always ``"dark"``.

    Args:
        slide_index: Position of the slide, used for both the preset lookup
            and the dark/light alternation.
        total_slides: Number of slides in the presentation.

    Returns:
        A ``(theme_name, mode)`` tuple where mode is ``"dark"`` or ``"light"``.
    """
    preset = THEME_PRESETS[slide_index % len(THEME_PRESETS)]
    if total_slides == 1 or slide_index % 2 == 0:
        return preset, "dark"
    return preset, "light"
|
||||
|
||||
|
||||
def get_slide_generation_prompt(user_prompt: str | None = None) -> str:
    """Build the system prompt that turns source content into slide JSON.

    The prompt embeds today's date, the ``MAX_SLIDES`` ceiling and, when
    ``user_prompt`` is truthy, a ``<user_instruction>`` section that the model
    is told to follow strictly.

    Args:
        user_prompt: Optional free-form style or content instruction from the
            user. ``None`` or an empty string omits the instruction section.

    Returns:
        The complete prompt text.
    """
    today = datetime.datetime.now().strftime("%Y-%m-%d")

    # Optional section; stays empty when the user gave no instruction.
    instruction_section = ""
    if user_prompt:
        instruction_section = f'''
You **MUST** strictly adhere to the following user instruction while generating the slides:
<user_instruction>
{user_prompt}
</user_instruction>
'''

    return f"""
Today's date: {today}
<video_presentation_system>
You are a content-to-slides converter. You receive raw source content (articles, notes, transcripts,
product descriptions, chat conversations, etc.) and break it into a sequence of presentation slides
for a video presentation with voiceover narration.

{instruction_section}

<input>
- '<source_content>': A block of text containing the information to be presented. This could be
research findings, an article summary, a detailed outline, user chat history, or any relevant
raw information. The content serves as the factual basis for the video presentation.
</input>

<output_format>
A JSON object containing the presentation slides:
{{
"slides": [
{{
"slide_number": 1,
"title": "Concise slide title",
"subtitle": "One-line subtitle or tagline",
"content_in_markdown": "## Heading\\n- Bullet point 1\\n- **Bold text**\\n- Bullet point 3",
"speaker_transcripts": [
"First narration sentence for this slide.",
"Second narration sentence expanding on the point.",
"Third sentence wrapping up this slide."
],
"background_explanation": "Emotional mood and color direction for this slide"
}}
]
}}
</output_format>

<guidelines>
=== SLIDE COUNT ===

Dynamically decide the number of slides between 1 and {MAX_SLIDES} (inclusive).
Base your decision entirely on the content's depth, richness, and how many distinct ideas it contains.
Thin or simple content should produce fewer slides; dense or multi-faceted content may use more.
Do NOT inflate or pad slides to reach {MAX_SLIDES} — only use what the content genuinely warrants.
Do NOT treat {MAX_SLIDES} as a target; it is a hard ceiling, not a goal.

=== SLIDE STRUCTURE ===

- Each slide should cover ONE distinct key idea or section.
- Keep slides focused: 2-5 bullet points of content per slide max.
- The first slide should be a title/intro slide.
- The last slide should be a summary or closing slide ONLY if there are 3+ slides.
For 1-2 slides, skip the closing slide — just cover the content.
- Do NOT create a separate closing slide if its content would just repeat earlier slides.

=== CONTENT FIELDS ===

- Write speaker_transcripts as if a human presenter is narrating — natural, conversational, 2-4 sentences per slide.
These will be converted to TTS audio, so write in a way that sounds great when spoken aloud.
- background_explanation should describe a visual style matching the slide's mood:
- Describe the emotional feel: "warm and organic", "dramatic and urgent", "clean and optimistic",
"technical and precise", "celebratory", "earthy and grounded", "cosmic and futuristic"
- Mention color direction: warm tones, cool tones, earth tones, neon accents, gold/black, etc.
- Vary the mood across slides — do NOT always say "dark blue gradient".
- content_in_markdown should use proper markdown: ## headings, **bold**, - bullets, etc.

=== NARRATION QUALITY ===

- Speaker transcripts should explain the slide content in an engaging, presenter-like voice.
- Keep narration concise: 2-4 sentences per slide (targeting ~10-15 seconds of audio per slide).
- The narration should add context beyond what's on the slide — don't just read the bullets.
- Use natural language: contractions, conversational tone, occasional enthusiasm.
</guidelines>

<examples>
Input: "Quantum computing uses quantum bits or qubits which can exist in multiple states simultaneously due to superposition."

Output:
{{
"slides": [
{{
"slide_number": 1,
"title": "Quantum Computing",
"subtitle": "Beyond Classical Bits",
"content_in_markdown": "## The Quantum Leap\\n- Classical computers use **bits** (0 or 1)\\n- Quantum computers use **qubits**\\n- Qubits leverage **superposition**",
"speaker_transcripts": [
"Let's explore quantum computing, a technology that's fundamentally different from the computers we use every day.",
"While traditional computers work with bits that are either zero or one, quantum computers use something called qubits.",
"The magic of qubits is superposition — they can exist in multiple states at the same time."
],
"background_explanation": "Cosmic and futuristic with deep purple and magenta tones, evoking the mystery of quantum mechanics"
}}
]
}}
</examples>

Transform the source material into well-structured presentation slides with engaging narration.
Ensure each slide has a clear visual mood and natural-sounding speaker transcripts.
</video_presentation_system>
"""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Remotion scene code generation prompt
|
||||
# Ported from RemotionTets POC /api/generate system prompt
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
REMOTION_SCENE_SYSTEM_PROMPT = """
|
||||
You are a Remotion component generator that creates cinematic, modern motion graphics.
|
||||
Generate a single self-contained React component that uses Remotion.
|
||||
|
||||
=== THEME PRESETS (pick ONE per slide — see user prompt for which to use) ===
|
||||
|
||||
Each slide MUST use a DIFFERENT preset. The user prompt will tell you which preset to use.
|
||||
Use ALL colors from that preset — background, surface, text, accent, glow. Do NOT mix presets.
|
||||
|
||||
TERRA (warm earth — terracotta + olive):
|
||||
dark: bg #1C1510 surface #261E16 border #3D3024 text #E8DDD0 muted #9A8A78 accent #C2623D secondary #7D8C52 glow rgba(194,98,61,0.12)
|
||||
light: bg #F7F0E8 surface #FFF8F0 border #DDD0BF text #2C1D0E muted #8A7A68 accent #B85430 secondary #6B7A42 glow rgba(184,84,48,0.08)
|
||||
gradient-dark: radial-gradient(ellipse at 30% 80%, rgba(194,98,61,0.18), transparent 60%), linear-gradient(180deg, #1C1510, #261E16)
|
||||
gradient-light: radial-gradient(ellipse at 70% 20%, rgba(107,122,66,0.12), transparent 55%), linear-gradient(180deg, #F7F0E8, #FFF8F0)
|
||||
|
||||
OCEAN (cool depth — teal + coral):
|
||||
dark: bg #0B1A1E surface #122428 border #1E3740 text #D5EAF0 muted #6A9AA8 accent #1DB6A8 secondary #E87461 glow rgba(29,182,168,0.12)
|
||||
light: bg #F0F8FA surface #FFFFFF border #C8E0E8 text #0E2830 muted #5A8A98 accent #0EA69A secondary #D05F4E glow rgba(14,166,154,0.08)
|
||||
gradient-dark: radial-gradient(ellipse at 80% 30%, rgba(29,182,168,0.20), transparent 55%), radial-gradient(circle at 20% 80%, rgba(232,116,97,0.10), transparent 50%), #0B1A1E
|
||||
gradient-light: radial-gradient(ellipse at 20% 40%, rgba(14,166,154,0.10), transparent 55%), linear-gradient(180deg, #F0F8FA, #FFFFFF)
|
||||
|
||||
SUNSET (warm energy — orange + purple):
|
||||
dark: bg #1E130F surface #2A1B14 border #42291C text #F0DDD0 muted #A08878 accent #E86A20 secondary #A855C0 glow rgba(232,106,32,0.12)
|
||||
light: bg #FFF5ED surface #FFFFFF border #EADAC8 text #2E1508 muted #907860 accent #D05A18 secondary #9045A8 glow rgba(208,90,24,0.08)
|
||||
gradient-dark: linear-gradient(135deg, rgba(232,106,32,0.15) 0%, transparent 40%), radial-gradient(circle at 80% 70%, rgba(168,85,192,0.15), transparent 50%), #1E130F
|
||||
gradient-light: linear-gradient(135deg, rgba(208,90,24,0.08) 0%, rgba(144,69,168,0.06) 100%), #FFF5ED
|
||||
|
||||
EMERALD (fresh life — green + mint):
|
||||
dark: bg #0B1E14 surface #12281A border #1E3C28 text #D0F0E0 muted #5EA880 accent #10B981 secondary #84CC16 glow rgba(16,185,129,0.12)
|
||||
light: bg #F0FAF5 surface #FFFFFF border #C0E8D0 text #0E2C18 muted #489068 accent #059669 secondary #65A30D glow rgba(5,150,105,0.08)
|
||||
gradient-dark: radial-gradient(ellipse at 50% 50%, rgba(16,185,129,0.18), transparent 60%), linear-gradient(180deg, #0B1E14, #12281A)
|
||||
gradient-light: radial-gradient(ellipse at 60% 30%, rgba(101,163,13,0.10), transparent 55%), linear-gradient(180deg, #F0FAF5, #FFFFFF)
|
||||
|
||||
ECLIPSE (dramatic — black + gold):
|
||||
dark: bg #100C05 surface #1A1508 border #2E2510 text #D4B96A muted #8A7840 accent #E8B830 secondary #C09020 glow rgba(232,184,48,0.14)
|
||||
light: bg #FAF6ED surface #FFFFFF border #E0D8C0 text #1A1408 muted #7A6818 accent #C09820 secondary #A08018 glow rgba(192,152,32,0.08)
|
||||
gradient-dark: radial-gradient(circle at 50% 40%, rgba(232,184,48,0.20), transparent 50%), radial-gradient(ellipse at 50% 90%, rgba(192,144,32,0.08), transparent 50%), #100C05
|
||||
gradient-light: radial-gradient(circle at 50% 40%, rgba(192,152,32,0.10), transparent 55%), linear-gradient(180deg, #FAF6ED, #FFFFFF)
|
||||
|
||||
ROSE (soft elegance — dusty pink + mauve):
|
||||
dark: bg #1E1018 surface #281820 border #3D2830 text #F0D8E0 muted #A08090 accent #E4508C secondary #B06498 glow rgba(228,80,140,0.12)
|
||||
light: bg #FDF2F5 surface #FFFFFF border #F0D0D8 text #2C1018 muted #906878 accent #D43D78 secondary #9A5080 glow rgba(212,61,120,0.08)
|
||||
gradient-dark: radial-gradient(ellipse at 70% 30%, rgba(228,80,140,0.18), transparent 55%), radial-gradient(circle at 20% 80%, rgba(176,100,152,0.10), transparent 50%), #1E1018
|
||||
gradient-light: radial-gradient(ellipse at 30% 60%, rgba(212,61,120,0.08), transparent 55%), linear-gradient(180deg, #FDF2F5, #FFFFFF)
|
||||
|
||||
FROST (crisp clarity — ice blue + silver):
|
||||
dark: bg #0A1520 surface #101D2A border #1A3040 text #D0E5F5 muted #6090B0 accent #5AB4E8 secondary #8BA8C0 glow rgba(90,180,232,0.12)
|
||||
light: bg #F0F6FC surface #FFFFFF border #C8D8E8 text #0C1820 muted #5080A0 accent #3A96D0 secondary #7090A8 glow rgba(58,150,208,0.08)
|
||||
gradient-dark: radial-gradient(ellipse at 40% 20%, rgba(90,180,232,0.16), transparent 55%), linear-gradient(180deg, #0A1520, #101D2A)
|
||||
gradient-light: radial-gradient(ellipse at 50% 50%, rgba(58,150,208,0.08), transparent 55%), linear-gradient(180deg, #F0F6FC, #FFFFFF)
|
||||
|
||||
NEBULA (cosmic — magenta + deep purple):
|
||||
dark: bg #150A1E surface #1E1028 border #351A48 text #E0D0F0 muted #8060A0 accent #C850E0 secondary #8030C0 glow rgba(200,80,224,0.14)
|
||||
light: bg #F8F0FF surface #FFFFFF border #E0C8F0 text #1A0A24 muted #7050A0 accent #A840C0 secondary #6820A0 glow rgba(168,64,192,0.08)
|
||||
gradient-dark: radial-gradient(circle at 60% 40%, rgba(200,80,224,0.18), transparent 50%), radial-gradient(ellipse at 30% 80%, rgba(128,48,192,0.12), transparent 50%), #150A1E
|
||||
gradient-light: radial-gradient(circle at 40% 30%, rgba(168,64,192,0.10), transparent 55%), linear-gradient(180deg, #F8F0FF, #FFFFFF)
|
||||
|
||||
AURORA (ethereal lights — green-teal + violet):
|
||||
dark: bg #0A1A1A surface #102020 border #1A3838 text #D0F0F0 muted #60A0A0 accent #30D0B0 secondary #8040D0 glow rgba(48,208,176,0.12)
|
||||
light: bg #F0FAF8 surface #FFFFFF border #C0E8E0 text #0A2020 muted #508080 accent #20B090 secondary #6830B0 glow rgba(32,176,144,0.08)
|
||||
gradient-dark: radial-gradient(ellipse at 30% 70%, rgba(48,208,176,0.18), transparent 55%), radial-gradient(circle at 70% 30%, rgba(128,64,208,0.12), transparent 50%), #0A1A1A
|
||||
gradient-light: radial-gradient(ellipse at 50% 40%, rgba(32,176,144,0.10), transparent 55%), linear-gradient(180deg, #F0FAF8, #FFFFFF)
|
||||
|
||||
CORAL (tropical warmth — coral + turquoise):
|
||||
dark: bg #1E0F0F surface #281818 border #402828 text #F0D8D8 muted #A07070 accent #F06050 secondary #30B8B0 glow rgba(240,96,80,0.12)
|
||||
light: bg #FFF5F3 surface #FFFFFF border #F0D0C8 text #2E1010 muted #906060 accent #E04838 secondary #20A098 glow rgba(224,72,56,0.08)
|
||||
gradient-dark: radial-gradient(ellipse at 60% 60%, rgba(240,96,80,0.18), transparent 55%), radial-gradient(circle at 30% 30%, rgba(48,184,176,0.10), transparent 50%), #1E0F0F
|
||||
gradient-light: radial-gradient(ellipse at 40% 50%, rgba(224,72,56,0.08), transparent 55%), linear-gradient(180deg, #FFF5F3, #FFFFFF)
|
||||
|
||||
MIDNIGHT (deep sophistication — navy + silver):
|
||||
dark: bg #080C18 surface #0E1420 border #1A2438 text #C8D8F0 muted #5070A0 accent #4080E0 secondary #A0B0D0 glow rgba(64,128,224,0.12)
|
||||
light: bg #F0F2F8 surface #FFFFFF border #C8D0E0 text #101828 muted #506080 accent #3060C0 secondary #8090B0 glow rgba(48,96,192,0.08)
|
||||
gradient-dark: radial-gradient(ellipse at 50% 30%, rgba(64,128,224,0.16), transparent 55%), linear-gradient(180deg, #080C18, #0E1420)
|
||||
gradient-light: radial-gradient(ellipse at 50% 50%, rgba(48,96,192,0.08), transparent 55%), linear-gradient(180deg, #F0F2F8, #FFFFFF)
|
||||
|
||||
AMBER (rich honey warmth — amber + brown):
|
||||
dark: bg #1A1208 surface #221A0E border #3A2C18 text #F0E0C0 muted #A09060 accent #E0A020 secondary #C08030 glow rgba(224,160,32,0.12)
|
||||
light: bg #FFF8E8 surface #FFFFFF border #E8D8B8 text #2A1C08 muted #907840 accent #C88810 secondary #A86820 glow rgba(200,136,16,0.08)
|
||||
gradient-dark: radial-gradient(ellipse at 40% 60%, rgba(224,160,32,0.18), transparent 55%), linear-gradient(180deg, #1A1208, #221A0E)
|
||||
gradient-light: radial-gradient(ellipse at 60% 40%, rgba(200,136,16,0.10), transparent 55%), linear-gradient(180deg, #FFF8E8, #FFFFFF)
|
||||
|
||||
LAVENDER (gentle dreaminess — purple + lilac):
|
||||
dark: bg #14101E surface #1C1628 border #302840 text #E0D8F0 muted #8070A0 accent #A060E0 secondary #C090D0 glow rgba(160,96,224,0.12)
|
||||
light: bg #F8F0FF surface #FFFFFF border #E0D0F0 text #1C1028 muted #706090 accent #8848C0 secondary #A878B8 glow rgba(136,72,192,0.08)
|
||||
gradient-dark: radial-gradient(ellipse at 60% 40%, rgba(160,96,224,0.18), transparent 55%), radial-gradient(circle at 30% 70%, rgba(192,144,208,0.10), transparent 50%), #14101E
|
||||
gradient-light: radial-gradient(ellipse at 40% 30%, rgba(136,72,192,0.10), transparent 55%), linear-gradient(180deg, #F8F0FF, #FFFFFF)
|
||||
|
||||
STEEL (industrial strength — gray + steel blue):
|
||||
dark: bg #101214 surface #181C20 border #282E38 text #D0D8E0 muted #708090 accent #5088B0 secondary #90A0B0 glow rgba(80,136,176,0.12)
|
||||
light: bg #F2F4F6 surface #FFFFFF border #D0D8E0 text #181C24 muted #607080 accent #3870A0 secondary #708898 glow rgba(56,112,160,0.08)
|
||||
gradient-dark: radial-gradient(ellipse at 50% 50%, rgba(80,136,176,0.14), transparent 55%), linear-gradient(180deg, #101214, #181C20)
|
||||
gradient-light: radial-gradient(ellipse at 50% 40%, rgba(56,112,160,0.08), transparent 55%), linear-gradient(180deg, #F2F4F6, #FFFFFF)
|
||||
|
||||
CITRUS (bright optimism — yellow + lime):
|
||||
dark: bg #181808 surface #202010 border #383818 text #F0F0C0 muted #A0A060 accent #E8D020 secondary #90D030 glow rgba(232,208,32,0.12)
|
||||
light: bg #FFFFF0 surface #FFFFFF border #E8E8C0 text #282808 muted #808040 accent #C8B010 secondary #70B020 glow rgba(200,176,16,0.08)
|
||||
gradient-dark: radial-gradient(ellipse at 40% 40%, rgba(232,208,32,0.18), transparent 55%), radial-gradient(circle at 70% 70%, rgba(144,208,48,0.10), transparent 50%), #181808
|
||||
gradient-light: radial-gradient(ellipse at 50% 30%, rgba(200,176,16,0.10), transparent 55%), linear-gradient(180deg, #FFFFF0, #FFFFFF)
|
||||
|
||||
CHERRY (bold impact — deep red + dark):
|
||||
dark: bg #1A0808 surface #241010 border #401818 text #F0D0D0 muted #A06060 accent #D02030 secondary #E05060 glow rgba(208,32,48,0.14)
|
||||
light: bg #FFF0F0 surface #FFFFFF border #F0C8C8 text #280808 muted #904848 accent #B01828 secondary #C83848 glow rgba(176,24,40,0.08)
|
||||
gradient-dark: radial-gradient(ellipse at 50% 40%, rgba(208,32,48,0.20), transparent 50%), linear-gradient(180deg, #1A0808, #241010)
|
||||
gradient-light: radial-gradient(ellipse at 50% 50%, rgba(176,24,40,0.10), transparent 55%), linear-gradient(180deg, #FFF0F0, #FFFFFF)
|
||||
|
||||
=== SHARED TOKENS (use with any theme above) ===
|
||||
|
||||
SPACING: xs 8px, sm 16px, md 24px, lg 32px, xl 48px, 2xl 64px, 3xl 96px, 4xl 128px
|
||||
TYPOGRAPHY: fontFamily "Inter, system-ui, -apple-system, sans-serif"
|
||||
caption 14px/1.4, body 18px/1.6, subhead 24px/1.4, title 40px/1.2 w600, headline 64px/1.1 w700, display 96px/1.0 w800
|
||||
letterSpacing: tight "-0.02em", normal "0", wide "0.05em"
|
||||
BORDER RADIUS: 12px (cards), 8px (buttons), 9999px (pills)
|
||||
|
||||
=== VISUAL VARIETY (CRITICAL) ===
|
||||
|
||||
The user prompt assigns each slide a specific theme preset AND mode (dark/light).
|
||||
You MUST use EXACTLY the assigned preset and mode. Additionally:
|
||||
|
||||
1. Use the preset's gradient as the AbsoluteFill background.
|
||||
2. Use the preset's accent/secondary colors for highlights, pill badges, and card accents.
|
||||
3. Use the preset's glow value for all boxShadow effects.
|
||||
4. LAYOUT VARIATION: Vary layout between slides:
|
||||
- One slide: bold centered headline + subtle stat
|
||||
- Another: two-column card layout
|
||||
- Another: single large number or quote as hero
|
||||
Do NOT use the same layout pattern for every slide.
|
||||
|
||||
=== LAYOUT RULES (CRITICAL — elements must NEVER overlap) ===
|
||||
|
||||
The canvas is 1920x1080. You MUST use a SINGLE-LAYER layout. NO stacking, NO multiple AbsoluteFill layers.
|
||||
|
||||
STRUCTURE — every component must follow this exact pattern:
|
||||
<AbsoluteFill style={{ background: "...", display: "flex", flexDirection: "column", justifyContent: "center", alignItems: "center", padding: 80 }}>
|
||||
{/* ALL content goes here as direct children in normal flow */}
|
||||
</AbsoluteFill>
|
||||
|
||||
ABSOLUTE RULES:
|
||||
- Use exactly ONE AbsoluteFill as the root. Set its background color/gradient via its style prop.
|
||||
- NEVER nest AbsoluteFill inside AbsoluteFill.
|
||||
- NEVER use position "absolute" or position "fixed" on ANY element.
|
||||
- NEVER use multiple layers or z-index.
|
||||
- ALL elements must be in normal document flow inside the single root AbsoluteFill.
|
||||
|
||||
SPACING:
|
||||
- Root padding: 80px on all sides (safe area).
|
||||
- Use flexDirection "column" with gap for vertical stacking, flexDirection "row" with gap for horizontal.
|
||||
- Minimum gap between elements: 24px vertical, 32px horizontal.
|
||||
- Text hierarchy gaps: headline→subheading 16px, subheading→body 12px, body→button 32px.
|
||||
- Cards/panels: padding 32px-48px, borderRadius 12px.
|
||||
- NEVER use margin to space siblings — always use the parent's gap property.
|
||||
|
||||
=== DESIGN STYLE ===
|
||||
|
||||
- Premium aesthetic — use the exact colors from the assigned theme preset (do NOT invent your own)
|
||||
- Background: use the preset's gradient-dark or gradient-light value directly as the AbsoluteFill's background
|
||||
- Card/surface backgrounds: use the preset's surface color
|
||||
- Text colors: use the preset's text, muted values
|
||||
- Borders: use the preset's border color
|
||||
- Glows: use the preset's glow value for all boxShadow — do NOT substitute other colors
|
||||
- Generous whitespace — less is more, let elements breathe
|
||||
- NO decorative background shapes, blurs, or overlapping ornaments
|
||||
|
||||
=== REMOTION RULES ===
|
||||
|
||||
- Export the component as: export const MyComposition = () => { ... }
|
||||
- Use useCurrentFrame() and useVideoConfig() from "remotion"
|
||||
- Do NOT use Sequence
|
||||
- Do NOT manually calculate animation timings or frame offsets
|
||||
|
||||
=== ANIMATION (use the stagger() helper for ALL element animations) ===
|
||||
|
||||
A pre-built helper function called stagger() is available globally.
|
||||
It handles enter, hold, and exit phases automatically — you MUST use it.
|
||||
|
||||
Signature:
|
||||
stagger(frame, fps, index, total) → { opacity: number, transform: string }
|
||||
|
||||
Parameters:
|
||||
frame — from useCurrentFrame()
|
||||
fps — from useVideoConfig()
|
||||
index — 0-based index of this element in the entrance order
|
||||
total — total number of animated elements in the scene
|
||||
|
||||
It returns a style object with opacity and transform that you spread onto the element.
|
||||
Timing is handled for you: staggered spring entrances, ambient hold motion, and a graceful exit.
|
||||
|
||||
Usage pattern:
|
||||
const frame = useCurrentFrame();
|
||||
const { fps } = useVideoConfig();
|
||||
|
||||
<div style={stagger(frame, fps, 0, 4)}>Headline</div>
|
||||
<div style={stagger(frame, fps, 1, 4)}>Subtitle</div>
|
||||
<div style={stagger(frame, fps, 2, 4)}>Card</div>
|
||||
<div style={stagger(frame, fps, 3, 4)}>Footer</div>
|
||||
|
||||
Rules:
|
||||
- Count ALL animated elements in your scene and pass that count as the "total" parameter.
|
||||
- Assign each element a sequential index starting from 0.
|
||||
- You can merge stagger's return with additional styles:
|
||||
<div style={{ ...stagger(frame, fps, 0, 3), fontSize: 64, color: "#fafafa" }}>
|
||||
- For non-animated static elements (backgrounds, borders), just use normal styles without stagger.
|
||||
- You may still use spring() and interpolate() for EXTRA custom effects (e.g., a number counter,
|
||||
color shift, or typewriter effect), but stagger() must drive all entrance/exit animations.
|
||||
|
||||
=== AVAILABLE GLOBALS (injected at runtime, do NOT import anything else) ===
|
||||
|
||||
- React (available globally)
|
||||
- AbsoluteFill, useCurrentFrame, useVideoConfig, spring, interpolate, Easing from "remotion"
|
||||
- stagger(frame, fps, index, total) — animation helper described above
|
||||
|
||||
=== CODE RULES ===
|
||||
|
||||
- Output ONLY the raw code, no markdown fences, no explanations
|
||||
- Keep it fully self-contained, no external dependencies or images
|
||||
- Use inline styles only (no CSS imports, no className)
|
||||
- Target 1920x1080 resolution
|
||||
- Every container must use display "flex" with explicit gap values
|
||||
- NEVER use marginTop/marginBottom to space siblings — use the parent's gap instead
|
||||
""".strip()
|
||||
|
||||
|
||||
def build_scene_generation_user_prompt(
    slide_number: int,
    total_slides: int,
    title: str,
    subtitle: str,
    content_in_markdown: str,
    background_explanation: str,
    duration_in_frames: int,
    theme: str,
    mode: str,
) -> str:
    """Compose the user message for generating one slide's Remotion scene.

    ``theme`` and ``mode`` must already be decided (by the LLM or the
    fallback) before this is called.

    Args:
        slide_number: Number of this slide within the presentation.
        total_slides: Total number of slides in the presentation.
        title: Slide title.
        subtitle: Slide subtitle or tagline.
        content_in_markdown: Slide body, offered to the model as inspiration.
        background_explanation: Mood and tone description for the slide.
        duration_in_frames: Scene length in frames; also shown in seconds
            using the module-level ``FPS``.
        theme: Assigned theme preset name.
        mode: Assigned colour mode; used verbatim in ``gradient-{mode}`` and
            upper-cased in the heading line.

    Returns:
        The prompt text, one instruction per line.
    """
    seconds = duration_in_frames / FPS

    framing = (
        "Create a cinematic, visually striking Remotion scene.",
        f"The video is {duration_in_frames} frames at {FPS}fps ({seconds:.1f}s total).",
        "",
        f"This is slide {slide_number} of {total_slides} in the video.",
        "",
    )
    theme_rules = (
        f"=== ASSIGNED THEME: {theme} / {mode.upper()} mode ===",
        f"You MUST use the {theme} preset in {mode} mode from the theme presets above.",
        f"Use its exact background gradient (gradient-{mode}), surface, text, accent, secondary, border, and glow colors.",
        "Do NOT substitute, invent, or default to blue/violet colors.",
        "",
    )
    content_brief = (
        f'The scene should communicate this message: "{title} — {subtitle}"',
        "",
        "Key ideas to convey (use as creative inspiration, NOT literal text to dump on screen):",
        content_in_markdown,
        "",
        "Pick only the 1-2 most impactful phrases or numbers to display as text.",
        "",
        f"Mood & tone: {background_explanation}",
    )
    return "\n".join((*framing, *theme_rules, *content_brief))
|
||||
|
||||
|
||||
REFINE_SCENE_SYSTEM_PROMPT = """
|
||||
You are a code repair assistant. You will receive a Remotion React component that failed to compile,
|
||||
along with the exact error message from the Babel transpiler.
|
||||
|
||||
Your job is to fix the code so it compiles and runs correctly.
|
||||
|
||||
RULES:
|
||||
- Output ONLY the fixed raw code as a string — no markdown fences, no explanations.
|
||||
- Preserve the original intent, design, and animations as closely as possible.
|
||||
- The component must be exported as: export const MyComposition = () => { ... }
|
||||
- Only these globals are available at runtime (they are injected, not actually imported):
|
||||
React, AbsoluteFill, useCurrentFrame, useVideoConfig, spring, interpolate, Easing,
|
||||
stagger (a helper: stagger(frame, fps, index, total) → { opacity, transform })
|
||||
- Keep import statements at the top (they get stripped by the compiler) but do NOT import anything
|
||||
other than "react" and "remotion".
|
||||
- Use inline styles only (no CSS, no className).
|
||||
- Common fixes:
|
||||
- Mismatched braces/brackets in JSX style objects (e.g. }}, instead of }}>)
|
||||
- Missing closing tags
|
||||
- Trailing commas before > in JSX
|
||||
- Undefined variables or typos
|
||||
- Invalid JSX expressions
|
||||
- After fixing, mentally walk through every brace pair { } and JSX tag to verify they match.
|
||||
""".strip()
|
||||
72
surfsense_backend/app/agents/video_presentation/state.py
Normal file
72
surfsense_backend/app/agents/video_presentation/state.py
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
"""Define the state structures for the video presentation agent."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
|
||||
class SlideContent(BaseModel):
    """Represents a single parsed slide from content analysis.

    Every field is declared with ``...`` and is therefore required when the
    model is validated.
    """

    # Position of the slide in the deck; numbering starts at 1.
    slide_number: int = Field(..., description="1-based slide number")
    # Short heading for the slide.
    title: str = Field(..., description="Concise slide title")
    # Single-line tagline displayed with the title.
    subtitle: str = Field(..., description="One-line subtitle or tagline")
    # Body of the slide, expressed as markdown text.
    content_in_markdown: str = Field(
        ..., description="Slide body content formatted as markdown"
    )
    # Narration sentences for this slide, one string per sentence.
    speaker_transcripts: list[str] = Field(
        ...,
        description="2-4 short sentences a presenter would say while this slide is shown",
    )
    # Free-text description of the mood / colour direction for the slide.
    background_explanation: str = Field(
        ...,
        description="Emotional mood and color direction for this slide",
    )
|
||||
|
||||
|
||||
class PresentationSlides(BaseModel):
    """Represents the full set of parsed slides from the LLM.

    Wrapper model whose single required field holds the ordered slide list.
    """

    # Slides in presentation order; each item is a SlideContent.
    slides: list[SlideContent] = Field(
        ..., description="Ordered array of presentation slides"
    )
|
||||
|
||||
|
||||
class SlideAudioResult(BaseModel):
    """Audio generation result for a single slide.

    All four fields are required.
    """

    # Number of the slide this audio belongs to.
    slide_number: int
    # Location of the generated audio file for the slide.
    audio_file: str = Field(..., description="Path to the per-slide audio file")
    # Length of the audio clip, in seconds.
    duration_seconds: float = Field(..., description="Audio duration in seconds")
    # Same duration expressed as a frame count (the description states 30fps).
    duration_in_frames: int = Field(
        ..., description="Audio duration in frames (at 30fps)"
    )
|
||||
|
||||
|
||||
class SlideSceneCode(BaseModel):
    """Generated Remotion component code for a single slide.

    All three fields are required.
    """

    # Number of the slide this component renders.
    slide_number: int
    # Source text of the Remotion React component, stored as a plain string.
    code: str = Field(
        ..., description="Raw Remotion React component source code for this slide"
    )
    # Short label for the composition.
    title: str = Field(..., description="Short title for the composition")
|
||||
|
||||
|
||||
@dataclass
class State:
    """State for the video presentation agent graph.

    Pipeline: parse slides → generate per-slide TTS audio → generate per-slide Remotion code
    The frontend receives the slides + code + audio and handles compilation/rendering.

    ``db_session`` and ``source_content`` must be supplied when the state is
    created; the three result fields default to ``None``.
    """

    # Async SQLAlchemy session carried through the graph run.
    db_session: AsyncSession
    # Raw text the presentation is generated from.
    source_content: str

    # Parsed slides; None until populated.
    slides: list[SlideContent] | None = None
    # Per-slide audio results; None until populated.
    slide_audio_results: list[SlideAudioResult] | None = None
    # Per-slide Remotion component code; None until populated.
    slide_scene_codes: list[SlideSceneCode] | None = None
|
||||
30
surfsense_backend/app/agents/video_presentation/utils.py
Normal file
30
surfsense_backend/app/agents/video_presentation/utils.py
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
def get_voice_for_provider(provider: str, speaker_id: int = 0) -> dict | str:
|
||||
"""
|
||||
Get the appropriate voice configuration based on the TTS provider.
|
||||
|
||||
Currently single-speaker only (speaker_id=0). Multi-speaker support
|
||||
will be added in a future iteration.
|
||||
|
||||
Args:
|
||||
provider: The TTS provider (e.g., "openai/tts-1", "vertex_ai/test")
|
||||
speaker_id: The ID of the speaker (default 0, single speaker for now)
|
||||
|
||||
Returns:
|
||||
Voice configuration - string for OpenAI, dict for Vertex AI
|
||||
"""
|
||||
if provider == "local/kokoro":
|
||||
return "af_heart"
|
||||
|
||||
provider_type = (
|
||||
provider.split("/")[0].lower() if "/" in provider else provider.lower()
|
||||
)
|
||||
|
||||
voices = {
|
||||
"openai": "alloy",
|
||||
"vertex_ai": {
|
||||
"languageCode": "en-US",
|
||||
"name": "en-US-Studio-O",
|
||||
},
|
||||
"azure": "alloy",
|
||||
}
|
||||
return voices.get(provider_type, {})
|
||||
|
|
@ -341,7 +341,7 @@ if config.NEXT_FRONTEND_URL:
|
|||
allowed_origins.append(www_url)
|
||||
|
||||
allowed_origins.extend(
|
||||
[ # For local development and desktop app
|
||||
[ # For local development and desktop app
|
||||
"http://localhost:3000",
|
||||
"http://127.0.0.1:3000",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -77,6 +77,7 @@ celery_app = Celery(
|
|||
include=[
|
||||
"app.tasks.celery_tasks.document_tasks",
|
||||
"app.tasks.celery_tasks.podcast_tasks",
|
||||
"app.tasks.celery_tasks.video_presentation_tasks",
|
||||
"app.tasks.celery_tasks.connector_tasks",
|
||||
"app.tasks.celery_tasks.schedule_checker_task",
|
||||
"app.tasks.celery_tasks.document_reindex_tasks",
|
||||
|
|
|
|||
|
|
@ -103,6 +103,13 @@ class PodcastStatus(StrEnum):
|
|||
FAILED = "failed"
|
||||
|
||||
|
||||
class VideoPresentationStatus(StrEnum):
    """Status values for a video presentation record.

    Being a ``StrEnum``, each member compares equal to its lowercase string
    value.
    """

    PENDING = "pending"
    GENERATING = "generating"
    READY = "ready"
    FAILED = "failed"
|
||||
|
||||
|
||||
class DocumentStatus:
|
||||
"""
|
||||
Helper class for document processing status (stored as JSONB).
|
||||
|
|
@ -337,6 +344,12 @@ class Permission(StrEnum):
|
|||
PODCASTS_UPDATE = "podcasts:update"
|
||||
PODCASTS_DELETE = "podcasts:delete"
|
||||
|
||||
# Video Presentations
|
||||
VIDEO_PRESENTATIONS_CREATE = "video_presentations:create"
|
||||
VIDEO_PRESENTATIONS_READ = "video_presentations:read"
|
||||
VIDEO_PRESENTATIONS_UPDATE = "video_presentations:update"
|
||||
VIDEO_PRESENTATIONS_DELETE = "video_presentations:delete"
|
||||
|
||||
# Image Generations
|
||||
IMAGE_GENERATIONS_CREATE = "image_generations:create"
|
||||
IMAGE_GENERATIONS_READ = "image_generations:read"
|
||||
|
|
@ -403,6 +416,10 @@ DEFAULT_ROLE_PERMISSIONS = {
|
|||
Permission.PODCASTS_CREATE.value,
|
||||
Permission.PODCASTS_READ.value,
|
||||
Permission.PODCASTS_UPDATE.value,
|
||||
# Video Presentations (no delete)
|
||||
Permission.VIDEO_PRESENTATIONS_CREATE.value,
|
||||
Permission.VIDEO_PRESENTATIONS_READ.value,
|
||||
Permission.VIDEO_PRESENTATIONS_UPDATE.value,
|
||||
# Image Generations (create and read, no delete)
|
||||
Permission.IMAGE_GENERATIONS_CREATE.value,
|
||||
Permission.IMAGE_GENERATIONS_READ.value,
|
||||
|
|
@ -435,6 +452,8 @@ DEFAULT_ROLE_PERMISSIONS = {
|
|||
Permission.LLM_CONFIGS_READ.value,
|
||||
# Podcasts (read only)
|
||||
Permission.PODCASTS_READ.value,
|
||||
# Video Presentations (read only)
|
||||
Permission.VIDEO_PRESENTATIONS_READ.value,
|
||||
# Image Generations (read only)
|
||||
Permission.IMAGE_GENERATIONS_READ.value,
|
||||
# Connectors (read only)
|
||||
|
|
@ -1044,6 +1063,46 @@ class Podcast(BaseModel, TimestampMixin):
|
|||
thread = relationship("NewChatThread")
|
||||
|
||||
|
||||
class VideoPresentation(BaseModel, TimestampMixin):
    """Video presentation model for storing AI-generated video presentations.

    The slides JSONB stores per-slide data including Remotion component code,
    audio file paths, and durations. The frontend compiles the code and renders
    the video using Remotion Player.
    """

    __tablename__ = "video_presentations"

    # Required title, capped at 500 characters.
    title = Column(String(500), nullable=False)
    # Per-slide data stored as JSONB; may be NULL.
    slides = Column(JSONB, nullable=True)
    # Per-slide scene code stored as JSONB; may be NULL.
    scene_codes = Column(JSONB, nullable=True)
    # Indexed, non-null status column backed by the "video_presentation_status"
    # database enum. values_callable makes the enum *values* ("pending", ...)
    # the persisted strings. create_type=False means this column definition
    # does not create the enum type itself — presumably a migration does;
    # verify. Both the Python-side and server-side defaults are "ready".
    status = Column(
        SQLAlchemyEnum(
            VideoPresentationStatus,
            name="video_presentation_status",
            create_type=False,
            values_callable=lambda x: [e.value for e in x],
        ),
        nullable=False,
        default=VideoPresentationStatus.READY,
        server_default="ready",
        index=True,
    )

    # Owning search space; rows are removed when the search space is deleted.
    search_space_id = Column(
        Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False
    )
    search_space = relationship("SearchSpace", back_populates="video_presentations")

    # Optional link to a chat thread; set to NULL if that thread is deleted.
    thread_id = Column(
        Integer,
        ForeignKey("new_chat_threads.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )
    thread = relationship("NewChatThread")
|
||||
|
||||
|
||||
class Report(BaseModel, TimestampMixin):
|
||||
"""Report model for storing generated Markdown reports."""
|
||||
|
||||
|
|
@ -1228,6 +1287,12 @@ class SearchSpace(BaseModel, TimestampMixin):
|
|||
order_by="Podcast.id.desc()",
|
||||
cascade="all, delete-orphan",
|
||||
)
|
||||
video_presentations = relationship(
|
||||
"VideoPresentation",
|
||||
back_populates="search_space",
|
||||
order_by="VideoPresentation.id.desc()",
|
||||
cascade="all, delete-orphan",
|
||||
)
|
||||
reports = relationship(
|
||||
"Report",
|
||||
back_populates="search_space",
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ from .search_spaces_routes import router as search_spaces_router
|
|||
from .slack_add_connector_route import router as slack_add_connector_router
|
||||
from .surfsense_docs_routes import router as surfsense_docs_router
|
||||
from .teams_add_connector_route import router as teams_add_connector_router
|
||||
from .video_presentations_routes import router as video_presentations_router
|
||||
from .youtube_routes import router as youtube_router
|
||||
|
||||
router = APIRouter()
|
||||
|
|
@ -55,6 +56,9 @@ router.include_router(new_chat_router) # Chat with assistant-ui persistence
|
|||
router.include_router(sandbox_router) # Sandbox file downloads (Daytona)
|
||||
router.include_router(chat_comments_router)
|
||||
router.include_router(podcasts_router) # Podcast task status and audio
|
||||
router.include_router(
|
||||
video_presentations_router
|
||||
) # Video presentation status and streaming
|
||||
router.include_router(reports_router) # Report CRUD and multi-format export
|
||||
router.include_router(image_generation_router) # Image generation via litellm
|
||||
router.include_router(search_source_connectors_router)
|
||||
|
|
|
|||
242
surfsense_backend/app/routes/video_presentations_routes.py
Normal file
242
surfsense_backend/app/routes/video_presentations_routes.py
Normal file
|
|
@ -0,0 +1,242 @@
|
|||
"""
|
||||
Video presentation routes for CRUD operations and per-slide audio streaming.
|
||||
|
||||
These routes support the video presentation generation feature in new-chat.
|
||||
Frontend polls GET /video-presentations/{id} to check status field.
|
||||
When ready, the slides JSONB contains per-slide Remotion code and audio file paths.
|
||||
The frontend compiles the Remotion code via Babel and renders with Remotion Player.
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from fastapi.responses import StreamingResponse
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db import (
|
||||
Permission,
|
||||
SearchSpace,
|
||||
SearchSpaceMembership,
|
||||
User,
|
||||
VideoPresentation,
|
||||
get_async_session,
|
||||
)
|
||||
from app.schemas import VideoPresentationRead
|
||||
from app.users import current_active_user
|
||||
from app.utils.rbac import check_permission
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/video-presentations", response_model=list[VideoPresentationRead])
async def read_video_presentations(
    skip: int = 0,
    limit: int = 100,
    search_space_id: int | None = None,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    List video presentations the user has access to.
    Requires VIDEO_PRESENTATIONS_READ permission for the search space(s).

    Args:
        skip: Number of rows to skip; must be >= 0.
        limit: Maximum number of rows to return; must be >= 1.
        search_space_id: When given, restrict the listing to this search space
            (after a permission check). When omitted, list presentations from
            every search space the user has a membership in.

    Returns:
        A list of VideoPresentationRead objects, newest (highest id) first.

    Raises:
        HTTPException: 400 for invalid pagination values, 500 on a database
            error, or whatever check_permission raises.
    """
    if skip < 0 or limit < 1:
        raise HTTPException(status_code=400, detail="Invalid pagination parameters")
    try:
        if search_space_id is not None:
            await check_permission(
                session,
                user,
                search_space_id,
                Permission.VIDEO_PRESENTATIONS_READ.value,
                "You don't have permission to read video presentations in this search space",
            )
            query = select(VideoPresentation).filter(
                VideoPresentation.search_space_id == search_space_id
            )
        else:
            # NOTE(review): this branch filters by membership only; it does not
            # call check_permission for VIDEO_PRESENTATIONS_READ — confirm that
            # is intended.
            query = (
                select(VideoPresentation)
                .join(SearchSpace)
                .join(SearchSpaceMembership)
                .filter(SearchSpaceMembership.user_id == user.id)
            )

        # OFFSET/LIMIT without ORDER BY gives no guarantee about which rows land
        # on which page, so pin an explicit order. Descending id matches the
        # ordering used by the SearchSpace.video_presentations relationship.
        result = await session.execute(
            query.order_by(VideoPresentation.id.desc()).offset(skip).limit(limit)
        )
        return [
            VideoPresentationRead.from_orm_with_slides(vp)
            for vp in result.scalars().all()
        ]
    except HTTPException:
        raise
    except SQLAlchemyError:
        raise HTTPException(
            status_code=500,
            detail="Database error occurred while fetching video presentations",
        ) from None
|
||||
|
||||
|
||||
@router.get(
    "/video-presentations/{video_presentation_id}",
    response_model=VideoPresentationRead,
)
async def read_video_presentation(
    video_presentation_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Fetch one video presentation by its ID.
    The caller must be authenticated and hold VIDEO_PRESENTATIONS_READ on the
    presentation's search space.

    When status is "ready", the response includes:
    - slides: parsed slide data with per-slide audio_url and durations
    - scene_codes: Remotion component source code per slide

    Raises:
        HTTPException: 404 when no such presentation exists, 500 on a database
            error, or whatever check_permission raises.
    """
    try:
        lookup = select(VideoPresentation).filter(
            VideoPresentation.id == video_presentation_id
        )
        presentation = (await session.execute(lookup)).scalars().first()

        if not presentation:
            raise HTTPException(status_code=404, detail="Video presentation not found")

        # The permission check needs the row first: the search space is read
        # from the presentation itself.
        await check_permission(
            session,
            user,
            presentation.search_space_id,
            Permission.VIDEO_PRESENTATIONS_READ.value,
            "You don't have permission to read video presentations in this search space",
        )

        return VideoPresentationRead.from_orm_with_slides(presentation)
    except HTTPException:
        # Pass 404 / permission errors through untouched.
        raise
    except SQLAlchemyError:
        raise HTTPException(
            status_code=500,
            detail="Database error occurred while fetching video presentation",
        ) from None
|
||||
|
||||
|
||||
@router.delete("/video-presentations/{video_presentation_id}", response_model=dict)
async def delete_video_presentation(
    video_presentation_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Remove a video presentation row.
    The caller must hold VIDEO_PRESENTATIONS_DELETE on the presentation's
    search space.

    Returns:
        A dict with a confirmation message.

    Raises:
        HTTPException: 404 when no such presentation exists, 500 on a database
            error (after rolling the session back), or whatever
            check_permission raises.
    """
    try:
        lookup = select(VideoPresentation).filter(
            VideoPresentation.id == video_presentation_id
        )
        target = (await session.execute(lookup)).scalars().first()

        if not target:
            raise HTTPException(status_code=404, detail="Video presentation not found")

        # Permission is evaluated against the search space that owns the row.
        await check_permission(
            session,
            user,
            target.search_space_id,
            Permission.VIDEO_PRESENTATIONS_DELETE.value,
            "You don't have permission to delete video presentations in this search space",
        )

        await session.delete(target)
        await session.commit()
        return {"message": "Video presentation deleted successfully"}
    except HTTPException:
        # Pass 404 / permission errors through untouched.
        raise
    except SQLAlchemyError:
        await session.rollback()
        raise HTTPException(
            status_code=500,
            detail="Database error occurred while deleting video presentation",
        ) from None
|
||||
|
||||
|
||||
@router.get("/video-presentations/{video_presentation_id}/slides/{slide_number}/audio")
async def stream_slide_audio(
    video_presentation_id: int,
    slide_number: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Stream the audio file for a specific slide in a video presentation.
    The slide_number is 1-based. Audio path is read from the slides JSONB.

    Returns:
        A StreamingResponse with media type "audio/wav" for ".wav" files and
        "audio/mpeg" for everything else.

    Raises:
        HTTPException: 404 when the presentation, the slide, or the audio file
            on disk is missing; 500 for any other error; or whatever
            check_permission raises.

    NOTE(review): the response advertises ``Accept-Ranges: bytes`` but this
    handler does not read a Range header — confirm whether clients rely on
    range requests.
    """
    try:
        result = await session.execute(
            select(VideoPresentation).filter(
                VideoPresentation.id == video_presentation_id
            )
        )
        video_pres = result.scalars().first()

        if not video_pres:
            raise HTTPException(status_code=404, detail="Video presentation not found")

        await check_permission(
            session,
            user,
            video_pres.search_space_id,
            Permission.VIDEO_PRESENTATIONS_READ.value,
            "You don't have permission to access video presentations in this search space",
        )

        # First slide entry whose stored number matches the requested one.
        slides = video_pres.slides or []
        slide_data = next(
            (s for s in slides if s.get("slide_number") == slide_number),
            None,
        )

        if not slide_data:
            raise HTTPException(
                status_code=404,
                detail=f"Slide {slide_number} not found",
            )

        file_path = slide_data.get("audio_file")
        if not file_path or not os.path.isfile(file_path):
            raise HTTPException(status_code=404, detail="Slide audio file not found")

        ext = Path(file_path).suffix.lower()
        media_type = "audio/wav" if ext == ".wav" else "audio/mpeg"

        def iterfile(chunk_size: int = 64 * 1024):
            # Audio is binary: iterating the file object would split on b"\n"
            # and yield chunks of arbitrary size (a few bytes up to the whole
            # file). Fixed-size reads keep memory use and chunk count bounded.
            with open(file_path, mode="rb") as file_like:
                while chunk := file_like.read(chunk_size):
                    yield chunk

        return StreamingResponse(
            iterfile(),
            media_type=media_type,
            headers={
                "Accept-Ranges": "bytes",
                "Content-Disposition": f"inline; filename={Path(file_path).name}",
            },
        )

    except HTTPException:
        # Pass 404 / permission errors through untouched.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Error streaming slide audio: {e!s}",
        ) from e
|
||||
|
|
@ -101,6 +101,12 @@ from .search_space import (
|
|||
SearchSpaceWithStats,
|
||||
)
|
||||
from .users import UserCreate, UserRead, UserUpdate
|
||||
from .video_presentations import (
|
||||
VideoPresentationBase,
|
||||
VideoPresentationCreate,
|
||||
VideoPresentationRead,
|
||||
VideoPresentationUpdate,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Chat schemas (assistant-ui integration)
|
||||
|
|
@ -220,4 +226,9 @@ __all__ = [
|
|||
"UserRead",
|
||||
"UserSearchSpaceAccess",
|
||||
"UserUpdate",
|
||||
# Video Presentation schemas
|
||||
"VideoPresentationBase",
|
||||
"VideoPresentationCreate",
|
||||
"VideoPresentationRead",
|
||||
"VideoPresentationUpdate",
|
||||
]
|
||||
|
|
|
|||
103
surfsense_backend/app/schemas/video_presentations.py
Normal file
103
surfsense_backend/app/schemas/video_presentations.py
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
"""Video presentation schemas for API responses."""
|
||||
|
||||
from datetime import datetime
|
||||
from enum import StrEnum
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class VideoPresentationStatusEnum(StrEnum):
    """Status values exposed in video presentation API responses.

    Being a ``StrEnum``, each member compares equal to its lowercase string
    value.
    """

    PENDING = "pending"
    GENERATING = "generating"
    READY = "ready"
    FAILED = "failed"
|
||||
|
||||
|
||||
class VideoPresentationBase(BaseModel):
    """Base video presentation schema.

    Holds the fields shared by the create and read schemas.
    """

    # Required presentation title.
    title: str
    # Optional list of per-slide dicts; defaults to None.
    slides: list[dict[str, Any]] | None = None
    # Optional list of per-slide scene-code dicts; defaults to None.
    scene_codes: list[dict[str, Any]] | None = None
    # Required ID of the search space the presentation belongs to.
    search_space_id: int
|
||||
|
||||
|
||||
class VideoPresentationCreate(VideoPresentationBase):
    """Schema for creating a video presentation.

    Adds no fields of its own; it inherits everything from
    VideoPresentationBase.
    """

    pass
|
||||
|
||||
|
||||
class VideoPresentationUpdate(BaseModel):
    """Schema for updating a video presentation.

    Every field is optional and defaults to None.
    """

    # New title, if supplied.
    title: str | None = None
    # Replacement per-slide dicts, if supplied.
    slides: list[dict[str, Any]] | None = None
    # Replacement per-slide scene-code dicts, if supplied.
    scene_codes: list[dict[str, Any]] | None = None
|
||||
|
||||
|
||||
class VideoPresentationRead(VideoPresentationBase):
    """Response schema returned when a video presentation is read."""

    id: int
    status: VideoPresentationStatusEnum = VideoPresentationStatusEnum.READY
    created_at: datetime
    # Number of stored slides; None when the record has no slides.
    slide_count: int | None = None

    class Config:
        from_attributes = True

    @classmethod
    def from_orm_with_slides(cls, obj):
        """Build a VideoPresentationRead from a record, filling in slide_count.

        Server-side `audio_file` paths in the slides are swapped for API
        streaming URLs so the frontend can hand them straight to Remotion
        <Audio />. When the record has no slides (None or empty), the slides
        value is passed through unchanged and slide_count is None.
        """
        stored_slides = obj.slides
        if stored_slides:
            public_slides = _replace_audio_paths_with_urls(obj.id, stored_slides)
            total = len(stored_slides)
        else:
            public_slides = stored_slides
            total = None

        return cls(
            id=obj.id,
            title=obj.title,
            slides=public_slides,
            scene_codes=obj.scene_codes,
            search_space_id=obj.search_space_id,
            status=obj.status,
            created_at=obj.created_at,
            slide_count=total,
        )
|
||||
|
||||
|
||||
def _replace_audio_paths_with_urls(
|
||||
video_presentation_id: int,
|
||||
slides: list[dict[str, Any]],
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Replace server-local audio_file paths with streaming API URLs.
|
||||
|
||||
Transforms:
|
||||
"audio_file": "video_presentation_audio/abc_slide_1.mp3"
|
||||
Into:
|
||||
"audio_url": "/api/v1/video-presentations/42/slides/1/audio"
|
||||
|
||||
The frontend passes this URL to Remotion's <Audio src={slide.audio_url} />.
|
||||
"""
|
||||
result = []
|
||||
for slide in slides:
|
||||
slide_copy = dict(slide)
|
||||
slide_number = slide_copy.get("slide_number")
|
||||
audio_file = slide_copy.pop("audio_file", None)
|
||||
|
||||
if audio_file and slide_number is not None:
|
||||
slide_copy["audio_url"] = (
|
||||
f"/api/v1/video-presentations/{video_presentation_id}"
|
||||
f"/slides/{slide_number}/audio"
|
||||
)
|
||||
else:
|
||||
slide_copy["audio_url"] = None
|
||||
|
||||
result.append(slide_copy)
|
||||
return result
|
||||
|
|
@ -0,0 +1,178 @@
|
|||
"""Celery tasks for video presentation generation."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import sys
|
||||
|
||||
from sqlalchemy import select
|
||||
|
||||
from app.agents.video_presentation.graph import graph as video_presentation_graph
|
||||
from app.agents.video_presentation.state import State as VideoPresentationState
|
||||
from app.celery_app import celery_app
|
||||
from app.config import config
|
||||
from app.db import VideoPresentation, VideoPresentationStatus
|
||||
from app.tasks.celery_tasks import get_celery_session_maker
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# On Windows, switch asyncio to the proactor event loop policy when this module
# is imported. The warning text below indicates the policy is wanted for async
# subprocess support.
if sys.platform.startswith("win"):
    try:
        asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
    except AttributeError:
        # The policy class is not exposed by this asyncio build; carry on with
        # the default policy and leave a warning in the log.
        logger.warning(
            "WindowsProactorEventLoopPolicy is unavailable; async subprocess support may fail."
        )
|
||||
|
||||
|
||||
def _clear_generating_video_presentation(search_space_id: int) -> None:
    """Delete the Redis "generating" marker for a search space.

    Best-effort: any failure while talking to Redis is logged as a warning and
    swallowed so it cannot fail the calling task.
    """
    # Imported lazily, inside the function, exactly as the task module expects.
    import redis

    try:
        redis.from_url(config.REDIS_APP_URL, decode_responses=True).delete(
            f"video_presentation:generating:{search_space_id}"
        )
        logger.info(
            f"Cleared generating video presentation key for search_space_id={search_space_id}"
        )
    except Exception as exc:
        logger.warning(f"Could not clear generating video presentation key: {exc}")
|
||||
|
||||
|
||||
@celery_app.task(name="generate_video_presentation", bind=True)
def generate_video_presentation_task(
    self,
    video_presentation_id: int,
    source_content: str,
    search_space_id: int,
    user_prompt: str | None = None,
) -> dict:
    """
    Celery task to generate video presentation from source content.
    Updates existing video presentation record created by the tool.

    Runs the async generation coroutine on a dedicated event loop that is
    created for this call and closed before returning.

    Args:
        video_presentation_id: ID of the existing VideoPresentation row.
        source_content: Text to turn into a presentation.
        search_space_id: Search space the presentation belongs to; also used
            to clear the Redis "generating" marker.
        user_prompt: Optional extra instructions forwarded to the graph.

    Returns:
        The result dict of the generation coroutine on success, or
        ``{"status": "failed", "video_presentation_id": ...}`` on any error.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    try:
        return loop.run_until_complete(
            _generate_video_presentation(
                video_presentation_id,
                source_content,
                search_space_id,
                user_prompt,
            )
        )
    except Exception as e:
        # logger.exception keeps the traceback, which the plain error log lost.
        logger.exception(f"Error generating video presentation: {e!s}")
        loop.run_until_complete(_mark_video_presentation_failed(video_presentation_id))
        return {"status": "failed", "video_presentation_id": video_presentation_id}
    finally:
        _clear_generating_video_presentation(search_space_id)
        try:
            # Finalize async generators on both the success and failure paths
            # before the loop is closed.
            loop.run_until_complete(loop.shutdown_asyncgens())
        except Exception as e:
            # Cleanup problems must not replace the task's result.
            logger.warning(f"Error shutting down async generators: {e!s}")
        finally:
            asyncio.set_event_loop(None)
            loop.close()
|
||||
|
||||
|
||||
async def _mark_video_presentation_failed(video_presentation_id: int) -> None:
    """Set a video presentation's status to FAILED in the database.

    Opens its own session. Does nothing when the row does not exist. Errors
    raised inside the session block are logged and swallowed.
    """
    session_factory = get_celery_session_maker()
    async with session_factory() as session:
        try:
            lookup = select(VideoPresentation).filter(
                VideoPresentation.id == video_presentation_id
            )
            record = (await session.execute(lookup)).scalars().first()
            if not record:
                return
            record.status = VideoPresentationStatus.FAILED
            await session.commit()
        except Exception as exc:
            logger.error(f"Failed to mark video presentation as failed: {exc}")
|
||||
|
||||
|
||||
async def _generate_video_presentation(
    video_presentation_id: int,
    source_content: str,
    search_space_id: int,
    user_prompt: str | None = None,
) -> dict:
    """Run the presentation graph and store its output on the existing record.

    The row is flipped to GENERATING and committed before the graph runs. On
    success the merged slide data and scene codes are saved and the row becomes
    READY. On any error inside that phase the row is marked FAILED, committed,
    and the exception is re-raised.

    Returns:
        A dict with "status", "video_presentation_id", "title" and
        "slide_count".

    Raises:
        ValueError: when no row with ``video_presentation_id`` exists.
    """

    def _plain(item):
        # Pydantic models expose model_dump(); anything else is copied as a dict.
        return item.model_dump() if hasattr(item, "model_dump") else dict(item)

    async with get_celery_session_maker()() as session:
        lookup = select(VideoPresentation).filter(
            VideoPresentation.id == video_presentation_id
        )
        video_pres = (await session.execute(lookup)).scalars().first()

        if not video_pres:
            raise ValueError(f"VideoPresentation {video_presentation_id} not found")

        try:
            video_pres.status = VideoPresentationStatus.GENERATING
            await session.commit()

            run_config = {
                "configurable": {
                    "video_title": video_pres.title,
                    "search_space_id": search_space_id,
                    "user_prompt": user_prompt,
                }
            }
            start_state = VideoPresentationState(
                source_content=source_content,
                db_session=session,
            )

            graph_result = await video_presentation_graph.ainvoke(
                start_state, config=run_config
            )

            # Raw graph outputs: parsed slides, per-slide audio, per-slide code.
            slides_raw = graph_result.get("slides", [])
            audio_results_raw = graph_result.get("slide_audio_results", [])
            scene_codes_raw = graph_result.get("slide_scene_codes", [])

            # Index audio results by slide number (a later entry wins on ties).
            audio_entries = (
                entry.model_dump() if hasattr(entry, "model_dump") else entry
                for entry in audio_results_raw
            )
            audio_by_number = {
                entry.get("slide_number", 0): entry for entry in audio_entries
            }

            # Merge each slide with its audio info; missing audio yields None.
            serializable_slides = []
            for slide in slides_raw:
                payload = _plain(slide)
                audio = audio_by_number.get(payload.get("slide_number", 0), {})
                payload.update(
                    audio_file=audio.get("audio_file"),
                    duration_seconds=audio.get("duration_seconds"),
                    duration_in_frames=audio.get("duration_in_frames"),
                )
                serializable_slides.append(payload)

            serializable_scene_codes = [_plain(code) for code in scene_codes_raw]

            video_pres.slides = serializable_slides
            video_pres.scene_codes = serializable_scene_codes
            video_pres.status = VideoPresentationStatus.READY
            await session.commit()

            logger.info(f"Successfully generated video presentation: {video_pres.id}")

            return {
                "status": "ready",
                "video_presentation_id": video_pres.id,
                "title": video_pres.title,
                "slide_count": len(serializable_slides),
            }

        except Exception as e:
            logger.error(f"Error in _generate_video_presentation: {e!s}")
            video_pres.status = VideoPresentationStatus.FAILED
            await session.commit()
            raise
|
||||
|
|
@ -613,6 +613,41 @@ async def _stream_agent_events(
|
|||
status="completed",
|
||||
items=completed_items,
|
||||
)
|
||||
elif tool_name == "generate_video_presentation":
|
||||
vp_status = (
|
||||
tool_output.get("status", "unknown")
|
||||
if isinstance(tool_output, dict)
|
||||
else "unknown"
|
||||
)
|
||||
vp_title = (
|
||||
tool_output.get("title", "Presentation")
|
||||
if isinstance(tool_output, dict)
|
||||
else "Presentation"
|
||||
)
|
||||
if vp_status in ("pending", "generating"):
|
||||
completed_items = [
|
||||
f"Title: {vp_title}",
|
||||
"Presentation generation started",
|
||||
"Processing in background...",
|
||||
]
|
||||
elif vp_status == "failed":
|
||||
error_msg = (
|
||||
tool_output.get("error", "Unknown error")
|
||||
if isinstance(tool_output, dict)
|
||||
else "Unknown error"
|
||||
)
|
||||
completed_items = [
|
||||
f"Title: {vp_title}",
|
||||
f"Error: {error_msg[:50]}",
|
||||
]
|
||||
else:
|
||||
completed_items = last_active_step_items
|
||||
yield streaming_service.format_thinking_step(
|
||||
step_id=original_step_id,
|
||||
title="Generating video presentation",
|
||||
status="completed",
|
||||
items=completed_items,
|
||||
)
|
||||
elif tool_name == "generate_report":
|
||||
report_status = (
|
||||
tool_output.get("status", "unknown")
|
||||
|
|
@ -756,6 +791,34 @@ async def _stream_agent_events(
|
|||
f"Podcast generation failed: {error_msg}",
|
||||
"error",
|
||||
)
|
||||
elif tool_name == "generate_video_presentation":
|
||||
yield streaming_service.format_tool_output_available(
|
||||
tool_call_id,
|
||||
tool_output
|
||||
if isinstance(tool_output, dict)
|
||||
else {"result": tool_output},
|
||||
)
|
||||
if (
|
||||
isinstance(tool_output, dict)
|
||||
and tool_output.get("status") == "pending"
|
||||
):
|
||||
yield streaming_service.format_terminal_info(
|
||||
f"Video presentation queued: {tool_output.get('title', 'Presentation')}",
|
||||
"success",
|
||||
)
|
||||
elif (
|
||||
isinstance(tool_output, dict)
|
||||
and tool_output.get("status") == "failed"
|
||||
):
|
||||
error_msg = (
|
||||
tool_output.get("error", "Unknown error")
|
||||
if isinstance(tool_output, dict)
|
||||
else "Unknown error"
|
||||
)
|
||||
yield streaming_service.format_terminal_info(
|
||||
f"Presentation generation failed: {error_msg}",
|
||||
"error",
|
||||
)
|
||||
elif tool_name == "link_preview":
|
||||
yield streaming_service.format_tool_output_available(
|
||||
tool_call_id,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue