From b28f135a963fa15d4ba99edf2aaf8e666f8e8721 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Sat, 21 Mar 2026 22:13:41 -0700 Subject: [PATCH] feat: init video presentation agent --- .gitignore | 3 +- .vscode/launch.json | 35 +- surfsense_backend/.gitignore | 1 + .../107_add_video_presentations_table.py | 85 +++ .../app/agents/new_chat/system_prompt.py | 22 + .../app/agents/new_chat/tools/__init__.py | 3 + .../app/agents/new_chat/tools/registry.py | 12 + .../new_chat/tools/video_presentation.py | 171 +++++ .../app/agents/video_presentation/__init__.py | 10 + .../video_presentation/configuration.py | 25 + .../app/agents/video_presentation/graph.py | 30 + .../app/agents/video_presentation/nodes.py | 552 ++++++++++++++ .../app/agents/video_presentation/prompts.py | 509 +++++++++++++ .../app/agents/video_presentation/state.py | 72 ++ .../app/agents/video_presentation/utils.py | 30 + surfsense_backend/app/app.py | 2 +- surfsense_backend/app/celery_app.py | 1 + surfsense_backend/app/db.py | 65 ++ surfsense_backend/app/routes/__init__.py | 4 + .../app/routes/video_presentations_routes.py | 242 +++++++ surfsense_backend/app/schemas/__init__.py | 11 + .../app/schemas/video_presentations.py | 103 +++ .../celery_tasks/video_presentation_tasks.py | 178 +++++ .../app/tasks/chat/stream_new_chat.py | 63 ++ .../components/DocumentsTableShell.tsx | 32 +- .../new-chat/[[...chat_id]]/page.tsx | 3 + .../public-chat/public-chat-view.tsx | 2 + surfsense_web/components/tool-ui/index.ts | 1 + .../video-presentation/combined-player.tsx | 74 ++ .../generate-video-presentation.tsx | 682 ++++++++++++++++++ .../tool-ui/video-presentation/index.ts | 1 + surfsense_web/contracts/enums/toolIcons.tsx | 2 + surfsense_web/lib/remotion/compile-check.ts | 154 ++++ surfsense_web/lib/remotion/constants.ts | 2 + surfsense_web/lib/remotion/dom-to-pptx.d.ts | 18 + surfsense_web/package.json | 7 + surfsense_web/pnpm-lock.yaml | 384 ++++++++++ 37 files changed, 3567 insertions(+), 24 
deletions(-) create mode 100644 surfsense_backend/alembic/versions/107_add_video_presentations_table.py create mode 100644 surfsense_backend/app/agents/new_chat/tools/video_presentation.py create mode 100644 surfsense_backend/app/agents/video_presentation/__init__.py create mode 100644 surfsense_backend/app/agents/video_presentation/configuration.py create mode 100644 surfsense_backend/app/agents/video_presentation/graph.py create mode 100644 surfsense_backend/app/agents/video_presentation/nodes.py create mode 100644 surfsense_backend/app/agents/video_presentation/prompts.py create mode 100644 surfsense_backend/app/agents/video_presentation/state.py create mode 100644 surfsense_backend/app/agents/video_presentation/utils.py create mode 100644 surfsense_backend/app/routes/video_presentations_routes.py create mode 100644 surfsense_backend/app/schemas/video_presentations.py create mode 100644 surfsense_backend/app/tasks/celery_tasks/video_presentation_tasks.py create mode 100644 surfsense_web/components/tool-ui/video-presentation/combined-player.tsx create mode 100644 surfsense_web/components/tool-ui/video-presentation/generate-video-presentation.tsx create mode 100644 surfsense_web/components/tool-ui/video-presentation/index.ts create mode 100644 surfsense_web/lib/remotion/compile-check.ts create mode 100644 surfsense_web/lib/remotion/constants.ts create mode 100644 surfsense_web/lib/remotion/dom-to-pptx.d.ts diff --git a/.gitignore b/.gitignore index a44664ad..a5c44ce7 100644 --- a/.gitignore +++ b/.gitignore @@ -5,5 +5,4 @@ node_modules/ .ruff_cache/ .venv .pnpm-store -.DS_Store -RemotionTets/ \ No newline at end of file +.DS_Store \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json index 2c4784c0..029e7c64 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -22,7 +22,11 @@ "console": "integratedTerminal", "justMyCode": false, "cwd": "${workspaceFolder}/surfsense_backend", - "python": "${command:python.interpreterPath}" + 
"python": "uv", + "pythonArgs": [ + "run", + "python" + ] }, { "name": "Backend: FastAPI (No Reload)", @@ -32,7 +36,11 @@ "console": "integratedTerminal", "justMyCode": false, "cwd": "${workspaceFolder}/surfsense_backend", - "python": "${command:python.interpreterPath}" + "python": "uv", + "pythonArgs": [ + "run", + "python" + ] }, { "name": "Backend: FastAPI (main.py)", @@ -41,14 +49,19 @@ "program": "${workspaceFolder}/surfsense_backend/main.py", "console": "integratedTerminal", "justMyCode": false, - "cwd": "${workspaceFolder}/surfsense_backend" + "cwd": "${workspaceFolder}/surfsense_backend", + "python": "uv", + "pythonArgs": [ + "run", + "python" + ] }, { "name": "Frontend: Next.js", "type": "node", "request": "launch", "cwd": "${workspaceFolder}/surfsense_web", - "runtimeExecutable": "npm", + "runtimeExecutable": "pnpm", "runtimeArgs": ["run", "dev"], "console": "integratedTerminal", "serverReadyAction": { @@ -62,7 +75,7 @@ "type": "node", "request": "launch", "cwd": "${workspaceFolder}/surfsense_web", - "runtimeExecutable": "npm", + "runtimeExecutable": "pnpm", "runtimeArgs": ["run", "debug:server"], "console": "integratedTerminal", "serverReadyAction": { @@ -87,7 +100,11 @@ "console": "integratedTerminal", "justMyCode": false, "cwd": "${workspaceFolder}/surfsense_backend", - "python": "${command:python.interpreterPath}" + "python": "uv", + "pythonArgs": [ + "run", + "python" + ] }, { "name": "Celery: Beat Scheduler", @@ -103,7 +120,11 @@ "console": "integratedTerminal", "justMyCode": false, "cwd": "${workspaceFolder}/surfsense_backend", - "python": "${command:python.interpreterPath}" + "python": "uv", + "pythonArgs": [ + "run", + "python" + ] } ], "compounds": [ diff --git a/surfsense_backend/.gitignore b/surfsense_backend/.gitignore index 443c85e9..1cd7fd32 100644 --- a/surfsense_backend/.gitignore +++ b/surfsense_backend/.gitignore @@ -6,6 +6,7 @@ __pycache__/ .flashrank_cache surf_new_backend.egg-info/ podcasts/ +video_presentation_audio/ sandbox_files/ 
temp_audio/ celerybeat-schedule* diff --git a/surfsense_backend/alembic/versions/107_add_video_presentations_table.py b/surfsense_backend/alembic/versions/107_add_video_presentations_table.py new file mode 100644 index 00000000..e6f928b5 --- /dev/null +++ b/surfsense_backend/alembic/versions/107_add_video_presentations_table.py @@ -0,0 +1,85 @@ +"""Add video_presentations table and video_presentation_status enum + +Revision ID: 107 +Revises: 106 +""" + +from collections.abc import Sequence + +import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import JSONB + +from alembic import op + +revision: str = "107" +down_revision: str | None = "106" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + +video_presentation_status_enum = sa.Enum( + "pending", + "generating", + "ready", + "failed", + name="video_presentation_status", +) + + +def upgrade() -> None: + video_presentation_status_enum.create(op.get_bind(), checkfirst=True) + + op.create_table( + "video_presentations", + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column("title", sa.String(length=500), nullable=False), + sa.Column("slides", JSONB(), nullable=True), + sa.Column("scene_codes", JSONB(), nullable=True), + sa.Column( + "status", + video_presentation_status_enum, + server_default="ready", + nullable=False, + ), + sa.Column("search_space_id", sa.Integer(), nullable=False), + sa.Column("thread_id", sa.Integer(), nullable=True), + sa.Column( + "created_at", + sa.TIMESTAMP(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.ForeignKeyConstraint( + ["search_space_id"], + ["searchspaces.id"], + ondelete="CASCADE", + ), + sa.ForeignKeyConstraint( + ["thread_id"], + ["new_chat_threads.id"], + ondelete="SET NULL", + ), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + "ix_video_presentations_status", + "video_presentations", + ["status"], + ) + op.create_index( + "ix_video_presentations_thread_id", 
+ "video_presentations", + ["thread_id"], + ) + op.create_index( + "ix_video_presentations_created_at", + "video_presentations", + ["created_at"], + ) + + +def downgrade() -> None: + op.drop_index("ix_video_presentations_created_at", table_name="video_presentations") + op.drop_index("ix_video_presentations_thread_id", table_name="video_presentations") + op.drop_index("ix_video_presentations_status", table_name="video_presentations") + op.drop_table("video_presentations") + video_presentation_status_enum.drop(op.get_bind(), checkfirst=True) diff --git a/surfsense_backend/app/agents/new_chat/system_prompt.py b/surfsense_backend/app/agents/new_chat/system_prompt.py index cff13e8c..f8ac6278 100644 --- a/surfsense_backend/app/agents/new_chat/system_prompt.py +++ b/surfsense_backend/app/agents/new_chat/system_prompt.py @@ -132,6 +132,17 @@ _TOOL_INSTRUCTIONS["generate_podcast"] = """ - After calling this tool, inform the user that podcast generation has started and they will see the player when it's ready (takes 3-5 minutes). """ +_TOOL_INSTRUCTIONS["generate_video_presentation"] = """ +- generate_video_presentation: Generate a video presentation from provided content. + - Use this when the user asks to create a video, presentation, slides, or slide deck. + - Trigger phrases: "give me a presentation", "create slides", "generate a video", "make a slide deck", "turn this into a presentation" + - Args: + - source_content: The text content to turn into a presentation. The more detailed, the better. + - video_title: Optional title (default: "SurfSense Presentation") + - user_prompt: Optional style instructions (e.g., "Make it technical and detailed") + - After calling this tool, inform the user that generation has started and they will see the presentation when it's ready. +""" + _TOOL_INSTRUCTIONS["generate_report"] = """ - generate_report: Generate or revise a structured Markdown report artifact. 
- WHEN TO CALL THIS TOOL — the message must contain a creation or modification VERB directed at producing a deliverable: @@ -438,6 +449,16 @@ _TOOL_EXAMPLES["generate_podcast"] = """ - Then: `generate_podcast(source_content="Key insights about quantum computing from the knowledge base:\\n\\n[Comprehensive summary of all relevant search results with key facts, concepts, and findings]", podcast_title="Quantum Computing Explained")` """ +_TOOL_EXAMPLES["generate_video_presentation"] = """ +- User: "Give me a presentation about AI trends based on what we discussed" + - First search for relevant content, then call: `generate_video_presentation(source_content="Based on our conversation and search results: [detailed summary of chat + search findings]", video_title="AI Trends Presentation")` +- User: "Create slides summarizing this conversation" + - Call: `generate_video_presentation(source_content="Complete conversation summary:\\n\\nUser asked about [topic 1]:\\n[Your detailed response]\\n\\nUser then asked about [topic 2]:\\n[Your detailed response]\\n\\n[Continue for all exchanges in the conversation]", video_title="Conversation Summary")` +- User: "Make a video presentation about quantum computing" + - First search: `search_knowledge_base(query="quantum computing")` + - Then: `generate_video_presentation(source_content="Key insights about quantum computing from the knowledge base:\\n\\n[Comprehensive summary of all relevant search results with key facts, concepts, and findings]", video_title="Quantum Computing Explained")` +""" + _TOOL_EXAMPLES["generate_report"] = """ - User: "Generate a report about AI trends" - Call: `generate_report(topic="AI Trends Report", source_strategy="kb_search", search_queries=["AI trends recent developments", "artificial intelligence industry trends", "AI market growth and predictions"], report_style="detailed")` @@ -499,6 +520,7 @@ _ALL_TOOL_NAMES_ORDERED = [ "search_knowledge_base", "web_search", "generate_podcast", + 
"generate_video_presentation", "generate_report", "link_preview", "display_image", diff --git a/surfsense_backend/app/agents/new_chat/tools/__init__.py b/surfsense_backend/app/agents/new_chat/tools/__init__.py index 0a11951f..5002e69b 100644 --- a/surfsense_backend/app/agents/new_chat/tools/__init__.py +++ b/surfsense_backend/app/agents/new_chat/tools/__init__.py @@ -8,6 +8,7 @@ Available tools: - search_knowledge_base: Search the user's personal knowledge base - search_surfsense_docs: Search Surfsense documentation for usage help - generate_podcast: Generate audio podcasts from content +- generate_video_presentation: Generate video presentations with slides and narration - generate_image: Generate images from text descriptions using AI models - link_preview: Fetch rich previews for URLs - display_image: Display images in chat @@ -39,6 +40,7 @@ from .registry import ( from .scrape_webpage import create_scrape_webpage_tool from .search_surfsense_docs import create_search_surfsense_docs_tool from .user_memory import create_recall_memory_tool, create_save_memory_tool +from .video_presentation import create_generate_video_presentation_tool __all__ = [ # Registry @@ -51,6 +53,7 @@ __all__ = [ "create_display_image_tool", "create_generate_image_tool", "create_generate_podcast_tool", + "create_generate_video_presentation_tool", "create_link_preview_tool", "create_recall_memory_tool", "create_save_memory_tool", diff --git a/surfsense_backend/app/agents/new_chat/tools/registry.py b/surfsense_backend/app/agents/new_chat/tools/registry.py index 6f2e36b0..4feff7d9 100644 --- a/surfsense_backend/app/agents/new_chat/tools/registry.py +++ b/surfsense_backend/app/agents/new_chat/tools/registry.py @@ -73,6 +73,7 @@ from .shared_memory import ( create_save_shared_memory_tool, ) from .user_memory import create_recall_memory_tool, create_save_memory_tool +from .video_presentation import create_generate_video_presentation_tool from .web_search import create_web_search_tool # 
============================================================================= @@ -136,6 +137,17 @@ BUILTIN_TOOLS: list[ToolDefinition] = [ ), requires=["search_space_id", "db_session", "thread_id"], ), + # Video presentation generation tool + ToolDefinition( + name="generate_video_presentation", + description="Generate a video presentation with slides and narration from provided content", + factory=lambda deps: create_generate_video_presentation_tool( + search_space_id=deps["search_space_id"], + db_session=deps["db_session"], + thread_id=deps["thread_id"], + ), + requires=["search_space_id", "db_session", "thread_id"], + ), # Report generation tool (inline, short-lived sessions for DB ops) # Supports internal KB search via source_strategy so the agent doesn't # need to call search_knowledge_base separately before generating. diff --git a/surfsense_backend/app/agents/new_chat/tools/video_presentation.py b/surfsense_backend/app/agents/new_chat/tools/video_presentation.py new file mode 100644 index 00000000..68539910 --- /dev/null +++ b/surfsense_backend/app/agents/new_chat/tools/video_presentation.py @@ -0,0 +1,171 @@ +""" +Video presentation generation tool for the SurfSense agent. + +This module provides a factory function for creating the generate_video_presentation +tool that submits a Celery task for background video presentation generation. +The frontend polls for completion and auto-updates when the presentation is ready. 
+ +Duplicate request prevention: +- Only one video presentation can be generated at a time per search space +- Uses Redis to track active video presentation tasks +- Validates the Redis marker against actual DB status to avoid stale locks +""" + +from typing import Any + +import redis +from langchain_core.tools import tool +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.config import config +from app.db import VideoPresentation, VideoPresentationStatus + +REDIS_URL = config.REDIS_APP_URL +_redis_client: redis.Redis | None = None + + +def get_redis_client() -> redis.Redis: + """Get or create Redis client for video presentation task tracking.""" + global _redis_client + if _redis_client is None: + _redis_client = redis.from_url(REDIS_URL, decode_responses=True) + return _redis_client + + +def _redis_key(search_space_id: int) -> str: + return f"video_presentation:generating:{search_space_id}" + + +def get_generating_video_presentation_id(search_space_id: int) -> int | None: + """Get the video presentation ID currently being generated for this search space.""" + try: + client = get_redis_client() + value = client.get(_redis_key(search_space_id)) + return int(value) if value else None + except Exception: + return None + + +def clear_generating_video_presentation(search_space_id: int) -> None: + """Clear the generating marker (used when we detect a stale lock).""" + try: + client = get_redis_client() + client.delete(_redis_key(search_space_id)) + except Exception: + pass + + +def set_generating_video_presentation( + search_space_id: int, video_presentation_id: int +) -> None: + """Mark a video presentation as currently generating for this search space.""" + try: + client = get_redis_client() + client.setex(_redis_key(search_space_id), 1800, str(video_presentation_id)) + except Exception as e: + print( + f"[generate_video_presentation] Warning: Could not set generating video presentation in Redis: {e}" + ) + + +def 
create_generate_video_presentation_tool( + search_space_id: int, + db_session: AsyncSession, + thread_id: int | None = None, +): + """ + Factory function to create the generate_video_presentation tool with injected dependencies. + + Pre-creates video presentation record with pending status so the ID is available + immediately for frontend polling. + """ + + @tool + async def generate_video_presentation( + source_content: str, + video_title: str = "SurfSense Presentation", + user_prompt: str | None = None, + ) -> dict[str, Any]: + """Generate a video presentation from the provided content. + + Use this tool when the user asks to create a video, presentation, slides, or slide deck. + + Args: + source_content: The text content to turn into a presentation. + video_title: Title for the presentation (default: "SurfSense Presentation") + user_prompt: Optional style/tone instructions. + """ + try: + generating_id = get_generating_video_presentation_id(search_space_id) + if generating_id: + result = await db_session.execute( + select(VideoPresentation).filter( + VideoPresentation.id == generating_id + ) + ) + existing = result.scalars().first() + + if existing and existing.status == VideoPresentationStatus.GENERATING: + print( + f"[generate_video_presentation] Blocked duplicate — " + f"presentation {generating_id} is actively generating" + ) + return { + "status": VideoPresentationStatus.GENERATING.value, + "video_presentation_id": generating_id, + "title": video_title, + "message": "A video presentation is already being generated. Please wait for it to complete.", + } + + print( + f"[generate_video_presentation] Stale Redis lock for presentation {generating_id} " + f"(status={existing.status if existing else 'not found'}). Clearing and proceeding." 
+ ) + clear_generating_video_presentation(search_space_id) + + video_pres = VideoPresentation( + title=video_title, + status=VideoPresentationStatus.PENDING, + search_space_id=search_space_id, + thread_id=thread_id, + ) + db_session.add(video_pres) + await db_session.commit() + await db_session.refresh(video_pres) + + from app.tasks.celery_tasks.video_presentation_tasks import ( + generate_video_presentation_task, + ) + + task = generate_video_presentation_task.delay( + video_presentation_id=video_pres.id, + source_content=source_content, + search_space_id=search_space_id, + user_prompt=user_prompt, + ) + + set_generating_video_presentation(search_space_id, video_pres.id) + + print( + f"[generate_video_presentation] Created video presentation {video_pres.id}, task: {task.id}" + ) + + return { + "status": VideoPresentationStatus.PENDING.value, + "video_presentation_id": video_pres.id, + "title": video_title, + "message": "Video presentation generation started. This may take a few minutes.", + } + + except Exception as e: + error_message = str(e) + print(f"[generate_video_presentation] Error: {error_message}") + return { + "status": VideoPresentationStatus.FAILED.value, + "error": error_message, + "title": video_title, + "video_presentation_id": None, + } + + return generate_video_presentation diff --git a/surfsense_backend/app/agents/video_presentation/__init__.py b/surfsense_backend/app/agents/video_presentation/__init__.py new file mode 100644 index 00000000..caf88521 --- /dev/null +++ b/surfsense_backend/app/agents/video_presentation/__init__.py @@ -0,0 +1,10 @@ +"""Video Presentation LangGraph Agent. + +This module defines a graph for generating video presentations +from source content, similar to the podcaster agent but producing +slide-based video presentations with TTS narration. 
+""" + +from .graph import graph + +__all__ = ["graph"] diff --git a/surfsense_backend/app/agents/video_presentation/configuration.py b/surfsense_backend/app/agents/video_presentation/configuration.py new file mode 100644 index 00000000..18724a2a --- /dev/null +++ b/surfsense_backend/app/agents/video_presentation/configuration.py @@ -0,0 +1,25 @@ +"""Define the configurable parameters for the video presentation agent.""" + +from __future__ import annotations + +from dataclasses import dataclass, fields + +from langchain_core.runnables import RunnableConfig + + +@dataclass(kw_only=True) +class Configuration: + """The configuration for the video presentation agent.""" + + video_title: str + search_space_id: int + user_prompt: str | None = None + + @classmethod + def from_runnable_config( + cls, config: RunnableConfig | None = None + ) -> Configuration: + """Create a Configuration instance from a RunnableConfig object.""" + configurable = (config.get("configurable") or {}) if config else {} + _fields = {f.name for f in fields(cls) if f.init} + return cls(**{k: v for k, v in configurable.items() if k in _fields}) diff --git a/surfsense_backend/app/agents/video_presentation/graph.py b/surfsense_backend/app/agents/video_presentation/graph.py new file mode 100644 index 00000000..2fe54802 --- /dev/null +++ b/surfsense_backend/app/agents/video_presentation/graph.py @@ -0,0 +1,30 @@ +from langgraph.graph import StateGraph + +from .configuration import Configuration +from .nodes import ( + create_presentation_slides, + create_slide_audio, + generate_slide_scene_codes, +) +from .state import State + + +def build_graph(): + workflow = StateGraph(State, config_schema=Configuration) + + workflow.add_node("create_presentation_slides", create_presentation_slides) + workflow.add_node("create_slide_audio", create_slide_audio) + workflow.add_node("generate_slide_scene_codes", generate_slide_scene_codes) + + workflow.add_edge("__start__", "create_presentation_slides") + 
workflow.add_edge("create_presentation_slides", "create_slide_audio") + workflow.add_edge("create_slide_audio", "generate_slide_scene_codes") + workflow.add_edge("generate_slide_scene_codes", "__end__") + + graph = workflow.compile() + graph.name = "Surfsense Video Presentation" + + return graph + + +graph = build_graph() diff --git a/surfsense_backend/app/agents/video_presentation/nodes.py b/surfsense_backend/app/agents/video_presentation/nodes.py new file mode 100644 index 00000000..c11174c6 --- /dev/null +++ b/surfsense_backend/app/agents/video_presentation/nodes.py @@ -0,0 +1,552 @@ +import asyncio +import contextlib +import json +import math +import os +import shutil +import uuid +from pathlib import Path +from typing import Any + +from ffmpeg.asyncio import FFmpeg +from langchain_core.messages import HumanMessage, SystemMessage +from langchain_core.runnables import RunnableConfig +from litellm import aspeech + +from app.config import config as app_config +from app.services.kokoro_tts_service import get_kokoro_tts_service +from app.services.llm_service import get_agent_llm + +from .configuration import Configuration +from .prompts import ( + DEFAULT_DURATION_IN_FRAMES, + FPS, + REFINE_SCENE_SYSTEM_PROMPT, + REMOTION_SCENE_SYSTEM_PROMPT, + THEME_PRESETS, + build_scene_generation_user_prompt, + build_theme_assignment_user_prompt, + get_slide_generation_prompt, + get_theme_assignment_system_prompt, + pick_theme_and_mode_fallback, +) +from .state import ( + PresentationSlides, + SlideAudioResult, + SlideContent, + SlideSceneCode, + State, +) +from .utils import get_voice_for_provider + +MAX_REFINE_ATTEMPTS = 3 + + +async def create_presentation_slides( + state: State, config: RunnableConfig +) -> dict[str, Any]: + """Parse source content into structured presentation slides using LLM.""" + + configuration = Configuration.from_runnable_config(config) + search_space_id = configuration.search_space_id + user_prompt = configuration.user_prompt + + llm = await 
get_agent_llm(state.db_session, search_space_id) + if not llm: + error_message = f"No LLM configured for search space {search_space_id}" + print(error_message) + raise RuntimeError(error_message) + + prompt = get_slide_generation_prompt(user_prompt) + + messages = [ + SystemMessage(content=prompt), + HumanMessage( + content=f"{state.source_content}" + ), + ] + + llm_response = await llm.ainvoke(messages) + + try: + presentation = PresentationSlides.model_validate( + json.loads(llm_response.content) + ) + except (json.JSONDecodeError, ValueError) as e: + print(f"Direct JSON parsing failed, trying fallback approach: {e!s}") + + try: + content = llm_response.content + json_start = content.find("{") + json_end = content.rfind("}") + 1 + if json_start >= 0 and json_end > json_start: + json_str = content[json_start:json_end] + parsed_data = json.loads(json_str) + presentation = PresentationSlides.model_validate(parsed_data) + print("Successfully parsed presentation slides using fallback approach") + else: + error_message = f"Could not find valid JSON in LLM response. Raw response: {content}" + print(error_message) + raise ValueError(error_message) + + except (json.JSONDecodeError, ValueError) as e2: + error_message = f"Error parsing LLM response (fallback also failed): {e2!s}" + print(f"Error parsing LLM response: {e2!s}") + print(f"Raw response: {llm_response.content}") + raise + + return {"slides": presentation.slides} + + +async def create_slide_audio(state: State, config: RunnableConfig) -> dict[str, Any]: + """Generate TTS audio for each slide. + + Each slide's speaker_transcripts are generated as individual TTS chunks, + then concatenated with ffmpeg (matching the POC in RemotionTets/api/tts). 
+ """ + + session_id = str(uuid.uuid4()) + temp_dir = Path("temp_audio") + temp_dir.mkdir(exist_ok=True) + output_dir = Path("video_presentation_audio") + output_dir.mkdir(exist_ok=True) + + slides = state.slides or [] + voice = get_voice_for_provider(app_config.TTS_SERVICE, speaker_id=0) + ext = "wav" if app_config.TTS_SERVICE == "local/kokoro" else "mp3" + + async def _generate_tts_chunk(text: str, chunk_path: str) -> str: + """Generate a single TTS chunk and write it to *chunk_path*.""" + if app_config.TTS_SERVICE == "local/kokoro": + kokoro_service = await get_kokoro_tts_service(lang_code="a") + await kokoro_service.generate_speech( + text=text, + voice=voice, + speed=1.0, + output_path=chunk_path, + ) + else: + kwargs: dict[str, Any] = { + "model": app_config.TTS_SERVICE, + "api_key": app_config.TTS_SERVICE_API_KEY, + "voice": voice, + "input": text, + "max_retries": 2, + "timeout": 600, + } + if app_config.TTS_SERVICE_API_BASE: + kwargs["api_base"] = app_config.TTS_SERVICE_API_BASE + + response = await aspeech(**kwargs) + with open(chunk_path, "wb") as f: + f.write(response.content) + + return chunk_path + + async def _concat_with_ffmpeg(chunk_paths: list[str], output_file: str) -> None: + """Concatenate multiple audio chunks into one file using async ffmpeg.""" + ffmpeg = FFmpeg().option("y") + for chunk in chunk_paths: + ffmpeg = ffmpeg.input(chunk) + + filter_parts = [f"[{i}:0]" for i in range(len(chunk_paths))] + filter_str = ( + "".join(filter_parts) + f"concat=n={len(chunk_paths)}:v=0:a=1[outa]" + ) + ffmpeg = ffmpeg.option("filter_complex", filter_str) + ffmpeg = ffmpeg.output(output_file, map="[outa]") + await ffmpeg.execute() + + async def generate_audio_for_slide(slide: SlideContent) -> SlideAudioResult: + has_transcripts = ( + slide.speaker_transcripts and len(slide.speaker_transcripts) > 0 + ) + + if not has_transcripts: + print( + f"Slide {slide.slide_number}: no speaker_transcripts, " + f"using default duration ({DEFAULT_DURATION_IN_FRAMES} 
frames)" + ) + return SlideAudioResult( + slide_number=slide.slide_number, + audio_file="", + duration_seconds=DEFAULT_DURATION_IN_FRAMES / FPS, + duration_in_frames=DEFAULT_DURATION_IN_FRAMES, + ) + + output_file = str(output_dir / f"{session_id}_slide_{slide.slide_number}.{ext}") + + chunk_paths: list[str] = [] + try: + for i, text in enumerate(slide.speaker_transcripts): + chunk_path = str( + temp_dir + / f"{session_id}_slide_{slide.slide_number}_chunk_{i}.{ext}" + ) + print( + f" Slide {slide.slide_number} chunk {i + 1}/" + f"{len(slide.speaker_transcripts)}: " + f'"{text[:60]}..."' + ) + await _generate_tts_chunk(text, chunk_path) + chunk_paths.append(chunk_path) + + if len(chunk_paths) == 1: + shutil.move(chunk_paths[0], output_file) + else: + print( + f" Concatenating {len(chunk_paths)} chunks for slide " + f"{slide.slide_number} with ffmpeg" + ) + await _concat_with_ffmpeg(chunk_paths, output_file) + + duration_seconds = await _get_audio_duration(output_file) + duration_in_frames = math.ceil(duration_seconds * FPS) + + return SlideAudioResult( + slide_number=slide.slide_number, + audio_file=output_file, + duration_seconds=duration_seconds, + duration_in_frames=max(duration_in_frames, DEFAULT_DURATION_IN_FRAMES), + ) + + except Exception as e: + print(f"Error generating audio for slide {slide.slide_number}: {e!s}") + raise + finally: + for p in chunk_paths: + with contextlib.suppress(OSError): + os.remove(p) + + tasks = [generate_audio_for_slide(slide) for slide in slides] + audio_results = await asyncio.gather(*tasks) + + audio_results_sorted = sorted(audio_results, key=lambda r: r.slide_number) + + print( + f"Generated audio for {len(audio_results_sorted)} slides " + f"(total duration: {sum(r.duration_seconds for r in audio_results_sorted):.1f}s)" + ) + + return {"slide_audio_results": audio_results_sorted} + + +async def _get_audio_duration(file_path: str) -> float: + """Get audio duration in seconds using ffprobe (via python-ffmpeg). 
+ + Falls back to file-size estimation if ffprobe fails. + """ + try: + import subprocess + + proc = await asyncio.create_subprocess_exec( + "ffprobe", + "-v", + "error", + "-show_entries", + "format=duration", + "-of", + "default=noprint_wrappers=1:nokey=1", + file_path, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=10) + if proc.returncode == 0 and stdout.strip(): + return float(stdout.strip()) + except Exception as e: + print(f"ffprobe failed for {file_path}: {e!s}, using file-size estimation") + + try: + file_size = os.path.getsize(file_path) + if file_path.endswith(".wav"): + return file_size / (16000 * 2) + else: + return file_size / 16000 + except Exception: + return DEFAULT_DURATION_IN_FRAMES / FPS + + +async def _assign_themes_with_llm( + llm, slides: list[SlideContent] +) -> dict[int, tuple[str, str]]: + """Ask the LLM to assign a theme+mode to each slide in one call. + + Returns a dict mapping slide_number → (theme, mode). + Falls back to round-robin if the LLM response can't be parsed. 
+ """ + total = len(slides) + slide_summaries = [ + { + "slide_number": s.slide_number, + "title": s.title, + "subtitle": s.subtitle or "", + "background_explanation": s.background_explanation or "", + } + for s in slides + ] + + system = get_theme_assignment_system_prompt() + user = build_theme_assignment_user_prompt(slide_summaries) + + try: + response = await llm.ainvoke( + [ + SystemMessage(content=system), + HumanMessage(content=user), + ] + ) + + text = response.content.strip() + if text.startswith("```"): + lines = text.split("\n") + text = "\n".join( + line for line in lines if not line.strip().startswith("```") + ).strip() + + assignments = json.loads(text) + valid_themes = set(THEME_PRESETS) + result: dict[int, tuple[str, str]] = {} + for entry in assignments: + sn = entry.get("slide_number") + theme = entry.get("theme", "").upper() + mode = entry.get("mode", "dark").lower() + if sn and theme in valid_themes and mode in ("dark", "light"): + result[sn] = (theme, mode) + + if len(result) == total: + print( + "LLM theme assignment: " + + ", ".join(f"S{sn}={t}/{m}" for sn, (t, m) in sorted(result.items())) + ) + return result + + print( + f"LLM returned {len(result)}/{total} valid assignments, " + "filling gaps with fallback" + ) + for s in slides: + if s.slide_number not in result: + result[s.slide_number] = pick_theme_and_mode_fallback( + s.slide_number - 1, total + ) + return result + + except Exception as e: + print(f"LLM theme assignment failed ({e!s}), using fallback") + return { + s.slide_number: pick_theme_and_mode_fallback(s.slide_number - 1, total) + for s in slides + } + + +async def generate_slide_scene_codes( + state: State, config: RunnableConfig +) -> dict[str, Any]: + """Generate Remotion component code for each slide using LLM. + + First assigns a theme+mode to every slide via a single LLM call, + then generates scene code per slide with the assigned theme. 
+ """ + + configuration = Configuration.from_runnable_config(config) + search_space_id = configuration.search_space_id + + llm = await get_agent_llm(state.db_session, search_space_id) + if not llm: + raise RuntimeError(f"No LLM configured for search space {search_space_id}") + + slides = state.slides or [] + audio_results = state.slide_audio_results or [] + + audio_map: dict[int, SlideAudioResult] = {r.slide_number: r for r in audio_results} + total_slides = len(slides) + + theme_assignments = await _assign_themes_with_llm(llm, slides) + + scene_codes: list[SlideSceneCode] = [] + + for slide in slides: + audio = audio_map.get(slide.slide_number) + duration = audio.duration_in_frames if audio else DEFAULT_DURATION_IN_FRAMES + + theme, mode = theme_assignments.get( + slide.slide_number, + pick_theme_and_mode_fallback(slide.slide_number - 1, total_slides), + ) + + user_prompt = build_scene_generation_user_prompt( + slide_number=slide.slide_number, + total_slides=total_slides, + title=slide.title, + subtitle=slide.subtitle, + content_in_markdown=slide.content_in_markdown, + background_explanation=slide.background_explanation, + duration_in_frames=duration, + theme=theme, + mode=mode, + ) + + messages = [ + SystemMessage(content=REMOTION_SCENE_SYSTEM_PROMPT), + HumanMessage(content=user_prompt), + ] + + print( + f"Generating scene code for slide {slide.slide_number}/{total_slides}: " + f'"{slide.title}" ({duration} frames)' + ) + + llm_response = await llm.ainvoke(messages) + code, scene_title = _extract_code_and_title(llm_response.content) + + code = await _refine_if_needed(llm, code, slide.slide_number) + + scene_codes.append( + SlideSceneCode( + slide_number=slide.slide_number, + code=code, + title=scene_title or slide.title, + ) + ) + + print(f"Scene code ready for slide {slide.slide_number} ({len(code)} chars)") + + return {"slide_scene_codes": scene_codes} + + +def _extract_code_and_title(content: str) -> tuple[str, str | None]: + """Extract code and optional 
title from LLM response. + + The LLM may return a JSON object like the POC's structured output: + { "code": "...", "title": "..." } + Or it may return raw code (with optional markdown fences). + + Returns (code, title) where title may be None. + """ + text = content.strip() + + if text.startswith("{"): + try: + parsed = json.loads(text) + if isinstance(parsed, dict) and "code" in parsed: + return parsed["code"], parsed.get("title") + except (json.JSONDecodeError, ValueError): + pass + + json_start = text.find("{") + json_end = text.rfind("}") + 1 + if json_start >= 0 and json_end > json_start: + try: + parsed = json.loads(text[json_start:json_end]) + if isinstance(parsed, dict) and "code" in parsed: + return parsed["code"], parsed.get("title") + except (json.JSONDecodeError, ValueError): + pass + + code = text + if code.startswith("```"): + lines = code.split("\n") + start = 1 + end = len(lines) + for i in range(len(lines) - 1, 0, -1): + if lines[i].strip().startswith("```"): + end = i + break + code = "\n".join(lines[start:end]).strip() + + return code, None + + +async def _refine_if_needed(llm, code: str, slide_number: int) -> str: + """Attempt basic syntax validation and auto-repair via LLM if needed. + + Raises RuntimeError if the code is still invalid after MAX_REFINE_ATTEMPTS, + matching the POC's behavior where a failed slide aborts the pipeline. + """ + error = _basic_syntax_check(code) + if error is None: + return code + + for attempt in range(1, MAX_REFINE_ATTEMPTS + 1): + print( + f"Slide {slide_number}: syntax issue (attempt {attempt}/{MAX_REFINE_ATTEMPTS}): {error}" + ) + + messages = [ + SystemMessage(content=REFINE_SCENE_SYSTEM_PROMPT), + HumanMessage( + content=( + f"Here is the broken Remotion component code:\n\n{code}\n\n" + f"Compilation error:\n{error}\n\nFix the code." 
+ ) + ), + ] + + response = await llm.ainvoke(messages) + code, _ = _extract_code_and_title(response.content) + + error = _basic_syntax_check(code) + if error is None: + print(f"Slide {slide_number}: fixed on attempt {attempt}") + return code + + raise RuntimeError( + f"Slide {slide_number} failed to compile after {MAX_REFINE_ATTEMPTS} " + f"refine attempts. Last error: {error}" + ) + + +def _basic_syntax_check(code: str) -> str | None: + """Run a lightweight syntax check on the generated code. + + Full Babel-based compilation happens on the frontend. This backend check + catches the most common LLM code-generation mistakes so the refine loop + can fix them before persisting. + + Returns an error description or None if the code looks valid. + """ + if not code or not code.strip(): + return "Empty code" + + if "export" not in code and "MyComposition" not in code: + return "Missing exported component (expected 'export const MyComposition')" + + brace_count = 0 + paren_count = 0 + bracket_count = 0 + for ch in code: + if ch == "{": + brace_count += 1 + elif ch == "}": + brace_count -= 1 + elif ch == "(": + paren_count += 1 + elif ch == ")": + paren_count -= 1 + elif ch == "[": + bracket_count += 1 + elif ch == "]": + bracket_count -= 1 + + if brace_count < 0: + return "Unmatched closing brace '}'" + if paren_count < 0: + return "Unmatched closing parenthesis ')'" + if bracket_count < 0: + return "Unmatched closing bracket ']'" + + if brace_count != 0: + return f"Unbalanced braces: {brace_count} unclosed" + if paren_count != 0: + return f"Unbalanced parentheses: {paren_count} unclosed" + if bracket_count != 0: + return f"Unbalanced brackets: {bracket_count} unclosed" + + if "useCurrentFrame" not in code: + return "Missing useCurrentFrame() — required for Remotion animations" + + if "AbsoluteFill" not in code: + return "Missing AbsoluteFill — required as the root layout component" + + return None diff --git a/surfsense_backend/app/agents/video_presentation/prompts.py 
b/surfsense_backend/app/agents/video_presentation/prompts.py new file mode 100644 index 00000000..5533bb01 --- /dev/null +++ b/surfsense_backend/app/agents/video_presentation/prompts.py @@ -0,0 +1,509 @@ +import datetime + +# TODO: move these to config file +MAX_SLIDES = 5 +FPS = 30 +DEFAULT_DURATION_IN_FRAMES = 300 + +THEME_PRESETS = [ + "TERRA", + "OCEAN", + "SUNSET", + "EMERALD", + "ECLIPSE", + "ROSE", + "FROST", + "NEBULA", + "AURORA", + "CORAL", + "MIDNIGHT", + "AMBER", + "LAVENDER", + "STEEL", + "CITRUS", + "CHERRY", +] + +THEME_DESCRIPTIONS: dict[str, str] = { + "TERRA": "Warm earthy tones — terracotta, olive. Heritage, tradition, organic warmth.", + "OCEAN": "Cool oceanic depth — teal, coral accents. Calm, marine, fluid elegance.", + "SUNSET": "Vibrant warm energy — orange, purple. Passion, creativity, bold expression.", + "EMERALD": "Fresh natural life — green, mint. Growth, health, sustainability.", + "ECLIPSE": "Dramatic luxury — black, gold. Premium, power, prestige.", + "ROSE": "Soft elegance — dusty pink, mauve. Beauty, care, refined femininity.", + "FROST": "Crisp clarity — ice blue, silver. Tech, data, precision analytics.", + "NEBULA": "Cosmic mystery — magenta, deep purple. AI, innovation, cutting-edge future.", + "AURORA": "Ethereal northern lights — green-teal, violet. Mystical, transformative, wonder.", + "CORAL": "Tropical warmth — coral, turquoise. Inviting, lively, community.", + "MIDNIGHT": "Deep sophistication — navy, silver. Contemplative, trust, authority.", + "AMBER": "Rich honey warmth — amber, brown. Comfort, wisdom, organic richness.", + "LAVENDER": "Gentle dreaminess — purple, lilac. Calm, imaginative, serene.", + "STEEL": "Industrial strength — gray, steel blue. Modern professional, reliability.", + "CITRUS": "Bright optimism — yellow, lime. Energy, joy, fresh starts.", + "CHERRY": "Bold impact — deep red, dark. 
Power, urgency, passionate conviction.", +} + + +# --------------------------------------------------------------------------- +# LLM-based theme assignment (replaces keyword-based pick_theme_and_mode) +# --------------------------------------------------------------------------- + +THEME_ASSIGNMENT_SYSTEM_PROMPT = """You are a visual design director assigning color themes to presentation slides. +Given a list of slides, assign each slide a theme preset and color mode (dark or light). + +Available themes (name — description): +{theme_list} + +Rules: +1. Pick the theme that best matches each slide's mood, content, and visual direction. +2. Maximize visual variety — avoid repeating the same theme on consecutive slides. +3. Mix dark and light modes across the presentation for contrast and rhythm. +4. Opening slides often benefit from a bold dark theme; closing/summary slides can go either way. +5. The "background_explanation" field is the primary signal — it describes the intended mood and color direction. + +Return ONLY a JSON array (no markdown fences, no explanation): +[ + {{"slide_number": 1, "theme": "THEME_NAME", "mode": "dark"}}, + {{"slide_number": 2, "theme": "THEME_NAME", "mode": "light"}} +] +""".strip() + + +def build_theme_assignment_user_prompt( + slides: list[dict[str, str]], +) -> str: + """Build the user prompt for LLM theme assignment. + + *slides* is a list of dicts with keys: slide_number, title, subtitle, + background_explanation (mood). 
+ """ + lines = ["Assign a theme and mode to each of these slides:", ""] + for s in slides: + lines.append( + f'Slide {s["slide_number"]}: "{s["title"]}" ' + f'(subtitle: "{s.get("subtitle", "")}") — ' + f'Mood: "{s.get("background_explanation", "neutral")}"' + ) + return "\n".join(lines) + + +def get_theme_assignment_system_prompt() -> str: + """Return the theme assignment system prompt with the full theme list injected.""" + theme_list = "\n".join( + f"- {name}: {desc}" for name, desc in THEME_DESCRIPTIONS.items() + ) + return THEME_ASSIGNMENT_SYSTEM_PROMPT.format(theme_list=theme_list) + + +def pick_theme_and_mode_fallback( + slide_index: int, total_slides: int +) -> tuple[str, str]: + """Simple round-robin fallback when LLM theme assignment fails.""" + theme = THEME_PRESETS[slide_index % len(THEME_PRESETS)] + mode = "dark" if slide_index % 2 == 0 else "light" + if total_slides == 1: + mode = "dark" + return theme, mode + + +def get_slide_generation_prompt(user_prompt: str | None = None) -> str: + return f""" +Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")} + +You are a content-to-slides converter. You receive raw source content (articles, notes, transcripts, +product descriptions, chat conversations, etc.) and break it into a sequence of presentation slides +for a video presentation with voiceover narration. + +{ + f''' +You **MUST** strictly adhere to the following user instruction while generating the slides: + +{user_prompt} + +''' + if user_prompt + else "" + } + + +- '': A block of text containing the information to be presented. This could be + research findings, an article summary, a detailed outline, user chat history, or any relevant + raw information. The content serves as the factual basis for the video presentation. 
+ + + +A JSON object containing the presentation slides: +{{ + "slides": [ + {{ + "slide_number": 1, + "title": "Concise slide title", + "subtitle": "One-line subtitle or tagline", + "content_in_markdown": "## Heading\\n- Bullet point 1\\n- **Bold text**\\n- Bullet point 3", + "speaker_transcripts": [ + "First narration sentence for this slide.", + "Second narration sentence expanding on the point.", + "Third sentence wrapping up this slide." + ], + "background_explanation": "Emotional mood and color direction for this slide" + }} + ] +}} + + + +=== SLIDE COUNT === + +Dynamically decide the number of slides between 1 and {MAX_SLIDES} (inclusive). +Base your decision entirely on the content's depth, richness, and how many distinct ideas it contains. +Thin or simple content should produce fewer slides; dense or multi-faceted content may use more. +Do NOT inflate or pad slides to reach { + MAX_SLIDES + } — only use what the content genuinely warrants. +Do NOT treat {MAX_SLIDES} as a target; it is a hard ceiling, not a goal. + +=== SLIDE STRUCTURE === + +- Each slide should cover ONE distinct key idea or section. +- Keep slides focused: 2-5 bullet points of content per slide max. +- The first slide should be a title/intro slide. +- The last slide should be a summary or closing slide ONLY if there are 3+ slides. + For 1-2 slides, skip the closing slide — just cover the content. +- Do NOT create a separate closing slide if its content would just repeat earlier slides. + +=== CONTENT FIELDS === + +- Write speaker_transcripts as if a human presenter is narrating — natural, conversational, 2-4 sentences per slide. + These will be converted to TTS audio, so write in a way that sounds great when spoken aloud. 
+- background_explanation should describe a visual style matching the slide's mood: + - Describe the emotional feel: "warm and organic", "dramatic and urgent", "clean and optimistic", + "technical and precise", "celebratory", "earthy and grounded", "cosmic and futuristic" + - Mention color direction: warm tones, cool tones, earth tones, neon accents, gold/black, etc. + - Vary the mood across slides — do NOT always say "dark blue gradient". +- content_in_markdown should use proper markdown: ## headings, **bold**, - bullets, etc. + +=== NARRATION QUALITY === + +- Speaker transcripts should explain the slide content in an engaging, presenter-like voice. +- Keep narration concise: 2-4 sentences per slide (targeting ~10-15 seconds of audio per slide). +- The narration should add context beyond what's on the slide — don't just read the bullets. +- Use natural language: contractions, conversational tone, occasional enthusiasm. + + + +Input: "Quantum computing uses quantum bits or qubits which can exist in multiple states simultaneously due to superposition." + +Output: +{{ + "slides": [ + {{ + "slide_number": 1, + "title": "Quantum Computing", + "subtitle": "Beyond Classical Bits", + "content_in_markdown": "## The Quantum Leap\\n- Classical computers use **bits** (0 or 1)\\n- Quantum computers use **qubits**\\n- Qubits leverage **superposition**", + "speaker_transcripts": [ + "Let's explore quantum computing, a technology that's fundamentally different from the computers we use every day.", + "While traditional computers work with bits that are either zero or one, quantum computers use something called qubits.", + "The magic of qubits is superposition — they can exist in multiple states at the same time." + ], + "background_explanation": "Cosmic and futuristic with deep purple and magenta tones, evoking the mystery of quantum mechanics" + }} + ] +}} + + +Transform the source material into well-structured presentation slides with engaging narration. 
+Ensure each slide has a clear visual mood and natural-sounding speaker transcripts. + +""" + + +# --------------------------------------------------------------------------- +# Remotion scene code generation prompt +# Ported from RemotionTets POC /api/generate system prompt +# --------------------------------------------------------------------------- + +REMOTION_SCENE_SYSTEM_PROMPT = """ +You are a Remotion component generator that creates cinematic, modern motion graphics. +Generate a single self-contained React component that uses Remotion. + +=== THEME PRESETS (pick ONE per slide — see user prompt for which to use) === + +Each slide MUST use a DIFFERENT preset. The user prompt will tell you which preset to use. +Use ALL colors from that preset — background, surface, text, accent, glow. Do NOT mix presets. + +TERRA (warm earth — terracotta + olive): + dark: bg #1C1510 surface #261E16 border #3D3024 text #E8DDD0 muted #9A8A78 accent #C2623D secondary #7D8C52 glow rgba(194,98,61,0.12) + light: bg #F7F0E8 surface #FFF8F0 border #DDD0BF text #2C1D0E muted #8A7A68 accent #B85430 secondary #6B7A42 glow rgba(184,84,48,0.08) + gradient-dark: radial-gradient(ellipse at 30% 80%, rgba(194,98,61,0.18), transparent 60%), linear-gradient(180deg, #1C1510, #261E16) + gradient-light: radial-gradient(ellipse at 70% 20%, rgba(107,122,66,0.12), transparent 55%), linear-gradient(180deg, #F7F0E8, #FFF8F0) + +OCEAN (cool depth — teal + coral): + dark: bg #0B1A1E surface #122428 border #1E3740 text #D5EAF0 muted #6A9AA8 accent #1DB6A8 secondary #E87461 glow rgba(29,182,168,0.12) + light: bg #F0F8FA surface #FFFFFF border #C8E0E8 text #0E2830 muted #5A8A98 accent #0EA69A secondary #D05F4E glow rgba(14,166,154,0.08) + gradient-dark: radial-gradient(ellipse at 80% 30%, rgba(29,182,168,0.20), transparent 55%), radial-gradient(circle at 20% 80%, rgba(232,116,97,0.10), transparent 50%), #0B1A1E + gradient-light: radial-gradient(ellipse at 20% 40%, rgba(14,166,154,0.10), transparent 55%), 
linear-gradient(180deg, #F0F8FA, #FFFFFF) + +SUNSET (warm energy — orange + purple): + dark: bg #1E130F surface #2A1B14 border #42291C text #F0DDD0 muted #A08878 accent #E86A20 secondary #A855C0 glow rgba(232,106,32,0.12) + light: bg #FFF5ED surface #FFFFFF border #EADAC8 text #2E1508 muted #907860 accent #D05A18 secondary #9045A8 glow rgba(208,90,24,0.08) + gradient-dark: linear-gradient(135deg, rgba(232,106,32,0.15) 0%, transparent 40%), radial-gradient(circle at 80% 70%, rgba(168,85,192,0.15), transparent 50%), #1E130F + gradient-light: linear-gradient(135deg, rgba(208,90,24,0.08) 0%, rgba(144,69,168,0.06) 100%), #FFF5ED + +EMERALD (fresh life — green + mint): + dark: bg #0B1E14 surface #12281A border #1E3C28 text #D0F0E0 muted #5EA880 accent #10B981 secondary #84CC16 glow rgba(16,185,129,0.12) + light: bg #F0FAF5 surface #FFFFFF border #C0E8D0 text #0E2C18 muted #489068 accent #059669 secondary #65A30D glow rgba(5,150,105,0.08) + gradient-dark: radial-gradient(ellipse at 50% 50%, rgba(16,185,129,0.18), transparent 60%), linear-gradient(180deg, #0B1E14, #12281A) + gradient-light: radial-gradient(ellipse at 60% 30%, rgba(101,163,13,0.10), transparent 55%), linear-gradient(180deg, #F0FAF5, #FFFFFF) + +ECLIPSE (dramatic — black + gold): + dark: bg #100C05 surface #1A1508 border #2E2510 text #D4B96A muted #8A7840 accent #E8B830 secondary #C09020 glow rgba(232,184,48,0.14) + light: bg #FAF6ED surface #FFFFFF border #E0D8C0 text #1A1408 muted #7A6818 accent #C09820 secondary #A08018 glow rgba(192,152,32,0.08) + gradient-dark: radial-gradient(circle at 50% 40%, rgba(232,184,48,0.20), transparent 50%), radial-gradient(ellipse at 50% 90%, rgba(192,144,32,0.08), transparent 50%), #100C05 + gradient-light: radial-gradient(circle at 50% 40%, rgba(192,152,32,0.10), transparent 55%), linear-gradient(180deg, #FAF6ED, #FFFFFF) + +ROSE (soft elegance — dusty pink + mauve): + dark: bg #1E1018 surface #281820 border #3D2830 text #F0D8E0 muted #A08090 accent #E4508C secondary 
#B06498 glow rgba(228,80,140,0.12) + light: bg #FDF2F5 surface #FFFFFF border #F0D0D8 text #2C1018 muted #906878 accent #D43D78 secondary #9A5080 glow rgba(212,61,120,0.08) + gradient-dark: radial-gradient(ellipse at 70% 30%, rgba(228,80,140,0.18), transparent 55%), radial-gradient(circle at 20% 80%, rgba(176,100,152,0.10), transparent 50%), #1E1018 + gradient-light: radial-gradient(ellipse at 30% 60%, rgba(212,61,120,0.08), transparent 55%), linear-gradient(180deg, #FDF2F5, #FFFFFF) + +FROST (crisp clarity — ice blue + silver): + dark: bg #0A1520 surface #101D2A border #1A3040 text #D0E5F5 muted #6090B0 accent #5AB4E8 secondary #8BA8C0 glow rgba(90,180,232,0.12) + light: bg #F0F6FC surface #FFFFFF border #C8D8E8 text #0C1820 muted #5080A0 accent #3A96D0 secondary #7090A8 glow rgba(58,150,208,0.08) + gradient-dark: radial-gradient(ellipse at 40% 20%, rgba(90,180,232,0.16), transparent 55%), linear-gradient(180deg, #0A1520, #101D2A) + gradient-light: radial-gradient(ellipse at 50% 50%, rgba(58,150,208,0.08), transparent 55%), linear-gradient(180deg, #F0F6FC, #FFFFFF) + +NEBULA (cosmic — magenta + deep purple): + dark: bg #150A1E surface #1E1028 border #351A48 text #E0D0F0 muted #8060A0 accent #C850E0 secondary #8030C0 glow rgba(200,80,224,0.14) + light: bg #F8F0FF surface #FFFFFF border #E0C8F0 text #1A0A24 muted #7050A0 accent #A840C0 secondary #6820A0 glow rgba(168,64,192,0.08) + gradient-dark: radial-gradient(circle at 60% 40%, rgba(200,80,224,0.18), transparent 50%), radial-gradient(ellipse at 30% 80%, rgba(128,48,192,0.12), transparent 50%), #150A1E + gradient-light: radial-gradient(circle at 40% 30%, rgba(168,64,192,0.10), transparent 55%), linear-gradient(180deg, #F8F0FF, #FFFFFF) + +AURORA (ethereal lights — green-teal + violet): + dark: bg #0A1A1A surface #102020 border #1A3838 text #D0F0F0 muted #60A0A0 accent #30D0B0 secondary #8040D0 glow rgba(48,208,176,0.12) + light: bg #F0FAF8 surface #FFFFFF border #C0E8E0 text #0A2020 muted #508080 accent #20B090 
secondary #6830B0 glow rgba(32,176,144,0.08) + gradient-dark: radial-gradient(ellipse at 30% 70%, rgba(48,208,176,0.18), transparent 55%), radial-gradient(circle at 70% 30%, rgba(128,64,208,0.12), transparent 50%), #0A1A1A + gradient-light: radial-gradient(ellipse at 50% 40%, rgba(32,176,144,0.10), transparent 55%), linear-gradient(180deg, #F0FAF8, #FFFFFF) + +CORAL (tropical warmth — coral + turquoise): + dark: bg #1E0F0F surface #281818 border #402828 text #F0D8D8 muted #A07070 accent #F06050 secondary #30B8B0 glow rgba(240,96,80,0.12) + light: bg #FFF5F3 surface #FFFFFF border #F0D0C8 text #2E1010 muted #906060 accent #E04838 secondary #20A098 glow rgba(224,72,56,0.08) + gradient-dark: radial-gradient(ellipse at 60% 60%, rgba(240,96,80,0.18), transparent 55%), radial-gradient(circle at 30% 30%, rgba(48,184,176,0.10), transparent 50%), #1E0F0F + gradient-light: radial-gradient(ellipse at 40% 50%, rgba(224,72,56,0.08), transparent 55%), linear-gradient(180deg, #FFF5F3, #FFFFFF) + +MIDNIGHT (deep sophistication — navy + silver): + dark: bg #080C18 surface #0E1420 border #1A2438 text #C8D8F0 muted #5070A0 accent #4080E0 secondary #A0B0D0 glow rgba(64,128,224,0.12) + light: bg #F0F2F8 surface #FFFFFF border #C8D0E0 text #101828 muted #506080 accent #3060C0 secondary #8090B0 glow rgba(48,96,192,0.08) + gradient-dark: radial-gradient(ellipse at 50% 30%, rgba(64,128,224,0.16), transparent 55%), linear-gradient(180deg, #080C18, #0E1420) + gradient-light: radial-gradient(ellipse at 50% 50%, rgba(48,96,192,0.08), transparent 55%), linear-gradient(180deg, #F0F2F8, #FFFFFF) + +AMBER (rich honey warmth — amber + brown): + dark: bg #1A1208 surface #221A0E border #3A2C18 text #F0E0C0 muted #A09060 accent #E0A020 secondary #C08030 glow rgba(224,160,32,0.12) + light: bg #FFF8E8 surface #FFFFFF border #E8D8B8 text #2A1C08 muted #907840 accent #C88810 secondary #A86820 glow rgba(200,136,16,0.08) + gradient-dark: radial-gradient(ellipse at 40% 60%, rgba(224,160,32,0.18), transparent 
55%), linear-gradient(180deg, #1A1208, #221A0E) + gradient-light: radial-gradient(ellipse at 60% 40%, rgba(200,136,16,0.10), transparent 55%), linear-gradient(180deg, #FFF8E8, #FFFFFF) + +LAVENDER (gentle dreaminess — purple + lilac): + dark: bg #14101E surface #1C1628 border #302840 text #E0D8F0 muted #8070A0 accent #A060E0 secondary #C090D0 glow rgba(160,96,224,0.12) + light: bg #F8F0FF surface #FFFFFF border #E0D0F0 text #1C1028 muted #706090 accent #8848C0 secondary #A878B8 glow rgba(136,72,192,0.08) + gradient-dark: radial-gradient(ellipse at 60% 40%, rgba(160,96,224,0.18), transparent 55%), radial-gradient(circle at 30% 70%, rgba(192,144,208,0.10), transparent 50%), #14101E + gradient-light: radial-gradient(ellipse at 40% 30%, rgba(136,72,192,0.10), transparent 55%), linear-gradient(180deg, #F8F0FF, #FFFFFF) + +STEEL (industrial strength — gray + steel blue): + dark: bg #101214 surface #181C20 border #282E38 text #D0D8E0 muted #708090 accent #5088B0 secondary #90A0B0 glow rgba(80,136,176,0.12) + light: bg #F2F4F6 surface #FFFFFF border #D0D8E0 text #181C24 muted #607080 accent #3870A0 secondary #708898 glow rgba(56,112,160,0.08) + gradient-dark: radial-gradient(ellipse at 50% 50%, rgba(80,136,176,0.14), transparent 55%), linear-gradient(180deg, #101214, #181C20) + gradient-light: radial-gradient(ellipse at 50% 40%, rgba(56,112,160,0.08), transparent 55%), linear-gradient(180deg, #F2F4F6, #FFFFFF) + +CITRUS (bright optimism — yellow + lime): + dark: bg #181808 surface #202010 border #383818 text #F0F0C0 muted #A0A060 accent #E8D020 secondary #90D030 glow rgba(232,208,32,0.12) + light: bg #FFFFF0 surface #FFFFFF border #E8E8C0 text #282808 muted #808040 accent #C8B010 secondary #70B020 glow rgba(200,176,16,0.08) + gradient-dark: radial-gradient(ellipse at 40% 40%, rgba(232,208,32,0.18), transparent 55%), radial-gradient(circle at 70% 70%, rgba(144,208,48,0.10), transparent 50%), #181808 + gradient-light: radial-gradient(ellipse at 50% 30%, 
rgba(200,176,16,0.10), transparent 55%), linear-gradient(180deg, #FFFFF0, #FFFFFF) + +CHERRY (bold impact — deep red + dark): + dark: bg #1A0808 surface #241010 border #401818 text #F0D0D0 muted #A06060 accent #D02030 secondary #E05060 glow rgba(208,32,48,0.14) + light: bg #FFF0F0 surface #FFFFFF border #F0C8C8 text #280808 muted #904848 accent #B01828 secondary #C83848 glow rgba(176,24,40,0.08) + gradient-dark: radial-gradient(ellipse at 50% 40%, rgba(208,32,48,0.20), transparent 50%), linear-gradient(180deg, #1A0808, #241010) + gradient-light: radial-gradient(ellipse at 50% 50%, rgba(176,24,40,0.10), transparent 55%), linear-gradient(180deg, #FFF0F0, #FFFFFF) + +=== SHARED TOKENS (use with any theme above) === + +SPACING: xs 8px, sm 16px, md 24px, lg 32px, xl 48px, 2xl 64px, 3xl 96px, 4xl 128px +TYPOGRAPHY: fontFamily "Inter, system-ui, -apple-system, sans-serif" + caption 14px/1.4, body 18px/1.6, subhead 24px/1.4, title 40px/1.2 w600, headline 64px/1.1 w700, display 96px/1.0 w800 + letterSpacing: tight "-0.02em", normal "0", wide "0.05em" +BORDER RADIUS: 12px (cards), 8px (buttons), 9999px (pills) + +=== VISUAL VARIETY (CRITICAL) === + +The user prompt assigns each slide a specific theme preset AND mode (dark/light). +You MUST use EXACTLY the assigned preset and mode. Additionally: + +1. Use the preset's gradient as the AbsoluteFill background. +2. Use the preset's accent/secondary colors for highlights, pill badges, and card accents. +3. Use the preset's glow value for all boxShadow effects. +4. LAYOUT VARIATION: Vary layout between slides: + - One slide: bold centered headline + subtle stat + - Another: two-column card layout + - Another: single large number or quote as hero + Do NOT use the same layout pattern for every slide. + +=== LAYOUT RULES (CRITICAL — elements must NEVER overlap) === + +The canvas is 1920x1080. You MUST use a SINGLE-LAYER layout. NO stacking, NO multiple AbsoluteFill layers. 
+ +STRUCTURE — every component must follow this exact pattern: + + {/* ALL content goes here as direct children in normal flow */} + + +ABSOLUTE RULES: +- Use exactly ONE AbsoluteFill as the root. Set its background color/gradient via its style prop. +- NEVER nest AbsoluteFill inside AbsoluteFill. +- NEVER use position "absolute" or position "fixed" on ANY element. +- NEVER use multiple layers or z-index. +- ALL elements must be in normal document flow inside the single root AbsoluteFill. + +SPACING: +- Root padding: 80px on all sides (safe area). +- Use flexDirection "column" with gap for vertical stacking, flexDirection "row" with gap for horizontal. +- Minimum gap between elements: 24px vertical, 32px horizontal. +- Text hierarchy gaps: headline→subheading 16px, subheading→body 12px, body→button 32px. +- Cards/panels: padding 32px-48px, borderRadius 12px. +- NEVER use margin to space siblings — always use the parent's gap property. + +=== DESIGN STYLE === + +- Premium aesthetic — use the exact colors from the assigned theme preset (do NOT invent your own) +- Background: use the preset's gradient-dark or gradient-light value directly as the AbsoluteFill's background +- Card/surface backgrounds: use the preset's surface color +- Text colors: use the preset's text, muted values +- Borders: use the preset's border color +- Glows: use the preset's glow value for all boxShadow — do NOT substitute other colors +- Generous whitespace — less is more, let elements breathe +- NO decorative background shapes, blurs, or overlapping ornaments + +=== REMOTION RULES === + +- Export the component as: export const MyComposition = () => { ... } +- Use useCurrentFrame() and useVideoConfig() from "remotion" +- Do NOT use Sequence +- Do NOT manually calculate animation timings or frame offsets + +=== ANIMATION (use the stagger() helper for ALL element animations) === + +A pre-built helper function called stagger() is available globally. 
+It handles enter, hold, and exit phases automatically — you MUST use it. + +Signature: + stagger(frame, fps, index, total) → { opacity: number, transform: string } + +Parameters: + frame — from useCurrentFrame() + fps — from useVideoConfig() + index — 0-based index of this element in the entrance order + total — total number of animated elements in the scene + +It returns a style object with opacity and transform that you spread onto the element. +Timing is handled for you: staggered spring entrances, ambient hold motion, and a graceful exit. + +Usage pattern: + const frame = useCurrentFrame(); + const { fps } = useVideoConfig(); + +
+  <div style={{ ...stagger(frame, fps, 0, 4) }}>Headline</div>
+  <div style={{ ...stagger(frame, fps, 1, 4) }}>Subtitle</div>
+  <div style={{ ...stagger(frame, fps, 2, 4) }}>Card</div>
+  <div style={{ ...stagger(frame, fps, 3, 4) }}>Footer</div>
+ +Rules: +- Count ALL animated elements in your scene and pass that count as the "total" parameter. +- Assign each element a sequential index starting from 0. +- You can merge stagger's return with additional styles: +
+- For non-animated static elements (backgrounds, borders), just use normal styles without stagger. +- You may still use spring() and interpolate() for EXTRA custom effects (e.g., a number counter, + color shift, or typewriter effect), but stagger() must drive all entrance/exit animations. + +=== AVAILABLE GLOBALS (injected at runtime, do NOT import anything else) === + +- React (available globally) +- AbsoluteFill, useCurrentFrame, useVideoConfig, spring, interpolate, Easing from "remotion" +- stagger(frame, fps, index, total) — animation helper described above + +=== CODE RULES === + +- Output ONLY the raw code, no markdown fences, no explanations +- Keep it fully self-contained, no external dependencies or images +- Use inline styles only (no CSS imports, no className) +- Target 1920x1080 resolution +- Every container must use display "flex" with explicit gap values +- NEVER use marginTop/marginBottom to space siblings — use the parent's gap instead +""".strip() + + +def build_scene_generation_user_prompt( + slide_number: int, + total_slides: int, + title: str, + subtitle: str, + content_in_markdown: str, + background_explanation: str, + duration_in_frames: int, + theme: str, + mode: str, +) -> str: + """Build the user prompt for generating a single slide's Remotion scene code. + + *theme* and *mode* are pre-assigned (by LLM or fallback) before this is called. 
+ """ + return "\n".join( + [ + "Create a cinematic, visually striking Remotion scene.", + f"The video is {duration_in_frames} frames at {FPS}fps ({duration_in_frames / FPS:.1f}s total).", + "", + f"This is slide {slide_number} of {total_slides} in the video.", + "", + f"=== ASSIGNED THEME: {theme} / {mode.upper()} mode ===", + f"You MUST use the {theme} preset in {mode} mode from the theme presets above.", + f"Use its exact background gradient (gradient-{mode}), surface, text, accent, secondary, border, and glow colors.", + "Do NOT substitute, invent, or default to blue/violet colors.", + "", + f'The scene should communicate this message: "{title} — {subtitle}"', + "", + "Key ideas to convey (use as creative inspiration, NOT literal text to dump on screen):", + content_in_markdown, + "", + "Pick only the 1-2 most impactful phrases or numbers to display as text.", + "", + f"Mood & tone: {background_explanation}", + ] + ) + + +REFINE_SCENE_SYSTEM_PROMPT = """ +You are a code repair assistant. You will receive a Remotion React component that failed to compile, +along with the exact error message from the Babel transpiler. + +Your job is to fix the code so it compiles and runs correctly. + +RULES: +- Output ONLY the fixed raw code as a string — no markdown fences, no explanations. +- Preserve the original intent, design, and animations as closely as possible. +- The component must be exported as: export const MyComposition = () => { ... } +- Only these globals are available at runtime (they are injected, not actually imported): + React, AbsoluteFill, useCurrentFrame, useVideoConfig, spring, interpolate, Easing, + stagger (a helper: stagger(frame, fps, index, total) → { opacity, transform }) +- Keep import statements at the top (they get stripped by the compiler) but do NOT import anything + other than "react" and "remotion". +- Use inline styles only (no CSS, no className). +- Common fixes: + - Mismatched braces/brackets in JSX style objects (e.g. 
}}, instead of }}>) + - Missing closing tags + - Trailing commas before > in JSX + - Undefined variables or typos + - Invalid JSX expressions +- After fixing, mentally walk through every brace pair { } and JSX tag to verify they match. +""".strip() diff --git a/surfsense_backend/app/agents/video_presentation/state.py b/surfsense_backend/app/agents/video_presentation/state.py new file mode 100644 index 00000000..53c989f7 --- /dev/null +++ b/surfsense_backend/app/agents/video_presentation/state.py @@ -0,0 +1,72 @@ +"""Define the state structures for the video presentation agent.""" + +from __future__ import annotations + +from dataclasses import dataclass + +from pydantic import BaseModel, Field +from sqlalchemy.ext.asyncio import AsyncSession + + +class SlideContent(BaseModel): + """Represents a single parsed slide from content analysis.""" + + slide_number: int = Field(..., description="1-based slide number") + title: str = Field(..., description="Concise slide title") + subtitle: str = Field(..., description="One-line subtitle or tagline") + content_in_markdown: str = Field( + ..., description="Slide body content formatted as markdown" + ) + speaker_transcripts: list[str] = Field( + ..., + description="2-4 short sentences a presenter would say while this slide is shown", + ) + background_explanation: str = Field( + ..., + description="Emotional mood and color direction for this slide", + ) + + +class PresentationSlides(BaseModel): + """Represents the full set of parsed slides from the LLM.""" + + slides: list[SlideContent] = Field( + ..., description="Ordered array of presentation slides" + ) + + +class SlideAudioResult(BaseModel): + """Audio generation result for a single slide.""" + + slide_number: int + audio_file: str = Field(..., description="Path to the per-slide audio file") + duration_seconds: float = Field(..., description="Audio duration in seconds") + duration_in_frames: int = Field( + ..., description="Audio duration in frames (at 30fps)" + ) + + 
+class SlideSceneCode(BaseModel): + """Generated Remotion component code for a single slide.""" + + slide_number: int + code: str = Field( + ..., description="Raw Remotion React component source code for this slide" + ) + title: str = Field(..., description="Short title for the composition") + + +@dataclass +class State: + """State for the video presentation agent graph. + + Pipeline: parse slides → generate per-slide TTS audio → generate per-slide Remotion code + The frontend receives the slides + code + audio and handles compilation/rendering. + """ + + db_session: AsyncSession + source_content: str + + slides: list[SlideContent] | None = None + slide_audio_results: list[SlideAudioResult] | None = None + slide_scene_codes: list[SlideSceneCode] | None = None diff --git a/surfsense_backend/app/agents/video_presentation/utils.py b/surfsense_backend/app/agents/video_presentation/utils.py new file mode 100644 index 00000000..58909e10 --- /dev/null +++ b/surfsense_backend/app/agents/video_presentation/utils.py @@ -0,0 +1,30 @@ +def get_voice_for_provider(provider: str, speaker_id: int = 0) -> dict | str: + """ + Get the appropriate voice configuration based on the TTS provider. + + Currently single-speaker only (speaker_id=0). Multi-speaker support + will be added in a future iteration. 
+ + Args: + provider: The TTS provider (e.g., "openai/tts-1", "vertex_ai/test") + speaker_id: The ID of the speaker (default 0, single speaker for now) + + Returns: + Voice configuration - string for OpenAI, dict for Vertex AI + """ + if provider == "local/kokoro": + return "af_heart" + + provider_type = ( + provider.split("/")[0].lower() if "/" in provider else provider.lower() + ) + + voices = { + "openai": "alloy", + "vertex_ai": { + "languageCode": "en-US", + "name": "en-US-Studio-O", + }, + "azure": "alloy", + } + return voices.get(provider_type, {}) diff --git a/surfsense_backend/app/app.py b/surfsense_backend/app/app.py index 6c6b12e3..bba2f1f3 100644 --- a/surfsense_backend/app/app.py +++ b/surfsense_backend/app/app.py @@ -341,7 +341,7 @@ if config.NEXT_FRONTEND_URL: allowed_origins.append(www_url) allowed_origins.extend( - [ # For local development and desktop app + [ # For local development and desktop app "http://localhost:3000", "http://127.0.0.1:3000", ] diff --git a/surfsense_backend/app/celery_app.py b/surfsense_backend/app/celery_app.py index 62414775..69e11774 100644 --- a/surfsense_backend/app/celery_app.py +++ b/surfsense_backend/app/celery_app.py @@ -77,6 +77,7 @@ celery_app = Celery( include=[ "app.tasks.celery_tasks.document_tasks", "app.tasks.celery_tasks.podcast_tasks", + "app.tasks.celery_tasks.video_presentation_tasks", "app.tasks.celery_tasks.connector_tasks", "app.tasks.celery_tasks.schedule_checker_task", "app.tasks.celery_tasks.document_reindex_tasks", diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 95ae8e72..2ce48c16 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -103,6 +103,13 @@ class PodcastStatus(StrEnum): FAILED = "failed" +class VideoPresentationStatus(StrEnum): + PENDING = "pending" + GENERATING = "generating" + READY = "ready" + FAILED = "failed" + + class DocumentStatus: """ Helper class for document processing status (stored as JSONB). 
@@ -337,6 +344,12 @@ class Permission(StrEnum): PODCASTS_UPDATE = "podcasts:update" PODCASTS_DELETE = "podcasts:delete" + # Video Presentations + VIDEO_PRESENTATIONS_CREATE = "video_presentations:create" + VIDEO_PRESENTATIONS_READ = "video_presentations:read" + VIDEO_PRESENTATIONS_UPDATE = "video_presentations:update" + VIDEO_PRESENTATIONS_DELETE = "video_presentations:delete" + # Image Generations IMAGE_GENERATIONS_CREATE = "image_generations:create" IMAGE_GENERATIONS_READ = "image_generations:read" @@ -403,6 +416,10 @@ DEFAULT_ROLE_PERMISSIONS = { Permission.PODCASTS_CREATE.value, Permission.PODCASTS_READ.value, Permission.PODCASTS_UPDATE.value, + # Video Presentations (no delete) + Permission.VIDEO_PRESENTATIONS_CREATE.value, + Permission.VIDEO_PRESENTATIONS_READ.value, + Permission.VIDEO_PRESENTATIONS_UPDATE.value, # Image Generations (create and read, no delete) Permission.IMAGE_GENERATIONS_CREATE.value, Permission.IMAGE_GENERATIONS_READ.value, @@ -435,6 +452,8 @@ DEFAULT_ROLE_PERMISSIONS = { Permission.LLM_CONFIGS_READ.value, # Podcasts (read only) Permission.PODCASTS_READ.value, + # Video Presentations (read only) + Permission.VIDEO_PRESENTATIONS_READ.value, # Image Generations (read only) Permission.IMAGE_GENERATIONS_READ.value, # Connectors (read only) @@ -1044,6 +1063,46 @@ class Podcast(BaseModel, TimestampMixin): thread = relationship("NewChatThread") +class VideoPresentation(BaseModel, TimestampMixin): + """Video presentation model for storing AI-generated video presentations. + + The slides JSONB stores per-slide data including Remotion component code, + audio file paths, and durations. The frontend compiles the code and renders + the video using Remotion Player. 
+ """ + + __tablename__ = "video_presentations" + + title = Column(String(500), nullable=False) + slides = Column(JSONB, nullable=True) + scene_codes = Column(JSONB, nullable=True) + status = Column( + SQLAlchemyEnum( + VideoPresentationStatus, + name="video_presentation_status", + create_type=False, + values_callable=lambda x: [e.value for e in x], + ), + nullable=False, + default=VideoPresentationStatus.READY, + server_default="ready", + index=True, + ) + + search_space_id = Column( + Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False + ) + search_space = relationship("SearchSpace", back_populates="video_presentations") + + thread_id = Column( + Integer, + ForeignKey("new_chat_threads.id", ondelete="SET NULL"), + nullable=True, + index=True, + ) + thread = relationship("NewChatThread") + + class Report(BaseModel, TimestampMixin): """Report model for storing generated Markdown reports.""" @@ -1228,6 +1287,12 @@ class SearchSpace(BaseModel, TimestampMixin): order_by="Podcast.id.desc()", cascade="all, delete-orphan", ) + video_presentations = relationship( + "VideoPresentation", + back_populates="search_space", + order_by="VideoPresentation.id.desc()", + cascade="all, delete-orphan", + ) reports = relationship( "Report", back_populates="search_space", diff --git a/surfsense_backend/app/routes/__init__.py b/surfsense_backend/app/routes/__init__.py index d7df2182..66471b0e 100644 --- a/surfsense_backend/app/routes/__init__.py +++ b/surfsense_backend/app/routes/__init__.py @@ -42,6 +42,7 @@ from .search_spaces_routes import router as search_spaces_router from .slack_add_connector_route import router as slack_add_connector_router from .surfsense_docs_routes import router as surfsense_docs_router from .teams_add_connector_route import router as teams_add_connector_router +from .video_presentations_routes import router as video_presentations_router from .youtube_routes import router as youtube_router router = APIRouter() @@ -55,6 +56,9 @@ 
router.include_router(new_chat_router) # Chat with assistant-ui persistence router.include_router(sandbox_router) # Sandbox file downloads (Daytona) router.include_router(chat_comments_router) router.include_router(podcasts_router) # Podcast task status and audio +router.include_router( + video_presentations_router +) # Video presentation status and streaming router.include_router(reports_router) # Report CRUD and multi-format export router.include_router(image_generation_router) # Image generation via litellm router.include_router(search_source_connectors_router) diff --git a/surfsense_backend/app/routes/video_presentations_routes.py b/surfsense_backend/app/routes/video_presentations_routes.py new file mode 100644 index 00000000..ed694b9b --- /dev/null +++ b/surfsense_backend/app/routes/video_presentations_routes.py @@ -0,0 +1,242 @@ +""" +Video presentation routes for CRUD operations and per-slide audio streaming. + +These routes support the video presentation generation feature in new-chat. +Frontend polls GET /video-presentations/{id} to check status field. +When ready, the slides JSONB contains per-slide Remotion code and audio file paths. +The frontend compiles the Remotion code via Babel and renders with Remotion Player. 
+""" + +import os +from pathlib import Path + +from fastapi import APIRouter, Depends, HTTPException +from fastapi.responses import StreamingResponse +from sqlalchemy import select +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import ( + Permission, + SearchSpace, + SearchSpaceMembership, + User, + VideoPresentation, + get_async_session, +) +from app.schemas import VideoPresentationRead +from app.users import current_active_user +from app.utils.rbac import check_permission + +router = APIRouter() + + +@router.get("/video-presentations", response_model=list[VideoPresentationRead]) +async def read_video_presentations( + skip: int = 0, + limit: int = 100, + search_space_id: int | None = None, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """ + List video presentations the user has access to. + Requires VIDEO_PRESENTATIONS_READ permission for the search space(s). + """ + if skip < 0 or limit < 1: + raise HTTPException(status_code=400, detail="Invalid pagination parameters") + try: + if search_space_id is not None: + await check_permission( + session, + user, + search_space_id, + Permission.VIDEO_PRESENTATIONS_READ.value, + "You don't have permission to read video presentations in this search space", + ) + result = await session.execute( + select(VideoPresentation) + .filter(VideoPresentation.search_space_id == search_space_id) + .offset(skip) + .limit(limit) + ) + else: + result = await session.execute( + select(VideoPresentation) + .join(SearchSpace) + .join(SearchSpaceMembership) + .filter(SearchSpaceMembership.user_id == user.id) + .offset(skip) + .limit(limit) + ) + return [ + VideoPresentationRead.from_orm_with_slides(vp) + for vp in result.scalars().all() + ] + except HTTPException: + raise + except SQLAlchemyError: + raise HTTPException( + status_code=500, + detail="Database error occurred while fetching video presentations", + ) from None + + 
+@router.get( + "/video-presentations/{video_presentation_id}", + response_model=VideoPresentationRead, +) +async def read_video_presentation( + video_presentation_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """ + Get a specific video presentation by ID. + Requires authentication with VIDEO_PRESENTATIONS_READ permission. + + When status is "ready", the response includes: + - slides: parsed slide data with per-slide audio_url and durations + - scene_codes: Remotion component source code per slide + """ + try: + result = await session.execute( + select(VideoPresentation).filter( + VideoPresentation.id == video_presentation_id + ) + ) + video_pres = result.scalars().first() + + if not video_pres: + raise HTTPException(status_code=404, detail="Video presentation not found") + + await check_permission( + session, + user, + video_pres.search_space_id, + Permission.VIDEO_PRESENTATIONS_READ.value, + "You don't have permission to read video presentations in this search space", + ) + + return VideoPresentationRead.from_orm_with_slides(video_pres) + except HTTPException as he: + raise he + except SQLAlchemyError: + raise HTTPException( + status_code=500, + detail="Database error occurred while fetching video presentation", + ) from None + + +@router.delete("/video-presentations/{video_presentation_id}", response_model=dict) +async def delete_video_presentation( + video_presentation_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """ + Delete a video presentation. + Requires VIDEO_PRESENTATIONS_DELETE permission for the search space. 
+ """ + try: + result = await session.execute( + select(VideoPresentation).filter( + VideoPresentation.id == video_presentation_id + ) + ) + db_video_pres = result.scalars().first() + + if not db_video_pres: + raise HTTPException(status_code=404, detail="Video presentation not found") + + await check_permission( + session, + user, + db_video_pres.search_space_id, + Permission.VIDEO_PRESENTATIONS_DELETE.value, + "You don't have permission to delete video presentations in this search space", + ) + + await session.delete(db_video_pres) + await session.commit() + return {"message": "Video presentation deleted successfully"} + except HTTPException as he: + raise he + except SQLAlchemyError: + await session.rollback() + raise HTTPException( + status_code=500, + detail="Database error occurred while deleting video presentation", + ) from None + + +@router.get("/video-presentations/{video_presentation_id}/slides/{slide_number}/audio") +async def stream_slide_audio( + video_presentation_id: int, + slide_number: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """ + Stream the audio file for a specific slide in a video presentation. + The slide_number is 1-based. Audio path is read from the slides JSONB. 
+ """ + try: + result = await session.execute( + select(VideoPresentation).filter( + VideoPresentation.id == video_presentation_id + ) + ) + video_pres = result.scalars().first() + + if not video_pres: + raise HTTPException(status_code=404, detail="Video presentation not found") + + await check_permission( + session, + user, + video_pres.search_space_id, + Permission.VIDEO_PRESENTATIONS_READ.value, + "You don't have permission to access video presentations in this search space", + ) + + slides = video_pres.slides or [] + slide_data = None + for s in slides: + if s.get("slide_number") == slide_number: + slide_data = s + break + + if not slide_data: + raise HTTPException( + status_code=404, + detail=f"Slide {slide_number} not found", + ) + + file_path = slide_data.get("audio_file") + if not file_path or not os.path.isfile(file_path): + raise HTTPException(status_code=404, detail="Slide audio file not found") + + ext = Path(file_path).suffix.lower() + media_type = "audio/wav" if ext == ".wav" else "audio/mpeg" + + def iterfile(): + with open(file_path, mode="rb") as file_like: + yield from file_like + + return StreamingResponse( + iterfile(), + media_type=media_type, + headers={ + "Accept-Ranges": "bytes", + "Content-Disposition": f"inline; filename={Path(file_path).name}", + }, + ) + + except HTTPException as he: + raise he + except Exception as e: + raise HTTPException( + status_code=500, + detail=f"Error streaming slide audio: {e!s}", + ) from e diff --git a/surfsense_backend/app/schemas/__init__.py b/surfsense_backend/app/schemas/__init__.py index 7e3ba193..11d3bfc0 100644 --- a/surfsense_backend/app/schemas/__init__.py +++ b/surfsense_backend/app/schemas/__init__.py @@ -101,6 +101,12 @@ from .search_space import ( SearchSpaceWithStats, ) from .users import UserCreate, UserRead, UserUpdate +from .video_presentations import ( + VideoPresentationBase, + VideoPresentationCreate, + VideoPresentationRead, + VideoPresentationUpdate, +) __all__ = [ # Chat schemas 
(assistant-ui integration) @@ -220,4 +226,9 @@ __all__ = [ "UserRead", "UserSearchSpaceAccess", "UserUpdate", + # Video Presentation schemas + "VideoPresentationBase", + "VideoPresentationCreate", + "VideoPresentationRead", + "VideoPresentationUpdate", ] diff --git a/surfsense_backend/app/schemas/video_presentations.py b/surfsense_backend/app/schemas/video_presentations.py new file mode 100644 index 00000000..ec29147e --- /dev/null +++ b/surfsense_backend/app/schemas/video_presentations.py @@ -0,0 +1,103 @@ +"""Video presentation schemas for API responses.""" + +from datetime import datetime +from enum import StrEnum +from typing import Any + +from pydantic import BaseModel + + +class VideoPresentationStatusEnum(StrEnum): + PENDING = "pending" + GENERATING = "generating" + READY = "ready" + FAILED = "failed" + + +class VideoPresentationBase(BaseModel): + """Base video presentation schema.""" + + title: str + slides: list[dict[str, Any]] | None = None + scene_codes: list[dict[str, Any]] | None = None + search_space_id: int + + +class VideoPresentationCreate(VideoPresentationBase): + """Schema for creating a video presentation.""" + + pass + + +class VideoPresentationUpdate(BaseModel): + """Schema for updating a video presentation.""" + + title: str | None = None + slides: list[dict[str, Any]] | None = None + scene_codes: list[dict[str, Any]] | None = None + + +class VideoPresentationRead(VideoPresentationBase): + """Schema for reading a video presentation.""" + + id: int + status: VideoPresentationStatusEnum = VideoPresentationStatusEnum.READY + created_at: datetime + slide_count: int | None = None + + class Config: + from_attributes = True + + @classmethod + def from_orm_with_slides(cls, obj): + """Create VideoPresentationRead with slide_count computed. + + Replaces raw server file paths in `audio_file` with API streaming + URLs so the frontend can use them directly in Remotion