feat: remove system pandoc and Typst CLI dependencies (bundle them via pypandoc_binary and typst-py)

2026-05-05 05:42:39 +02:00 · 2026-02-13 01:34:43 -08:00 · 2026-02-13 01:34:43 -08:00 · a8c1aa28c0
commit a8c1aa28c0
parent 3a7a27f3ae
11 changed files with 3354 additions and 3252 deletions
--- a/surfsense_backend/app/agents/new_chat/system_prompt.py
+++ b/surfsense_backend/app/agents/new_chat/system_prompt.py
@ -94,6 +94,8 @@ You have access to the following tools:

 3. generate_report: Generate a structured Markdown report from provided content.
  - Use this when the user asks to create, generate, write, produce, draft, or summarize into a report-style deliverable.
+  - DECISION RULE (HIGH PRIORITY): If the user asks for a report in any form, call `generate_report` instead of writing the full report directly in chat.
+  - Only skip `generate_report` if the user explicitly asks for chat-only output (e.g., "just answer in chat", "no report card", "don't generate a report").
  - Trigger classes include:
    * Direct trigger words: report, document, memo, letter, template
    * Creation-intent phrases: "write a document/report/post/article"
@ -108,6 +110,7 @@ You have access to the following tools:
    * "write a report/document", "draft a report"
    * "create an executive summary", "make a briefing note", "write a one-pager"
    * "write a blog post", "write an article", "create a comprehensive guide"
+    * "create a small report", "write a short report", "make a quick report", "brief report for class"
  - IMPORTANT FORMAT RULE: Reports are ALWAYS generated in Markdown.
  - Args:
    - topic: The main topic or title of the report
@ -121,7 +124,9 @@ You have access to the following tools:
  - Returns: A dictionary with status "ready" or "failed", report_id, title, and word_count.
  - The report is generated immediately in Markdown and displayed inline in the chat.
  - Export/download formats (e.g., PDF/DOCX) are produced from the generated Markdown report.
-  - IMPORTANT: Always search the knowledge base first to gather comprehensive source_content before generating a report.
+  - SOURCE-COLLECTION RULE:
+    * If the user already provided enough source material (current chat content, uploaded files, pasted text, or a summarized video/article), generate the report directly from that.
+    * Use search_knowledge_base first when additional context is needed or the user asks for information beyond what is already available in the conversation.
  - AFTER CALLING THIS TOOL: Do NOT repeat, summarize, or reproduce the report content in the chat. The report is already displayed as an interactive card that the user can open, read, copy, and export. Simply confirm that the report was generated (e.g., "I've generated your report on [topic]. You can view the Markdown report now, and export to PDF/DOCX from the card."). NEVER write out the report text in the chat.

 4. link_preview: Fetch metadata for a URL to display a rich preview card.
--- a/surfsense_backend/app/agents/new_chat/tools/report.py
+++ b/surfsense_backend/app/agents/new_chat/tools/report.py
@ -58,6 +58,23 @@ Write the report now:
 """


+def _strip_wrapping_code_fences(text: str) -> str:
+    """Remove wrapping code fences that LLMs often add around Markdown output.
+
+    Handles patterns like:
+        ```markdown\\n...content...\\n```
+        ```md\\n...content...\\n```
+        ```\\n...content...\\n```
+    """
+    stripped = text.strip()
+    # Match opening fence with optional language tag (markdown, md, or bare)
+    m = re.match(r"^```(?:markdown|md)?\s*\n", stripped)
+    if m and stripped.endswith("```"):
+        stripped = stripped[m.end() :]  # remove opening fence
+        stripped = stripped[:-3].rstrip()  # remove closing fence
+    return stripped
+
+
 def _extract_metadata(content: str) -> dict[str, Any]:
    """Extract metadata from generated Markdown content."""
    # Count section headings
@ -110,6 +127,11 @@ def create_generate_report_tool(

        Use this tool when the user asks to create, generate, write, produce, draft,
        or summarize into a report-style deliverable.
+        HIGH-PRIORITY DECISION RULE:
+        - If the user asks for a report in any form,
+          call this tool rather than writing the full report directly in chat.
+        - Only skip this tool when the user explicitly requests chat-only output or
+          says they do not want a generated report card.
        Trigger classes include:
        - Direct trigger words: report, document, memo, letter, template
        - Creation-intent phrases: "write a document/report/post/article"
@ -136,11 +158,21 @@ def create_generate_report_tool(
        - "Write an article"
        - "Create a comprehensive guide"
        - "Prepare a report"
+        - "Create a small report"
+        - "Write a short report"
+        - "Make a quick report"
+        - "Brief report for class"

        FORMAT/EXPORT RULE:
        - Always generate the report content in Markdown.
        - If the user requests DOCX/Word/PDF or another file format, export from
          the generated Markdown report.
+        SOURCE-COLLECTION RULE:
+        - If enough source material is already present in the conversation (chat
+          history, pasted text, uploaded files, or a provided video/article summary),
+          generate directly from that source_content.
+        - Use knowledge-base search first only when extra context is needed beyond
+          what the user already provided.

        VERSIONING — parent_report_id:
        - Set parent_report_id when the user wants to MODIFY, REVISE, IMPROVE,
@ -298,6 +330,20 @@ def create_generate_report_tool(
                    "title": topic,
                }

+            # LLMs often wrap output in ```markdown ... ``` fences — strip them
+            # so the stored content is clean Markdown.
+            report_content = _strip_wrapping_code_fences(report_content)
+
+            if not report_content:
+                error_msg = "LLM returned empty or invalid content"
+                report_id = await _save_failed_report(error_msg)
+                return {
+                    "status": "failed",
+                    "error": error_msg,
+                    "report_id": report_id,
+                    "title": topic,
+                }
+
            # Extract metadata (includes "status": "ready")
            metadata = _extract_metadata(report_content)

--- a/surfsense_backend/app/routes/reports_routes.py
+++ b/surfsense_backend/app/routes/reports_routes.py
@ -3,7 +3,8 @@ Report routes for read, export (PDF/DOCX), and delete operations.

 No create or update endpoints here — reports are generated inline by the
 agent tool during chat and stored as Markdown in the database.
-Export to PDF/DOCX is on-demand via pypandoc (PDF uses Typst as the engine).
+Export to PDF/DOCX is on-demand — PDF uses pypandoc (Markdown→Typst) + typst-py
+(Typst→PDF); DOCX uses pypandoc directly.

 Authorization: lightweight search-space membership checks (no granular RBAC)
 since reports are chat-generated artifacts, not standalone managed resources.
@ -13,10 +14,12 @@ import asyncio
 import io
 import logging
 import os
+import re
 import tempfile
 from enum import Enum

 import pypandoc
+import typst
 from fastapi import APIRouter, Depends, HTTPException, Query
 from fastapi.responses import StreamingResponse
 from sqlalchemy import select
@ -51,6 +54,17 @@ class ExportFormat(str, Enum):
 # Helpers
 # ---------------------------------------------------------------------------

+_CODE_FENCE_RE = re.compile(r"^```(?:markdown|md)?\s*\n", re.MULTILINE)
+
+
+def _strip_wrapping_code_fences(text: str) -> str:
+    """Remove wrapping code fences (```markdown...```) that LLMs often add."""
+    stripped = text.strip()
+    m = _CODE_FENCE_RE.match(stripped)
+    if m and stripped.endswith("```"):
+        stripped = stripped[m.end() : -3].rstrip()
+    return stripped
+

 async def _get_report_with_access(
    report_id: int,
@ -209,37 +223,64 @@ async def export_report(
                status_code=400, detail="Report has no content to export"
            )

-        # Convert Markdown to the requested format via pypandoc.
-        # pypandoc spawns a pandoc subprocess (blocking), so we run the
-        # entire convert → read → cleanup pipeline in a thread executor
-        # to avoid blocking the async event loop on any file I/O.
+        # Strip wrapping code fences that LLMs sometimes add around Markdown.
+        # Without this, pandoc treats the entire content as a code block.
+        markdown_content = _strip_wrapping_code_fences(report.content)
+
+        # Convert Markdown to the requested format.
        #
-        # PDF uses Typst as the rendering engine — Typst has built-in
-        # professional styling for tables, headings, code blocks, etc.,
-        # so no CSS injection is needed.
+        # DOCX: pypandoc (pandoc) handles the full conversion directly.
        #
-        # Use "gfm" because LLM output uses GFM-style pipe tables that
-        # pandoc's stricter default "markdown" format may fail to parse.
-        extra_args = ["--standalone"]
-        if format == ExportFormat.PDF:
-            extra_args.append("--pdf-engine=typst")
+        # PDF: two-step pipeline — pypandoc converts Markdown → Typst markup,
+        # then the `typst` Python library compiles Typst → PDF.  This avoids
+        # requiring the Typst CLI on the system PATH; the typst pip package
+        # bundles the compiler as a native extension.  Typst produces
+        # professional styling for tables, headings, code blocks, etc.
+        #
+        # Use "gfm" as the input format because LLM output uses GFM-style
+        # pipe tables that pandoc's stricter default "markdown" may mangle.

        def _convert_and_read() -> bytes:
-            """Run all blocking I/O (tempfile, pandoc, file read, cleanup) in a thread."""
-            fd, tmp_path = tempfile.mkstemp(suffix=f".{format.value}")
-            os.close(fd)
-            try:
-                pypandoc.convert_text(
-                    report.content,
-                    format.value,
+            """Run all blocking I/O (tempfile, pandoc/typst, file read, cleanup) in a thread."""
+            if format == ExportFormat.PDF:
+                # Step 1: Markdown → Typst markup via pandoc.
+                # We must set mainfont / monofont so the generated template's
+                # `font` parameter is non-empty; without it pandoc emits
+                # `font: ()` which makes Typst error with
+                # "font fallback list must not be empty".
+                # We use fonts that ship embedded inside typst-py so this
+                # works even on systems with no fonts installed.
+                typst_markup: str = pypandoc.convert_text(
+                    markdown_content,
+                    "typst",
                    format="gfm",
-                    extra_args=extra_args,
-                    outputfile=tmp_path,
+                    extra_args=[
+                        "--standalone",
+                        "-V",
+                        "mainfont:Libertinus Serif",
+                        "-V",
+                        "monofont:DejaVu Sans Mono",
+                    ],
                )
-                with open(tmp_path, "rb") as f:
-                    return f.read()
-            finally:
-                os.unlink(tmp_path)
+                # Step 2: Typst markup → PDF via typst Python library
+                pdf_bytes: bytes = typst.compile(typst_markup.encode("utf-8"))
+                return pdf_bytes
+            else:
+                # DOCX: let pandoc handle the full conversion
+                fd, tmp_path = tempfile.mkstemp(suffix=f".{format.value}")
+                os.close(fd)
+                try:
+                    pypandoc.convert_text(
+                        markdown_content,
+                        format.value,
+                        format="gfm",
+                        extra_args=["--standalone"],
+                        outputfile=tmp_path,
+                    )
+                    with open(tmp_path, "rb") as f:
+                        return f.read()
+                finally:
+                    os.unlink(tmp_path)

        loop = asyncio.get_running_loop()
        output = await loop.run_in_executor(None, _convert_and_read)
--- a/surfsense_backend/pyproject.toml
+++ b/surfsense_backend/pyproject.toml
@ -63,7 +63,8 @@ dependencies = [
    "unstructured-client>=0.42.3",
    "langchain-unstructured>=1.0.1",
    "slowapi>=0.1.9",
-    "pypandoc>=1.16.2",
+    "pypandoc_binary>=1.16.2",
+    "typst>=0.14.0",
 ]

 [dependency-groups]
--- a/surfsense_backend/uv.lock
+++ b/surfsense_backend/uv.lock
--- a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx
@ -28,10 +28,12 @@ import {
 	// extractWriteTodosFromContent,
 	hydratePlanStateAtom,
 } from "@/atoms/chat/plan-state.atom";
+import { closeReportPanelAtom } from "@/atoms/chat/report-panel.atom";
 import { membersAtom } from "@/atoms/members/members-query.atoms";
 import { currentUserAtom } from "@/atoms/user/user-query.atoms";
 import { Thread } from "@/components/assistant-ui/thread";
 import { ChatHeader } from "@/components/new-chat/chat-header";
+import { ReportPanel } from "@/components/report-panel/report-panel";
 import type { ThinkingStep } from "@/components/tool-ui/deepagent-thinking";
 import { DisplayImageToolUI } from "@/components/tool-ui/display-image";
 import { GeneratePodcastToolUI } from "@/components/tool-ui/generate-podcast";
@ -39,8 +41,6 @@ import { GenerateReportToolUI } from "@/components/tool-ui/generate-report";
 import { LinkPreviewToolUI } from "@/components/tool-ui/link-preview";
 import { ScrapeWebpageToolUI } from "@/components/tool-ui/scrape-webpage";
 import { RecallMemoryToolUI, SaveMemoryToolUI } from "@/components/tool-ui/user-memory";
-import { ReportPanel } from "@/components/report-panel/report-panel";
-import { closeReportPanelAtom } from "@/atoms/chat/report-panel.atom";
 import { Skeleton } from "@/components/ui/skeleton";
 import { useChatSessionStateSync } from "@/hooks/use-chat-session-state";
 import { useMessagesElectric } from "@/hooks/use-messages-electric";
--- a/surfsense_web/components/markdown-viewer.tsx
+++ b/surfsense_web/components/markdown-viewer.tsx
@ -1,7 +1,7 @@
-import Image from "next/image";
-import { Streamdown, type StreamdownProps } from "streamdown";
 import { createCodePlugin } from "@streamdown/code";
 import { createMathPlugin } from "@streamdown/math";
+import Image from "next/image";
+import { Streamdown, type StreamdownProps } from "streamdown";
 import "katex/dist/katex.min.css";
 import { cn } from "@/lib/utils";

--- a/surfsense_web/components/public-chat/public-chat-view.tsx
+++ b/surfsense_web/components/public-chat/public-chat-view.tsx
@ -2,12 +2,12 @@

 import { AssistantRuntimeProvider } from "@assistant-ui/react";
 import { Navbar } from "@/components/homepage/navbar";
+import { ReportPanel } from "@/components/report-panel/report-panel";
 import { DisplayImageToolUI } from "@/components/tool-ui/display-image";
 import { GeneratePodcastToolUI } from "@/components/tool-ui/generate-podcast";
 import { GenerateReportToolUI } from "@/components/tool-ui/generate-report";
 import { LinkPreviewToolUI } from "@/components/tool-ui/link-preview";
 import { ScrapeWebpageToolUI } from "@/components/tool-ui/scrape-webpage";
-import { ReportPanel } from "@/components/report-panel/report-panel";
 import { Spinner } from "@/components/ui/spinner";
 import { usePublicChat } from "@/hooks/use-public-chat";
 import { usePublicChatRuntime } from "@/hooks/use-public-chat-runtime";
--- a/surfsense_web/components/report-panel/report-panel.tsx
+++ b/surfsense_web/components/report-panel/report-panel.tsx
@ -5,8 +5,8 @@ import { ChevronDownIcon, XIcon } from "lucide-react";
 import { useCallback, useEffect, useRef, useState } from "react";
 import { z } from "zod";
 import { closeReportPanelAtom, reportPanelAtom } from "@/atoms/chat/report-panel.atom";
+import { MarkdownViewer } from "@/components/markdown-viewer";
 import { Button } from "@/components/ui/button";
-import { Spinner } from "@/components/ui/spinner";
 import { Drawer, DrawerContent, DrawerHandle } from "@/components/ui/drawer";
 import {
 	DropdownMenu,
@ -14,7 +14,7 @@ import {
 	DropdownMenuItem,
 	DropdownMenuTrigger,
 } from "@/components/ui/dropdown-menu";
-import { MarkdownViewer } from "@/components/markdown-viewer";
+import { Spinner } from "@/components/ui/spinner";
 import { useMediaQuery } from "@/hooks/use-media-query";
 import { baseApiService } from "@/lib/apis/base-api.service";
 import { authenticatedFetch } from "@/lib/auth-utils";
--- a/surfsense_web/components/tool-ui/article/index.tsx
+++ b/surfsense_web/components/tool-ui/article/index.tsx
@ -8,8 +8,8 @@ import {
 	FileTextIcon,
 	UserIcon,
 } from "lucide-react";
-import { Component, type ReactNode, useCallback, useState } from "react";
 import Image from "next/image";
+import { Component, type ReactNode, useCallback, useState } from "react";
 import { z } from "zod";
 import { Card, CardContent } from "@/components/ui/card";
 import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip";
--- a/surfsense_web/components/tool-ui/generate-report.tsx
+++ b/surfsense_web/components/tool-ui/generate-report.tsx
@ -6,8 +6,8 @@ import { Dot, FileTextIcon } from "lucide-react";
 import { useParams, usePathname } from "next/navigation";
 import { useEffect, useState } from "react";
 import { z } from "zod";
-import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { openReportPanelAtom, reportPanelAtom } from "@/atoms/chat/report-panel.atom";
+import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { baseApiService } from "@/lib/apis/base-api.service";

 /**