mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-12 09:12:40 +02:00
Merge pull request #816 from AnishSarkar22/feat/report-artifact
fix: fix docker setup of report artifact & some improvements
This commit is contained in:
commit
f489f2c030
12 changed files with 3501 additions and 3338 deletions
|
|
@ -129,16 +129,6 @@ RUN ARCH=$(dpkg --print-architecture) && \
|
||||||
dpkg -i /tmp/pandoc.deb && \
|
dpkg -i /tmp/pandoc.deb && \
|
||||||
rm /tmp/pandoc.deb
|
rm /tmp/pandoc.deb
|
||||||
|
|
||||||
# Install Typst for PDF rendering (Typst has built-in professional styling
|
|
||||||
# for tables, headings, code blocks, etc., no CSS needed).
|
|
||||||
RUN ARCH=$(dpkg --print-architecture) && \
|
|
||||||
if [ "$ARCH" = "amd64" ]; then TYPST_ARCH="x86_64-unknown-linux-musl"; \
|
|
||||||
else TYPST_ARCH="aarch64-unknown-linux-musl"; fi && \
|
|
||||||
wget -qO /tmp/typst.tar.xz "https://github.com/typst/typst/releases/download/v0.14.2/typst-${TYPST_ARCH}.tar.xz" && \
|
|
||||||
tar -xf /tmp/typst.tar.xz -C /tmp && \
|
|
||||||
cp /tmp/typst-*/typst /usr/local/bin/typst && \
|
|
||||||
rm -rf /tmp/typst* && \
|
|
||||||
typst --version
|
|
||||||
|
|
||||||
# Install Node.js 20.x (for running frontend)
|
# Install Node.js 20.x (for running frontend)
|
||||||
RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
|
RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
|
||||||
|
|
|
||||||
|
|
@ -30,10 +30,6 @@ RUN ARCH=$(dpkg --print-architecture) && \
|
||||||
dpkg -i /tmp/pandoc.deb && \
|
dpkg -i /tmp/pandoc.deb && \
|
||||||
rm /tmp/pandoc.deb
|
rm /tmp/pandoc.deb
|
||||||
|
|
||||||
# NOTE: Typst CLI is NOT installed here. PDF rendering uses the `typst` Python
|
|
||||||
# library (pip package) which bundles the compiler as a native extension.
|
|
||||||
# This avoids architecture-specific binary downloads and works cross-platform.
|
|
||||||
|
|
||||||
# Update certificates and install SSL tools
|
# Update certificates and install SSL tools
|
||||||
RUN update-ca-certificates
|
RUN update-ca-certificates
|
||||||
RUN pip install --upgrade certifi pip-system-certs
|
RUN pip install --upgrade certifi pip-system-certs
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,8 @@ You are SurfSense, a reasoning and acting AI agent designed to answer user quest
|
||||||
|
|
||||||
Today's date (UTC): {resolved_today}
|
Today's date (UTC): {resolved_today}
|
||||||
|
|
||||||
|
When writing mathematical formulas or equations, ALWAYS use LaTeX notation. NEVER use backtick code spans or Unicode symbols for math.
|
||||||
|
|
||||||
</system_instruction>
|
</system_instruction>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -33,6 +35,8 @@ In this team thread, each message is prefixed with **[DisplayName of the author]
|
||||||
|
|
||||||
Today's date (UTC): {resolved_today}
|
Today's date (UTC): {resolved_today}
|
||||||
|
|
||||||
|
When writing mathematical formulas or equations, ALWAYS use LaTeX notation. NEVER use backtick code spans or Unicode symbols for math.
|
||||||
|
|
||||||
</system_instruction>
|
</system_instruction>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -124,16 +124,15 @@ BUILTIN_TOOLS: list[ToolDefinition] = [
|
||||||
),
|
),
|
||||||
requires=["search_space_id", "db_session", "thread_id"],
|
requires=["search_space_id", "db_session", "thread_id"],
|
||||||
),
|
),
|
||||||
# Report generation tool (inline, no Celery)
|
# Report generation tool (inline, short-lived sessions for DB ops)
|
||||||
ToolDefinition(
|
ToolDefinition(
|
||||||
name="generate_report",
|
name="generate_report",
|
||||||
description="Generate a structured Markdown report from provided content",
|
description="Generate a structured Markdown report from provided content",
|
||||||
factory=lambda deps: create_generate_report_tool(
|
factory=lambda deps: create_generate_report_tool(
|
||||||
search_space_id=deps["search_space_id"],
|
search_space_id=deps["search_space_id"],
|
||||||
db_session=deps["db_session"],
|
|
||||||
thread_id=deps["thread_id"],
|
thread_id=deps["thread_id"],
|
||||||
),
|
),
|
||||||
requires=["search_space_id", "db_session", "thread_id"],
|
requires=["search_space_id", "thread_id"],
|
||||||
),
|
),
|
||||||
# Link preview tool - fetches Open Graph metadata for URLs
|
# Link preview tool - fetches Open Graph metadata for URLs
|
||||||
ToolDefinition(
|
ToolDefinition(
|
||||||
|
|
|
||||||
|
|
@ -6,18 +6,20 @@ that generates a structured Markdown report inline (no Celery). The LLM is
|
||||||
called within the tool, the result is saved to the database, and the tool
|
called within the tool, the result is saved to the database, and the tool
|
||||||
returns immediately with a ready status.
|
returns immediately with a ready status.
|
||||||
|
|
||||||
This follows the same inline pattern as generate_image and display_image,
|
Uses short-lived database sessions to avoid holding connections during long
|
||||||
NOT the Celery-based podcast pattern.
|
LLM calls (30-120+ seconds). Each DB operation (read config, save report)
|
||||||
|
opens and closes its own session, ensuring no connection is held idle during
|
||||||
|
the LLM API call.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from langchain_core.messages import HumanMessage
|
||||||
from langchain_core.tools import tool
|
from langchain_core.tools import tool
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
|
||||||
|
|
||||||
from app.db import Report
|
from app.db import Report, async_session_maker
|
||||||
from app.services.llm_service import get_document_summary_llm
|
from app.services.llm_service import get_document_summary_llm
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -53,6 +55,7 @@ _REPORT_PROMPT = """You are an expert report writer. Generate a well-structured,
|
||||||
A[Source Code] --> B[Compiler]
|
A[Source Code] --> B[Compiler]
|
||||||
B --> C[Bytecode]
|
B --> C[Bytecode]
|
||||||
```
|
```
|
||||||
|
10. When including mathematical formulas or equations, ALWAYS use LaTeX notation. NEVER use backtick code spans or Unicode symbols for math.
|
||||||
|
|
||||||
Write the report now:
|
Write the report now:
|
||||||
"""
|
"""
|
||||||
|
|
@ -96,7 +99,6 @@ def _extract_metadata(content: str) -> dict[str, Any]:
|
||||||
|
|
||||||
def create_generate_report_tool(
|
def create_generate_report_tool(
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
db_session: AsyncSession,
|
|
||||||
thread_id: int | None = None,
|
thread_id: int | None = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
|
@ -105,9 +107,11 @@ def create_generate_report_tool(
|
||||||
The tool generates a Markdown report inline using the search space's
|
The tool generates a Markdown report inline using the search space's
|
||||||
document summary LLM, saves it to the database, and returns immediately.
|
document summary LLM, saves it to the database, and returns immediately.
|
||||||
|
|
||||||
|
Uses short-lived database sessions for each DB operation so no connection
|
||||||
|
is held during the long LLM API call.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
search_space_id: The user's search space ID
|
search_space_id: The user's search space ID
|
||||||
db_session: Database session for creating the report record
|
|
||||||
thread_id: The chat thread ID for associating the report
|
thread_id: The chat thread ID for associating the report
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
@ -197,14 +201,17 @@ def create_generate_report_tool(
|
||||||
User: "Rewrite the report in a more formal tone" → parent_report_id = <previous report_id>
|
User: "Rewrite the report in a more formal tone" → parent_report_id = <previous report_id>
|
||||||
User: "I want more details about pricing in here" → parent_report_id = <previous report_id>
|
User: "I want more details about pricing in here" → parent_report_id = <previous report_id>
|
||||||
User: "Include more examples" → parent_report_id = <previous report_id>
|
User: "Include more examples" → parent_report_id = <previous report_id>
|
||||||
User: "Can you also cover security in this?" → parent_report_id = <previous report_id>
|
User: "Can you also cover nutrition in this?" → parent_report_id = <previous report_id>
|
||||||
User: "Make it more detailed" → parent_report_id = <previous report_id>
|
User: "Make it more detailed" → parent_report_id = <previous report_id>
|
||||||
User: "I want more about X for in here" → parent_report_id = <previous report_id>
|
User: "Not bad, but expand on the budget section" → parent_report_id = <previous report_id>
|
||||||
|
User: "Also mention the competitor landscape" → parent_report_id = <previous report_id>
|
||||||
|
|
||||||
Examples of when to LEAVE parent_report_id as None:
|
Examples of when to LEAVE parent_report_id as None:
|
||||||
User: "Generate a report on climate change" → parent_report_id = None (new topic)
|
User: "Generate a report on climate change" → parent_report_id = None (new topic)
|
||||||
User: "Write me a report about the budget" → parent_report_id = None (new topic)
|
User: "Write me a report about the budget" → parent_report_id = None (new topic)
|
||||||
User: "Create another report, this time about marketing" → parent_report_id = None
|
User: "Create another report, this time about marketing" → parent_report_id = None
|
||||||
|
User: "Now write one about travel trends in Europe" → parent_report_id = None (new topic despite "now")
|
||||||
|
User: "Do the same kind of report but for the fitness industry" → parent_report_id = None (new topic, different subject)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
topic: A short, concise title for the report (maximum 8 words). Keep it brief and descriptive — e.g. "AI in Healthcare Analysis: A Comprehensive Report" instead of "Comprehensive Analysis of Artificial Intelligence Applications in Modern Healthcare Systems".
|
topic: A short, concise title for the report (maximum 8 words). Keep it brief and descriptive — e.g. "AI in Healthcare Analysis: A Comprehensive Report" instead of "Comprehensive Analysis of Artificial Intelligence Applications in Modern Healthcare Systems".
|
||||||
|
|
@ -225,27 +232,14 @@ def create_generate_report_tool(
|
||||||
- word_count: Number of words in the report
|
- word_count: Number of words in the report
|
||||||
- message: Status message (or "error" field if failed)
|
- message: Status message (or "error" field if failed)
|
||||||
"""
|
"""
|
||||||
# Resolve the parent report and its group (if versioning)
|
# Initialize version tracking variables (used by _save_failed_report closure)
|
||||||
parent_report: Report | None = None
|
parent_report_content: str | None = None
|
||||||
report_group_id: int | None = None
|
report_group_id: int | None = None
|
||||||
|
|
||||||
if parent_report_id:
|
|
||||||
parent_report = await db_session.get(Report, parent_report_id)
|
|
||||||
if parent_report:
|
|
||||||
report_group_id = parent_report.report_group_id
|
|
||||||
logger.info(
|
|
||||||
f"[generate_report] Creating new version from parent {parent_report_id} "
|
|
||||||
f"(group {report_group_id})"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.warning(
|
|
||||||
f"[generate_report] parent_report_id={parent_report_id} not found, "
|
|
||||||
"creating standalone report"
|
|
||||||
)
|
|
||||||
|
|
||||||
async def _save_failed_report(error_msg: str) -> int | None:
|
async def _save_failed_report(error_msg: str) -> int | None:
|
||||||
"""Persist a failed report row so the error is visible later."""
|
"""Persist a failed report row using a short-lived session."""
|
||||||
try:
|
try:
|
||||||
|
async with async_session_maker() as session:
|
||||||
failed_report = Report(
|
failed_report = Report(
|
||||||
title=topic,
|
title=topic,
|
||||||
content=None,
|
content=None,
|
||||||
|
|
@ -258,13 +252,13 @@ def create_generate_report_tool(
|
||||||
thread_id=thread_id,
|
thread_id=thread_id,
|
||||||
report_group_id=report_group_id,
|
report_group_id=report_group_id,
|
||||||
)
|
)
|
||||||
db_session.add(failed_report)
|
session.add(failed_report)
|
||||||
await db_session.commit()
|
await session.commit()
|
||||||
await db_session.refresh(failed_report)
|
await session.refresh(failed_report)
|
||||||
# If this is a new group (v1 failed), set group to self
|
# If this is a new group (v1 failed), set group to self
|
||||||
if not failed_report.report_group_id:
|
if not failed_report.report_group_id:
|
||||||
failed_report.report_group_id = failed_report.id
|
failed_report.report_group_id = failed_report.id
|
||||||
await db_session.commit()
|
await session.commit()
|
||||||
logger.info(
|
logger.info(
|
||||||
f"[generate_report] Saved failed report {failed_report.id}: {error_msg}"
|
f"[generate_report] Saved failed report {failed_report.id}: {error_msg}"
|
||||||
)
|
)
|
||||||
|
|
@ -276,8 +270,28 @@ def create_generate_report_tool(
|
||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Get the LLM instance for this search space
|
# ── Phase 1: READ (short-lived session) ──────────────────────
|
||||||
llm = await get_document_summary_llm(db_session, search_space_id)
|
# Fetch parent report and LLM config, then close the session
|
||||||
|
# so no DB connection is held during the long LLM call.
|
||||||
|
async with async_session_maker() as read_session:
|
||||||
|
if parent_report_id:
|
||||||
|
parent_report = await read_session.get(Report, parent_report_id)
|
||||||
|
if parent_report:
|
||||||
|
report_group_id = parent_report.report_group_id
|
||||||
|
parent_report_content = parent_report.content
|
||||||
|
logger.info(
|
||||||
|
f"[generate_report] Creating new version from parent {parent_report_id} "
|
||||||
|
f"(group {report_group_id})"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.warning(
|
||||||
|
f"[generate_report] parent_report_id={parent_report_id} not found, "
|
||||||
|
"creating standalone report"
|
||||||
|
)
|
||||||
|
|
||||||
|
llm = await get_document_summary_llm(read_session, search_space_id)
|
||||||
|
# read_session closed — connection returned to pool
|
||||||
|
|
||||||
if not llm:
|
if not llm:
|
||||||
error_msg = (
|
error_msg = (
|
||||||
"No LLM configured. Please configure a language model in Settings."
|
"No LLM configured. Please configure a language model in Settings."
|
||||||
|
|
@ -299,11 +313,11 @@ def create_generate_report_tool(
|
||||||
|
|
||||||
# If revising, include previous version content
|
# If revising, include previous version content
|
||||||
previous_version_section = ""
|
previous_version_section = ""
|
||||||
if parent_report and parent_report.content:
|
if parent_report_content:
|
||||||
previous_version_section = (
|
previous_version_section = (
|
||||||
"**Previous Version of This Report (refine this based on the instructions above — "
|
"**Previous Version of This Report (refine this based on the instructions above — "
|
||||||
"preserve structure and quality, apply only the requested changes):**\n\n"
|
"preserve structure and quality, apply only the requested changes):**\n\n"
|
||||||
f"{parent_report.content}"
|
f"{parent_report_content}"
|
||||||
)
|
)
|
||||||
|
|
||||||
prompt = _REPORT_PROMPT.format(
|
prompt = _REPORT_PROMPT.format(
|
||||||
|
|
@ -314,9 +328,7 @@ def create_generate_report_tool(
|
||||||
source_content=source_content[:100000], # Cap source content
|
source_content=source_content[:100000], # Cap source content
|
||||||
)
|
)
|
||||||
|
|
||||||
# Call the LLM inline
|
# ── Phase 2: LLM CALL (no DB connection held) ────────────────
|
||||||
from langchain_core.messages import HumanMessage
|
|
||||||
|
|
||||||
response = await llm.ainvoke([HumanMessage(content=prompt)])
|
response = await llm.ainvoke([HumanMessage(content=prompt)])
|
||||||
report_content = response.content
|
report_content = response.content
|
||||||
|
|
||||||
|
|
@ -347,7 +359,9 @@ def create_generate_report_tool(
|
||||||
# Extract metadata (includes "status": "ready")
|
# Extract metadata (includes "status": "ready")
|
||||||
metadata = _extract_metadata(report_content)
|
metadata = _extract_metadata(report_content)
|
||||||
|
|
||||||
# Save to database
|
# ── Phase 3: WRITE (short-lived session) ─────────────────────
|
||||||
|
# Save the report to the database, then close the session.
|
||||||
|
async with async_session_maker() as write_session:
|
||||||
report = Report(
|
report = Report(
|
||||||
title=topic,
|
title=topic,
|
||||||
content=report_content,
|
content=report_content,
|
||||||
|
|
@ -355,27 +369,31 @@ def create_generate_report_tool(
|
||||||
report_style=report_style,
|
report_style=report_style,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
thread_id=thread_id,
|
thread_id=thread_id,
|
||||||
report_group_id=report_group_id, # None for v1, inherited for v2+
|
report_group_id=report_group_id,
|
||||||
)
|
)
|
||||||
db_session.add(report)
|
write_session.add(report)
|
||||||
await db_session.commit()
|
await write_session.commit()
|
||||||
await db_session.refresh(report)
|
await write_session.refresh(report)
|
||||||
|
|
||||||
# If this is a brand-new report (v1), set report_group_id = own id
|
# If this is a brand-new report (v1), set report_group_id = own id
|
||||||
if not report.report_group_id:
|
if not report.report_group_id:
|
||||||
report.report_group_id = report.id
|
report.report_group_id = report.id
|
||||||
await db_session.commit()
|
await write_session.commit()
|
||||||
|
|
||||||
|
saved_report_id = report.id
|
||||||
|
saved_group_id = report.report_group_id
|
||||||
|
# write_session closed — connection returned to pool
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"[generate_report] Created report {report.id} "
|
f"[generate_report] Created report {saved_report_id} "
|
||||||
f"(group={report.report_group_id}): "
|
f"(group={saved_group_id}): "
|
||||||
f"{metadata.get('word_count', 0)} words, "
|
f"{metadata.get('word_count', 0)} words, "
|
||||||
f"{metadata.get('section_count', 0)} sections"
|
f"{metadata.get('section_count', 0)} sections"
|
||||||
)
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"status": "ready",
|
"status": "ready",
|
||||||
"report_id": report.id,
|
"report_id": saved_report_id,
|
||||||
"title": topic,
|
"title": topic,
|
||||||
"word_count": metadata.get("word_count", 0),
|
"word_count": metadata.get("word_count", 0),
|
||||||
"message": f"Report generated successfully: {topic}",
|
"message": f"Report generated successfully: {topic}",
|
||||||
|
|
|
||||||
|
|
@ -66,6 +66,62 @@ def _strip_wrapping_code_fences(text: str) -> str:
|
||||||
return stripped
|
return stripped
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_latex_delimiters(text: str) -> str:
|
||||||
|
"""Convert all LaTeX math delimiters to dollar-sign form.
|
||||||
|
|
||||||
|
Pandoc's ``tex_math_dollars`` extension (on the ``gfm`` reader) handles
|
||||||
|
``$…$`` and ``$$…$$`` natively. This function converts every other
|
||||||
|
delimiter style that LLMs produce into dollar-sign form so pandoc can
|
||||||
|
parse them as math.
|
||||||
|
|
||||||
|
Supported conversions:
|
||||||
|
\\[…\\] → $$…$$ (display math)
|
||||||
|
\\(…\\) → $…$ (inline math)
|
||||||
|
\\begin{equation}…\\end{equation} → $$…$$ (display math)
|
||||||
|
\\begin{displaymath}…\\end{displaymath}→ $$…$$ (display math)
|
||||||
|
\\begin{math}…\\end{math} → $…$ (inline math)
|
||||||
|
`$$…$$` / `$…$` → strip wrapping backticks
|
||||||
|
"""
|
||||||
|
# 1. Block math: \[...\] → $$...$$
|
||||||
|
text = re.sub(r"\\\[([\s\S]*?)\\\]", lambda m: f"$${m.group(1)}$$", text)
|
||||||
|
# 2. Inline math: \(...\) → $...$
|
||||||
|
text = re.sub(r"\\\(([\s\S]*?)\\\)", lambda m: f"${m.group(1)}$", text)
|
||||||
|
# 3. \begin{equation}...\end{equation} → $$...$$
|
||||||
|
text = re.sub(
|
||||||
|
r"\\begin\{equation\}([\s\S]*?)\\end\{equation\}",
|
||||||
|
lambda m: f"$${m.group(1)}$$",
|
||||||
|
text,
|
||||||
|
)
|
||||||
|
# 4. \begin{displaymath}...\end{displaymath} → $$...$$
|
||||||
|
text = re.sub(
|
||||||
|
r"\\begin\{displaymath\}([\s\S]*?)\\end\{displaymath\}",
|
||||||
|
lambda m: f"$${m.group(1)}$$",
|
||||||
|
text,
|
||||||
|
)
|
||||||
|
# 5. \begin{math}...\end{math} → $...$
|
||||||
|
text = re.sub(
|
||||||
|
r"\\begin\{math\}([\s\S]*?)\\end\{math\}",
|
||||||
|
lambda m: f"${m.group(1)}$",
|
||||||
|
text,
|
||||||
|
)
|
||||||
|
# 6. Strip backtick wrapping around math: `$$...$$` → $$...$$ and `$...$` → $...$
|
||||||
|
text = re.sub(r"`(\${1,2})((?:(?!\1).)+)\1`", r"\1\2\1", text)
|
||||||
|
|
||||||
|
# 7. Trim whitespace inside inline math $...$.
|
||||||
|
# Pandoc's tex_math_dollars requires NO space after the opening $ and
|
||||||
|
# NO space before the closing $. LLMs frequently produce "$ e^x $"
|
||||||
|
# or "\( e^x \)" (which step 2 converts to "$ e^x $"). Without
|
||||||
|
# trimming, pandoc treats these as literal dollar-sign text.
|
||||||
|
# We require spaces on BOTH sides to avoid false-positives on
|
||||||
|
# currency like "$50" or "$50 and $100".
|
||||||
|
def _trim_inline_math(m: re.Match) -> str:
|
||||||
|
inner = m.group(1).strip()
|
||||||
|
return f"${inner}$" if inner else m.group(0)
|
||||||
|
|
||||||
|
text = re.sub(r"(?<!\$)\$(?!\$) +(.+?) +\$(?!\$)", _trim_inline_math, text)
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
async def _get_report_with_access(
|
async def _get_report_with_access(
|
||||||
report_id: int,
|
report_id: int,
|
||||||
session: AsyncSession,
|
session: AsyncSession,
|
||||||
|
|
@ -227,6 +283,10 @@ async def export_report(
|
||||||
# Without this, pandoc treats the entire content as a code block.
|
# Without this, pandoc treats the entire content as a code block.
|
||||||
markdown_content = _strip_wrapping_code_fences(report.content)
|
markdown_content = _strip_wrapping_code_fences(report.content)
|
||||||
|
|
||||||
|
# Normalise all LaTeX math delimiters (\(\), \[\], \begin{equation},
|
||||||
|
# etc.) into $/$$ form that pandoc's tex_math_dollars extension can parse.
|
||||||
|
markdown_content = _normalize_latex_delimiters(markdown_content)
|
||||||
|
|
||||||
# Convert Markdown to the requested format.
|
# Convert Markdown to the requested format.
|
||||||
#
|
#
|
||||||
# DOCX: pypandoc (pandoc) handles the full conversion directly.
|
# DOCX: pypandoc (pandoc) handles the full conversion directly.
|
||||||
|
|
@ -237,8 +297,9 @@ async def export_report(
|
||||||
# bundles the compiler as a native extension. Typst produces
|
# bundles the compiler as a native extension. Typst produces
|
||||||
# professional styling for tables, headings, code blocks, etc.
|
# professional styling for tables, headings, code blocks, etc.
|
||||||
#
|
#
|
||||||
# Use "gfm" as the input format because LLM output uses GFM-style
|
# Use "gfm" as the base input format because LLM output uses GFM-style
|
||||||
# pipe tables that pandoc's stricter default "markdown" may mangle.
|
# pipe tables that pandoc's stricter default "markdown" may mangle.
|
||||||
|
# The +tex_math_dollars extension enables $/$$ math recognition.
|
||||||
|
|
||||||
def _convert_and_read() -> bytes:
|
def _convert_and_read() -> bytes:
|
||||||
"""Run all blocking I/O (tempfile, pandoc/typst, file read, cleanup) in a thread."""
|
"""Run all blocking I/O (tempfile, pandoc/typst, file read, cleanup) in a thread."""
|
||||||
|
|
@ -253,7 +314,7 @@ async def export_report(
|
||||||
typst_markup: str = pypandoc.convert_text(
|
typst_markup: str = pypandoc.convert_text(
|
||||||
markdown_content,
|
markdown_content,
|
||||||
"typst",
|
"typst",
|
||||||
format="gfm",
|
format="gfm+tex_math_dollars",
|
||||||
extra_args=[
|
extra_args=[
|
||||||
"--standalone",
|
"--standalone",
|
||||||
"-V",
|
"-V",
|
||||||
|
|
@ -273,7 +334,7 @@ async def export_report(
|
||||||
pypandoc.convert_text(
|
pypandoc.convert_text(
|
||||||
markdown_content,
|
markdown_content,
|
||||||
format.value,
|
format.value,
|
||||||
format="gfm",
|
format="gfm+tex_math_dollars",
|
||||||
extra_args=["--standalone"],
|
extra_args=["--standalone"],
|
||||||
outputfile=tmp_path,
|
outputfile=tmp_path,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
6461
surfsense_backend/uv.lock
generated
6461
surfsense_backend/uv.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -11,10 +11,45 @@ import {
|
||||||
import { CheckIcon, CopyIcon } from "lucide-react";
|
import { CheckIcon, CopyIcon } from "lucide-react";
|
||||||
import { type FC, memo, type ReactNode, useState } from "react";
|
import { type FC, memo, type ReactNode, useState } from "react";
|
||||||
import remarkGfm from "remark-gfm";
|
import remarkGfm from "remark-gfm";
|
||||||
|
import remarkMath from "remark-math";
|
||||||
|
import rehypeKatex from "rehype-katex";
|
||||||
|
import "katex/dist/katex.min.css";
|
||||||
import { InlineCitation } from "@/components/assistant-ui/inline-citation";
|
import { InlineCitation } from "@/components/assistant-ui/inline-citation";
|
||||||
import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button";
|
import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button";
|
||||||
import { cn } from "@/lib/utils";
|
import { cn } from "@/lib/utils";
|
||||||
|
|
||||||
|
/**
 * Convert all LaTeX delimiter styles to the dollar-sign syntax
 * that remark-math understands. LLMs use various delimiters
 * (\(...\), \[...\], \begin{equation}, etc.) and we need to
 * normalise them all to $ / $$ before the markdown parser runs.
 *
 * Also inserts a blank line before a markdown heading marker that is
 * glued to preceding text on the same line (e.g. "…done.## Next").
 *
 * @param content Raw markdown text produced by the model.
 * @returns The text with math delimiters normalised and headings split
 *          onto their own line.
 */
function convertLatexDelimiters(content: string): string {
	// 1. Block math: \[...\] → $$...$$
	content = content.replace(/\\\[([\s\S]*?)\\\]/g, (_, inner) => `$$${inner}$$`);
	// 2. Inline math: \(...\) → $...$
	content = content.replace(/\\\(([\s\S]*?)\\\)/g, (_, inner) => `$${inner}$`);
	// 3. Block: \begin{equation}...\end{equation} → $$...$$
	content = content.replace(
		/\\begin\{equation\}([\s\S]*?)\\end\{equation\}/g,
		(_, inner) => `$$${inner}$$`
	);
	// 4. Block: \begin{displaymath}...\end{displaymath} → $$...$$
	content = content.replace(
		/\\begin\{displaymath\}([\s\S]*?)\\end\{displaymath\}/g,
		(_, inner) => `$$${inner}$$`
	);
	// 5. Inline: \begin{math}...\end{math} → $...$
	content = content.replace(/\\begin\{math\}([\s\S]*?)\\end\{math\}/g, (_, inner) => `$${inner}$`);
	// 6. Strip backtick wrapping around math: `$$...$$` → $$...$$ and `$...$` → $...$
	content = content.replace(/`(\${1,2})((?:(?!\1).)+)\1`/g, "$1$2$1");

	// Ensure markdown headings (## ...) always start on their own line.
	// The preceding character must not itself be a "#": otherwise the first
	// "#" of a well-formed "## Title" is taken as the preceding character and
	// the heading is split into "#\n\n# Title".
	content = content.replace(/([^\n#])(#{1,6}\s)/g, "$1\n\n$2");

	return content;
}
|
||||||
|
|
||||||
// Citation pattern: [citation:CHUNK_ID] or [citation:doc-CHUNK_ID]
|
// Citation pattern: [citation:CHUNK_ID] or [citation:doc-CHUNK_ID]
|
||||||
// Also matches Chinese brackets 【】 and handles zero-width spaces that LLM sometimes inserts
|
// Also matches Chinese brackets 【】 and handles zero-width spaces that LLM sometimes inserts
|
||||||
const CITATION_REGEX = /[[【]\u200B?citation:(doc-)?(\d+)\u200B?[\]】]/g;
|
const CITATION_REGEX = /[[【]\u200B?citation:(doc-)?(\d+)\u200B?[\]】]/g;
|
||||||
|
|
@ -59,7 +94,8 @@ function parseTextWithCitations(text: string): ReactNode[] {
|
||||||
// Reset regex state
|
// Reset regex state
|
||||||
CITATION_REGEX.lastIndex = 0;
|
CITATION_REGEX.lastIndex = 0;
|
||||||
|
|
||||||
while ((match = CITATION_REGEX.exec(text)) !== null) {
|
match = CITATION_REGEX.exec(text);
|
||||||
|
while (match !== null) {
|
||||||
// Add text before the citation
|
// Add text before the citation
|
||||||
if (match.index > lastIndex) {
|
if (match.index > lastIndex) {
|
||||||
parts.push(text.substring(lastIndex, match.index));
|
parts.push(text.substring(lastIndex, match.index));
|
||||||
|
|
@ -80,6 +116,7 @@ function parseTextWithCitations(text: string): ReactNode[] {
|
||||||
|
|
||||||
lastIndex = match.index + match[0].length;
|
lastIndex = match.index + match[0].length;
|
||||||
instanceIndex++;
|
instanceIndex++;
|
||||||
|
match = CITATION_REGEX.exec(text);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add any remaining text after the last citation
|
// Add any remaining text after the last citation
|
||||||
|
|
@ -93,9 +130,11 @@ function parseTextWithCitations(text: string): ReactNode[] {
|
||||||
const MarkdownTextImpl = () => {
|
const MarkdownTextImpl = () => {
|
||||||
return (
|
return (
|
||||||
<MarkdownTextPrimitive
|
<MarkdownTextPrimitive
|
||||||
remarkPlugins={[remarkGfm]}
|
remarkPlugins={[remarkGfm, remarkMath]}
|
||||||
|
rehypePlugins={[rehypeKatex]}
|
||||||
className="aui-md"
|
className="aui-md"
|
||||||
components={defaultComponents}
|
components={defaultComponents}
|
||||||
|
preprocess={convertLatexDelimiters}
|
||||||
/>
|
/>
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -29,33 +29,53 @@ function stripOuterMarkdownFence(content: string): string {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert various LaTeX delimiter styles to the dollar-sign syntax
|
* Convert all LaTeX delimiter styles to the double-dollar syntax
|
||||||
* that remark-math understands, and normalise edge-cases that
|
* that Streamdown's @streamdown/math plugin understands.
|
||||||
* commonly appear in LLM-generated markdown.
|
|
||||||
*
|
*
|
||||||
* \[...\] → $$ ... $$ (block / display math)
|
* Streamdown math conventions (different from remark-math!):
|
||||||
* \(...\) → $ ... $ (inline math)
|
* $$...$$ on the SAME line → inline math
|
||||||
* same-line $$…$$ → $ ... $ (inline math — display math
|
* $$\n...\n$$ on SEPARATE lines → block (display) math
|
||||||
* can't live inside table cells)
|
*
|
||||||
|
* Conversions performed:
|
||||||
|
* \[...\] → $$\n ... \n$$ (block math)
|
||||||
|
* \(...\) → $$...$$ (inline math, same line)
|
||||||
|
* \begin{equation}...\end{equation} → $$\n ... \n$$ (block math)
|
||||||
|
* \begin{displaymath}...\end{displaymath} → $$\n ... \n$$ (block math)
|
||||||
|
* \begin{math}...\end{math} → $$...$$ (inline math, same line)
|
||||||
* `$$ … $$` → $$ … $$ (strip wrapping backtick code)
|
* `$$ … $$` → $$ … $$ (strip wrapping backtick code)
|
||||||
* `$ … $` → $ … $ (strip wrapping backtick code)
|
* `$ … $` → $ … $ (strip wrapping backtick code)
|
||||||
|
* $...$ → $$...$$ (normalise single-$ to double-$$)
|
||||||
*/
|
*/
|
||||||
function convertLatexDelimiters(content: string): string {
|
function convertLatexDelimiters(content: string): string {
|
||||||
// 1. Block math: \[...\] → $$...$$
|
// 1. Block math: \[...\] → $$\n...\n$$ (display math on separate lines)
|
||||||
content = content.replace(/\\\[([\s\S]*?)\\\]/g, (_match, inner) => {
|
content = content.replace(/\\\[([\s\S]*?)\\\]/g, (_, inner) => `\n$$\n${inner.trim()}\n$$\n`);
|
||||||
return `$$${inner}$$`;
|
// 2. Inline math: \(...\) → $$...$$ (inline math on same line)
|
||||||
});
|
content = content.replace(/\\\(([\s\S]*?)\\\)/g, (_, inner) => `$$${inner.trim()}$$`);
|
||||||
// 2. Inline math: \(...\) → $...$
|
// 3. Block: \begin{equation}...\end{equation} → $$\n...\n$$
|
||||||
content = content.replace(/\\\(([\s\S]*?)\\\)/g, (_match, inner) => {
|
content = content.replace(
|
||||||
return `$${inner}$`;
|
/\\begin\{equation\}([\s\S]*?)\\end\{equation\}/g,
|
||||||
});
|
(_, inner) => `\n$$\n${inner.trim()}\n$$\n`
|
||||||
// 3. Strip backtick wrapping around math: `$$...$$` → $$...$$ and `$...$` → $...$
|
);
|
||||||
|
// 4. Block: \begin{displaymath}...\end{displaymath} → $$\n...\n$$
|
||||||
|
content = content.replace(
|
||||||
|
/\\begin\{displaymath\}([\s\S]*?)\\end\{displaymath\}/g,
|
||||||
|
(_, inner) => `\n$$\n${inner.trim()}\n$$\n`
|
||||||
|
);
|
||||||
|
// 5. Inline: \begin{math}...\end{math} → $$...$$
|
||||||
|
content = content.replace(
|
||||||
|
/\\begin\{math\}([\s\S]*?)\\end\{math\}/g,
|
||||||
|
(_, inner) => `$$${inner.trim()}$$`
|
||||||
|
);
|
||||||
|
// 6. Strip backtick wrapping around math: `$$...$$` → $$...$$ and `$...$` → $...$
|
||||||
content = content.replace(/`(\${1,2})((?:(?!\1).)+)\1`/g, "$1$2$1");
|
content = content.replace(/`(\${1,2})((?:(?!\1).)+)\1`/g, "$1$2$1");
|
||||||
// 4. Same-line $$...$$ → $...$ (inline math) so it works inside table cells.
|
// 7. Normalise single-dollar $...$ to double-dollar $$...$$ so they render
|
||||||
// True display math has $$ on its own line, so this only affects inline usage.
|
// reliably in Streamdown (single-$ has strict no-space rules that often fail).
|
||||||
content = content.replace(/\$\$([^\n]+?)\$\$/g, (_match, inner) => {
|
// We match $…$ where the content starts with a backslash (LaTeX command)
|
||||||
return `$${inner}$`;
|
// to avoid converting currency like $50.
|
||||||
});
|
content = content.replace(
|
||||||
|
/(?<!\$)\$(?!\$)(\\[a-zA-Z][\s\S]*?)(?<!\$)\$(?!\$)/g,
|
||||||
|
(_, inner) => `$$${inner.trim()}$$`
|
||||||
|
);
|
||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,6 @@ import {
|
||||||
DropdownMenuItem,
|
DropdownMenuItem,
|
||||||
DropdownMenuTrigger,
|
DropdownMenuTrigger,
|
||||||
} from "@/components/ui/dropdown-menu";
|
} from "@/components/ui/dropdown-menu";
|
||||||
import { Spinner } from "@/components/ui/spinner";
|
|
||||||
import { useMediaQuery } from "@/hooks/use-media-query";
|
import { useMediaQuery } from "@/hooks/use-media-query";
|
||||||
import { baseApiService } from "@/lib/apis/base-api.service";
|
import { baseApiService } from "@/lib/apis/base-api.service";
|
||||||
import { authenticatedFetch } from "@/lib/auth-utils";
|
import { authenticatedFetch } from "@/lib/auth-utils";
|
||||||
|
|
@ -298,14 +297,12 @@ function ReportPanelContent({
|
||||||
onClick={() => handleExport("pdf")}
|
onClick={() => handleExport("pdf")}
|
||||||
disabled={exporting !== null}
|
disabled={exporting !== null}
|
||||||
>
|
>
|
||||||
{exporting === "pdf" && <Spinner size="xs" />}
|
|
||||||
Download PDF
|
Download PDF
|
||||||
</DropdownMenuItem>
|
</DropdownMenuItem>
|
||||||
<DropdownMenuItem
|
<DropdownMenuItem
|
||||||
onClick={() => handleExport("docx")}
|
onClick={() => handleExport("docx")}
|
||||||
disabled={exporting !== null}
|
disabled={exporting !== null}
|
||||||
>
|
>
|
||||||
{exporting === "docx" && <Spinner size="xs" />}
|
|
||||||
Download DOCX
|
Download DOCX
|
||||||
</DropdownMenuItem>
|
</DropdownMenuItem>
|
||||||
</>
|
</>
|
||||||
|
|
|
||||||
|
|
@ -99,9 +99,11 @@
|
||||||
"react-json-view-lite": "^2.4.1",
|
"react-json-view-lite": "^2.4.1",
|
||||||
"react-syntax-highlighter": "^15.6.1",
|
"react-syntax-highlighter": "^15.6.1",
|
||||||
"react-wrap-balancer": "^1.1.1",
|
"react-wrap-balancer": "^1.1.1",
|
||||||
|
"rehype-katex": "^7.0.1",
|
||||||
"rehype-raw": "^7.0.0",
|
"rehype-raw": "^7.0.0",
|
||||||
"rehype-sanitize": "^6.0.0",
|
"rehype-sanitize": "^6.0.0",
|
||||||
"remark-gfm": "^4.0.1",
|
"remark-gfm": "^4.0.1",
|
||||||
|
"remark-math": "^6.0.0",
|
||||||
"server-only": "^0.0.1",
|
"server-only": "^0.0.1",
|
||||||
"sonner": "^2.0.6",
|
"sonner": "^2.0.6",
|
||||||
"streamdown": "^2.2.0",
|
"streamdown": "^2.2.0",
|
||||||
|
|
|
||||||
6
surfsense_web/pnpm-lock.yaml
generated
6
surfsense_web/pnpm-lock.yaml
generated
|
|
@ -242,6 +242,9 @@ importers:
|
||||||
react-wrap-balancer:
|
react-wrap-balancer:
|
||||||
specifier: ^1.1.1
|
specifier: ^1.1.1
|
||||||
version: 1.1.1(react@19.2.3)
|
version: 1.1.1(react@19.2.3)
|
||||||
|
rehype-katex:
|
||||||
|
specifier: ^7.0.1
|
||||||
|
version: 7.0.1
|
||||||
rehype-raw:
|
rehype-raw:
|
||||||
specifier: ^7.0.0
|
specifier: ^7.0.0
|
||||||
version: 7.0.0
|
version: 7.0.0
|
||||||
|
|
@ -251,6 +254,9 @@ importers:
|
||||||
remark-gfm:
|
remark-gfm:
|
||||||
specifier: ^4.0.1
|
specifier: ^4.0.1
|
||||||
version: 4.0.1
|
version: 4.0.1
|
||||||
|
remark-math:
|
||||||
|
specifier: ^6.0.0
|
||||||
|
version: 6.0.0
|
||||||
server-only:
|
server-only:
|
||||||
specifier: ^0.0.1
|
specifier: ^0.0.1
|
||||||
version: 0.0.1
|
version: 0.0.1
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue