mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-02 19:55:18 +02:00
feat: enhance system prompts to specify LaTeX notation for mathematical expressions
This commit is contained in:
parent
47670997df
commit
8746051670
3 changed files with 68 additions and 3 deletions
|
|
@ -21,6 +21,8 @@ You are SurfSense, a reasoning and acting AI agent designed to answer user quest
|
||||||
|
|
||||||
Today's date (UTC): {resolved_today}
|
Today's date (UTC): {resolved_today}
|
||||||
|
|
||||||
|
When writing mathematical formulas or equations, ALWAYS use LaTeX notation. NEVER use backtick code spans or Unicode symbols for math.
|
||||||
|
|
||||||
</system_instruction>
|
</system_instruction>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -33,6 +35,8 @@ In this team thread, each message is prefixed with **[DisplayName of the author]
|
||||||
|
|
||||||
Today's date (UTC): {resolved_today}
|
Today's date (UTC): {resolved_today}
|
||||||
|
|
||||||
|
When writing mathematical formulas or equations, ALWAYS use LaTeX notation. NEVER use backtick code spans or Unicode symbols for math.
|
||||||
|
|
||||||
</system_instruction>
|
</system_instruction>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -53,6 +53,7 @@ _REPORT_PROMPT = """You are an expert report writer. Generate a well-structured,
|
||||||
A[Source Code] --> B[Compiler]
|
A[Source Code] --> B[Compiler]
|
||||||
B --> C[Bytecode]
|
B --> C[Bytecode]
|
||||||
```
|
```
|
||||||
|
10. When including mathematical formulas or equations, ALWAYS use LaTeX notation. NEVER use backtick code spans or Unicode symbols for math.
|
||||||
|
|
||||||
Write the report now:
|
Write the report now:
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
|
|
@ -66,6 +66,61 @@ def _strip_wrapping_code_fences(text: str) -> str:
|
||||||
return stripped
|
return stripped
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_latex_delimiters(text: str) -> str:
|
||||||
|
"""Convert all LaTeX math delimiters to dollar-sign form.
|
||||||
|
|
||||||
|
Pandoc's ``tex_math_dollars`` extension (on the ``gfm`` reader) handles
|
||||||
|
``$…$`` and ``$$…$$`` natively. This function converts every other
|
||||||
|
delimiter style that LLMs produce into dollar-sign form so pandoc can
|
||||||
|
parse them as math.
|
||||||
|
|
||||||
|
Supported conversions:
|
||||||
|
\\[…\\] → $$…$$ (display math)
|
||||||
|
\\(…\\) → $…$ (inline math)
|
||||||
|
\\begin{equation}…\\end{equation} → $$…$$ (display math)
|
||||||
|
\\begin{displaymath}…\\end{displaymath}→ $$…$$ (display math)
|
||||||
|
\\begin{math}…\\end{math} → $…$ (inline math)
|
||||||
|
`$$…$$` / `$…$` → strip wrapping backticks
|
||||||
|
"""
|
||||||
|
# 1. Block math: \[...\] → $$...$$
|
||||||
|
text = re.sub(r"\\\[([\s\S]*?)\\\]", lambda m: f"$${m.group(1)}$$", text)
|
||||||
|
# 2. Inline math: \(...\) → $...$
|
||||||
|
text = re.sub(r"\\\(([\s\S]*?)\\\)", lambda m: f"${m.group(1)}$", text)
|
||||||
|
# 3. \begin{equation}...\end{equation} → $$...$$
|
||||||
|
text = re.sub(
|
||||||
|
r"\\begin\{equation\}([\s\S]*?)\\end\{equation\}",
|
||||||
|
lambda m: f"$${m.group(1)}$$",
|
||||||
|
text,
|
||||||
|
)
|
||||||
|
# 4. \begin{displaymath}...\end{displaymath} → $$...$$
|
||||||
|
text = re.sub(
|
||||||
|
r"\\begin\{displaymath\}([\s\S]*?)\\end\{displaymath\}",
|
||||||
|
lambda m: f"$${m.group(1)}$$",
|
||||||
|
text,
|
||||||
|
)
|
||||||
|
# 5. \begin{math}...\end{math} → $...$
|
||||||
|
text = re.sub(
|
||||||
|
r"\\begin\{math\}([\s\S]*?)\\end\{math\}",
|
||||||
|
lambda m: f"${m.group(1)}$",
|
||||||
|
text,
|
||||||
|
)
|
||||||
|
# 6. Strip backtick wrapping around math: `$$...$$` → $$...$$ and `$...$` → $...$
|
||||||
|
text = re.sub(r"`(\${1,2})((?:(?!\1).)+)\1`", r"\1\2\1", text)
|
||||||
|
# 7. Trim whitespace inside inline math $...$.
|
||||||
|
# Pandoc's tex_math_dollars requires NO space after the opening $ and
|
||||||
|
# NO space before the closing $. LLMs frequently produce "$ e^x $"
|
||||||
|
# or "\( e^x \)" (which step 2 converts to "$ e^x $"). Without
|
||||||
|
# trimming, pandoc treats these as literal dollar-sign text.
|
||||||
|
# We require spaces on BOTH sides to avoid false-positives on
|
||||||
|
# currency like "$50" or "$50 and $100".
|
||||||
|
def _trim_inline_math(m: re.Match) -> str:
|
||||||
|
inner = m.group(1).strip()
|
||||||
|
return f"${inner}$" if inner else m.group(0)
|
||||||
|
|
||||||
|
text = re.sub(r"(?<!\$)\$(?!\$) +(.+?) +\$(?!\$)", _trim_inline_math, text)
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
async def _get_report_with_access(
|
async def _get_report_with_access(
|
||||||
report_id: int,
|
report_id: int,
|
||||||
session: AsyncSession,
|
session: AsyncSession,
|
||||||
|
|
@ -227,6 +282,10 @@ async def export_report(
|
||||||
# Without this, pandoc treats the entire content as a code block.
|
# Without this, pandoc treats the entire content as a code block.
|
||||||
markdown_content = _strip_wrapping_code_fences(report.content)
|
markdown_content = _strip_wrapping_code_fences(report.content)
|
||||||
|
|
||||||
|
# Normalise all LaTeX math delimiters (\(\), \[\], \begin{equation},
|
||||||
|
# etc.) into $/$$ form that pandoc's tex_math_dollars extension can parse.
|
||||||
|
markdown_content = _normalize_latex_delimiters(markdown_content)
|
||||||
|
|
||||||
# Convert Markdown to the requested format.
|
# Convert Markdown to the requested format.
|
||||||
#
|
#
|
||||||
# DOCX: pypandoc (pandoc) handles the full conversion directly.
|
# DOCX: pypandoc (pandoc) handles the full conversion directly.
|
||||||
|
|
@ -237,8 +296,9 @@ async def export_report(
|
||||||
# bundles the compiler as a native extension. Typst produces
|
# bundles the compiler as a native extension. Typst produces
|
||||||
# professional styling for tables, headings, code blocks, etc.
|
# professional styling for tables, headings, code blocks, etc.
|
||||||
#
|
#
|
||||||
# Use "gfm" as the input format because LLM output uses GFM-style
|
# Use "gfm" as the base input format because LLM output uses GFM-style
|
||||||
# pipe tables that pandoc's stricter default "markdown" may mangle.
|
# pipe tables that pandoc's stricter default "markdown" may mangle.
|
||||||
|
# The +tex_math_dollars extension enables $/$$ math recognition.
|
||||||
|
|
||||||
def _convert_and_read() -> bytes:
|
def _convert_and_read() -> bytes:
|
||||||
"""Run all blocking I/O (tempfile, pandoc/typst, file read, cleanup) in a thread."""
|
"""Run all blocking I/O (tempfile, pandoc/typst, file read, cleanup) in a thread."""
|
||||||
|
|
@ -253,7 +313,7 @@ async def export_report(
|
||||||
typst_markup: str = pypandoc.convert_text(
|
typst_markup: str = pypandoc.convert_text(
|
||||||
markdown_content,
|
markdown_content,
|
||||||
"typst",
|
"typst",
|
||||||
format="gfm",
|
format="gfm+tex_math_dollars",
|
||||||
extra_args=[
|
extra_args=[
|
||||||
"--standalone",
|
"--standalone",
|
||||||
"-V",
|
"-V",
|
||||||
|
|
@ -273,7 +333,7 @@ async def export_report(
|
||||||
pypandoc.convert_text(
|
pypandoc.convert_text(
|
||||||
markdown_content,
|
markdown_content,
|
||||||
format.value,
|
format.value,
|
||||||
format="gfm",
|
format="gfm+tex_math_dollars",
|
||||||
extra_args=["--standalone"],
|
extra_args=["--standalone"],
|
||||||
outputfile=tmp_path,
|
outputfile=tmp_path,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue