feat: enhance system prompts to specify LaTeX notation for mathematical expressions

This commit is contained in:
Anish Sarkar 2026-02-14 14:28:47 +05:30
parent 47670997df
commit 8746051670
3 changed files with 68 additions and 3 deletions

View file

@ -21,6 +21,8 @@ You are SurfSense, a reasoning and acting AI agent designed to answer user quest
Today's date (UTC): {resolved_today}
When writing mathematical formulas or equations, ALWAYS use LaTeX notation. NEVER use backtick code spans or Unicode symbols for math.
</system_instruction>
"""
@ -33,6 +35,8 @@ In this team thread, each message is prefixed with **[DisplayName of the author]
Today's date (UTC): {resolved_today}
When writing mathematical formulas or equations, ALWAYS use LaTeX notation. NEVER use backtick code spans or Unicode symbols for math.
</system_instruction>
"""

View file

@ -53,6 +53,7 @@ _REPORT_PROMPT = """You are an expert report writer. Generate a well-structured,
A[Source Code] --> B[Compiler]
B --> C[Bytecode]
```
10. When including mathematical formulas or equations, ALWAYS use LaTeX notation. NEVER use backtick code spans or Unicode symbols for math.
Write the report now:
"""

View file

@ -66,6 +66,61 @@ def _strip_wrapping_code_fences(text: str) -> str:
return stripped
def _normalize_latex_delimiters(text: str) -> str:
"""Convert all LaTeX math delimiters to dollar-sign form.
Pandoc's ``tex_math_dollars`` extension (on the ``gfm`` reader) handles
``$$`` and ``$$$$`` natively. This function converts every other
delimiter style that LLMs produce into dollar-sign form so pandoc can
parse them as math.
Supported conversions:
\\[\\] $$$$ (display math)
\\(\\) $$ (inline math)
\\begin{equation}\\end{equation} $$$$ (display math)
\\begin{displaymath}\\end{displaymath} $$$$ (display math)
\\begin{math}\\end{math} $$ (inline math)
`$$$$` / `$$` strip wrapping backticks
"""
# 1. Block math: \[...\] → $$...$$
text = re.sub(r"\\\[([\s\S]*?)\\\]", lambda m: f"$${m.group(1)}$$", text)
# 2. Inline math: \(...\) → $...$
text = re.sub(r"\\\(([\s\S]*?)\\\)", lambda m: f"${m.group(1)}$", text)
# 3. \begin{equation}...\end{equation} → $$...$$
text = re.sub(
r"\\begin\{equation\}([\s\S]*?)\\end\{equation\}",
lambda m: f"$${m.group(1)}$$",
text,
)
# 4. \begin{displaymath}...\end{displaymath} → $$...$$
text = re.sub(
r"\\begin\{displaymath\}([\s\S]*?)\\end\{displaymath\}",
lambda m: f"$${m.group(1)}$$",
text,
)
# 5. \begin{math}...\end{math} → $...$
text = re.sub(
r"\\begin\{math\}([\s\S]*?)\\end\{math\}",
lambda m: f"${m.group(1)}$",
text,
)
# 6. Strip backtick wrapping around math: `$$...$$` → $$...$$ and `$...$` → $...$
text = re.sub(r"`(\${1,2})((?:(?!\1).)+)\1`", r"\1\2\1", text)
# 7. Trim whitespace inside inline math $...$.
# Pandoc's tex_math_dollars requires NO space after the opening $ and
# NO space before the closing $. LLMs frequently produce "$ e^x $"
# or "\( e^x \)" (which step 2 converts to "$ e^x $"). Without
# trimming, pandoc treats these as literal dollar-sign text.
# We require spaces on BOTH sides to avoid false-positives on
# currency like "$50" or "$50 and $100".
def _trim_inline_math(m: re.Match) -> str:
inner = m.group(1).strip()
return f"${inner}$" if inner else m.group(0)
text = re.sub(r"(?<!\$)\$(?!\$) +(.+?) +\$(?!\$)", _trim_inline_math, text)
return text
async def _get_report_with_access(
report_id: int,
session: AsyncSession,
@ -227,6 +282,10 @@ async def export_report(
# Without this, pandoc treats the entire content as a code block.
markdown_content = _strip_wrapping_code_fences(report.content)
# Normalise all LaTeX math delimiters (\(\), \[\], \begin{equation},
# etc.) into $/$$ form that pandoc's tex_math_dollars extension can parse.
markdown_content = _normalize_latex_delimiters(markdown_content)
# Convert Markdown to the requested format.
#
# DOCX: pypandoc (pandoc) handles the full conversion directly.
@ -237,8 +296,9 @@ async def export_report(
# bundles the compiler as a native extension. Typst produces
# professional styling for tables, headings, code blocks, etc.
#
# Use "gfm" as the input format because LLM output uses GFM-style
# Use "gfm" as the base input format because LLM output uses GFM-style
# pipe tables that pandoc's stricter default "markdown" may mangle.
# The +tex_math_dollars extension enables $/$$ math recognition.
def _convert_and_read() -> bytes:
"""Run all blocking I/O (tempfile, pandoc/typst, file read, cleanup) in a thread."""
@ -253,7 +313,7 @@ async def export_report(
typst_markup: str = pypandoc.convert_text(
markdown_content,
"typst",
format="gfm",
format="gfm+tex_math_dollars",
extra_args=[
"--standalone",
"-V",
@ -273,7 +333,7 @@ async def export_report(
pypandoc.convert_text(
markdown_content,
format.value,
format="gfm",
format="gfm+tex_math_dollars",
extra_args=["--standalone"],
outputfile=tmp_path,
)