feat: added ai file sorting

2026-06-22 21:28:12 +02:00 · 2026-04-14 01:43:30 -07:00 · 2026-04-14 01:43:30 -07:00 · 4bee367d4a
commit 4bee367d4a
parent fa0b47dfca
51 changed files with 1703 additions and 72 deletions
--- a/surfsense_web/components/editor/utils/escape-mdx.ts
+++ b/surfsense_web/components/editor/utils/escape-mdx.ts
@ -1,25 +1,60 @@
 // ---------------------------------------------------------------------------
-// MDX curly-brace escaping helper
+// MDX pre-processing helpers
 // ---------------------------------------------------------------------------
-// remarkMdx treats { } as JSX expression delimiters. Arbitrary markdown
-// (e.g. AI-generated reports) can contain curly braces that are NOT valid JS
-// expressions, which makes acorn throw "Could not parse expression".
-// We escape unescaped { and } *outside* of fenced code blocks and inline code
-// so remarkMdx treats them as literal characters while still parsing
-// <mark>, <u>, <kbd>, etc. tags correctly.
+// remarkMdx treats { } as JSX expression delimiters and does NOT support
+// HTML comments (<!-- -->). Arbitrary markdown from document conversions
+// (e.g. PDF-to-markdown via Azure/DocIntel) can contain constructs that
+// break the MDX parser. This module sanitises them before deserialization.
 // ---------------------------------------------------------------------------

 const FENCED_OR_INLINE_CODE = /(```[\s\S]*?```|`[^`\n]+`)/g;

-export function escapeMdxExpressions(md: string): string {
+// Strip HTML comments that MDX cannot parse.
+// PDF converters emit <!-- PageHeader="..." -->, <!-- PageBreak -->, etc.
+// MDX uses JSX-style comments and chokes on HTML comments, causing the
+// parser to stop at the first occurrence.
+// - <!-- PageBreak --> becomes a thematic break (---)
+// - All other HTML comments are removed
+function stripHtmlComments(md: string): string {
+	return md
+		.replace(/<!--\s*PageBreak\s*-->/gi, "\n---\n")
+		.replace(/<!--[\s\S]*?-->/g, "");
+}
+
+// Convert <figure>...</figure> blocks to plain text blockquotes.
+// <figure> with arbitrary text content is not valid JSX, causing the MDX
+// parser to fail.
+function convertFigureBlocks(md: string): string {
+	return md.replace(/<figure[^>]*>([\s\S]*?)<\/figure>/gi, (_match, inner: string) => {
+		const trimmed = (inner as string).trim();
+		if (!trimmed) return "";
+		const quoted = trimmed
+			.split("\n")
+			.map((line) => `> ${line}`)
+			.join("\n");
+		return `\n${quoted}\n`;
+	});
+}
+
+// Escape unescaped { and } outside of fenced/inline code so remarkMdx
+// treats them as literal characters rather than JSX expression delimiters.
+function escapeCurlyBraces(md: string): string {
 	const parts = md.split(FENCED_OR_INLINE_CODE);

 	return parts
 		.map((part, i) => {
-			// Odd indices are code blocks / inline code – leave untouched
 			if (i % 2 === 1) return part;
-			// Escape { and } that are NOT already escaped (no preceding \)
 			return part.replace(/(?<!\\)\{/g, "\\{").replace(/(?<!\\)\}/g, "\\}");
 		})
 		.join("");
 }
+
+// Pre-process raw markdown so it can be safely parsed by the MDX-enabled
+// Plate editor. Applies all sanitisation steps in order.
+export function escapeMdxExpressions(md: string): string {
+	let result = md;
+	result = stripHtmlComments(result);
+	result = convertFigureBlocks(result);
+	result = escapeCurlyBraces(result);
+	return result;
+}