mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 16:56:22 +02:00
Some checks are pending
Build and Push Docker Images / tag_release (push) Waiting to run
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (backend, surfsense-backend) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (web, surfsense-web) (push) Blocked by required conditions
- Introduced a `ProcessingMode` enum to differentiate between basic and premium processing modes. - Updated `EtlRequest` to include a `processing_mode` field, defaulting to basic. - Enhanced ETL pipeline services to utilize the selected processing mode for Azure Document Intelligence and LlamaCloud parsing. - Modified various routes and services to handle processing mode, affecting document upload and indexing tasks. - Improved error handling and logging to include processing mode details. - Added tests to validate processing mode functionality and its impact on ETL operations.
58 lines
2.3 KiB
TypeScript
58 lines
2.3 KiB
TypeScript
// ---------------------------------------------------------------------------
|
|
// MDX pre-processing helpers
|
|
// ---------------------------------------------------------------------------
|
|
// remarkMdx treats { } as JSX expression delimiters and does NOT support
|
|
// HTML comments (<!-- -->). Arbitrary markdown from document conversions
|
|
// (e.g. PDF-to-markdown via Azure/DocIntel) can contain constructs that
|
|
// break the MDX parser. This module sanitises them before deserialization.
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Matches a code span: either a triple-backtick fenced block (non-greedy, may
// span lines) or a single-backtick inline span with no backtick or newline
// inside. The whole match is one capturing group, so String.prototype.split
// keeps the matched code in its result, alternating with the text around it.
const FENCED_OR_INLINE_CODE = /(```[\s\S]*?```|`[^`\n]+`)/g;
|
|
|
|
// Strip HTML comments that MDX cannot parse.
|
|
// PDF converters emit <!-- PageHeader="..." -->, <!-- PageBreak -->, etc.
|
|
// MDX uses JSX-style comments and chokes on HTML comments, causing the
|
|
// parser to stop at the first occurrence.
|
|
// - <!-- PageBreak --> becomes a thematic break (---)
|
|
// - All other HTML comments are removed
|
|
/**
 * Removes HTML comments, which the MDX parser cannot handle.
 *
 * `<!-- PageBreak -->` markers (matched case-insensitively) are turned into a
 * thematic break; every other complete comment, including multi-line ones,
 * is deleted.
 *
 * @param md - Raw markdown that may contain HTML comments.
 * @returns The markdown with every `<!-- ... -->` sequence replaced or removed.
 */
function stripHtmlComments(md: string): string {
  // The break is padded with blank lines on both sides: a bare `---` directly
  // beneath a line of text is a setext heading underline, which would promote
  // that text to a heading instead of producing a horizontal rule.
  const thematicBreak = "\n\n---\n\n";
  return md
    .replace(/<!--\s*PageBreak\s*-->/gi, thematicBreak)
    .replace(/<!--[\s\S]*?-->/g, "");
}
|
|
|
|
// Convert <figure>...</figure> blocks to plain text blockquotes.
|
|
// <figure> with arbitrary text content is not valid JSX, causing the MDX
|
|
// parser to fail.
|
|
/**
 * Rewrites each `<figure>...</figure>` block as a markdown blockquote.
 *
 * The inner content is trimmed; an empty figure is dropped entirely, otherwise
 * every line of the content is prefixed with `> ` and the quote is wrapped in
 * a leading and trailing newline.
 *
 * @param md - Markdown that may contain `<figure>` elements.
 * @returns The markdown with all figure blocks converted or removed.
 */
function convertFigureBlocks(md: string): string {
  const figurePattern = /<figure[^>]*>([\s\S]*?)<\/figure>/gi;

  const toBlockquote = (_whole: string, body: string): string => {
    const content = body.trim();
    if (content.length === 0) {
      return "";
    }

    const quotedLines: string[] = [];
    for (const line of content.split("\n")) {
      quotedLines.push(`> ${line}`);
    }
    return `\n${quotedLines.join("\n")}\n`;
  };

  return md.replace(figurePattern, toBlockquote);
}
|
|
|
|
// Escape unescaped { and } outside of fenced/inline code so remarkMdx
|
|
// treats them as literal characters rather than JSX expression delimiters.
|
|
/**
 * Backslash-escapes `{` and `}` that appear outside fenced or inline code so
 * they are read as literal characters rather than expression delimiters.
 *
 * Braces that are already escaped are left alone, so running the function
 * twice yields the same result as running it once.
 *
 * @param md - Markdown text to sanitise.
 * @returns The markdown with every unescaped brace outside code escaped.
 */
function escapeCurlyBraces(md: string): string {
  // Splitting on a pattern with a capturing group keeps the matched code spans
  // in the result, so segments alternate: even index = prose, odd index = code.
  return md
    .split(FENCED_OR_INLINE_CODE)
    .map((segment, index) => {
      if (index % 2 === 1) return segment;

      // Consume backslash escapes as two-character pairs so that an escaped
      // backslash (`\\`) is not mistaken for an escape of the brace after it.
      // Any brace matched on its own (length 1) is therefore truly unescaped.
      return segment.replace(/\\[\s\S]|[{}]/g, (token) =>
        token.length === 1 ? `\\${token}` : token,
      );
    })
    .join("");
}
|
|
|
|
// Pre-process raw markdown so it can be safely parsed by the MDX-enabled
|
|
// Plate editor. Applies all sanitisation steps in order.
|
|
/**
 * Sanitises raw markdown by running the three pre-processing steps in a fixed
 * order: HTML comments are stripped first, then `<figure>` blocks are
 * converted, and finally curly braces are escaped.
 *
 * @param md - Raw markdown to pre-process.
 * @returns The sanitised markdown.
 */
export function escapeMdxExpressions(md: string): string {
  const withoutComments = stripHtmlComments(md);
  const withoutFigures = convertFigureBlocks(withoutComments);
  return escapeCurlyBraces(withoutFigures);
}
|