feat(markdown): enable citation rendering in MarkdownViewer and related components

- Added `enableCitations` prop to `MarkdownViewer` to support interactive citation badges.
- Updated instances of `MarkdownViewer` across various components to utilize the new citation feature.
- Enhanced citation processing in `PlateEditor` for read-only views, ensuring citations are rendered correctly without affecting markdown serialization.
- Refactored citation handling in `InlineCitation` and `MarkdownText` to improve citation context management.
2026-06-30 21:59:46 +02:00 · 2026-04-30 18:40:55 -07:00 · 2026-04-30 18:40:55 -07:00 · 7aeb8bb0a8
commit 7aeb8bb0a8
parent d335e96ec2
14 changed files with 809 additions and 260 deletions
--- a/surfsense_web/lib/citations/citation-parser.ts
+++ b/surfsense_web/lib/citations/citation-parser.ts
@ -0,0 +1,134 @@
+// Pure citation parsing for `[citation:...]` tokens emitted by SurfSense
+// agents. No React imports — consumed by both the React renderer
+// (markdown surfaces) and the Plate value transform (document viewer).
+//
+// The same logic previously lived inline in
+// `components/assistant-ui/markdown-text.tsx` with module-level mutable
+// state. This module exposes a per-call URL map so multiple concurrent
+// renderers / SSR contexts can't race each other.
+
+import { FENCED_OR_INLINE_CODE } from "@/lib/markdown/code-regions";
+
+/**
+ * Matches `[citation:...]` with numeric IDs (incl. negative, doc- prefix,
+ * comma-separated), URL-based IDs from live web search, or `urlciteN`
+ * placeholders produced by `preprocessCitationMarkdown`.
+ *
+ * Also matches Chinese brackets 【】 and zero-width spaces that LLMs
+ * sometimes emit.
+ *
+ * Capture group 1 holds the payload between `citation:` and the closing
+ * bracket (the URL, the `urlciteN` key, or the raw ID list).
+ *
+ * The pattern carries the `g` flag, so `lastIndex` persists between
+ * `exec()`/`test()` calls on this shared instance; reset it to 0 before
+ * starting a fresh scan.
+ */
+export const CITATION_REGEX =
+	/[[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+|urlcite\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*)\s*\u200B?[\]】]/g;
+
+/**
+ * A single parsed citation reference.
+ *
+ * - `kind: "url"` — a URL-form citation; `url` is the cited address.
+ * - `kind: "chunk"` — a numeric citation; `chunkId` is the parsed integer
+ *   and `isDocsChunk` is true when the raw ID carried a `doc-` prefix.
+ */
+export type CitationToken =
+	| { kind: "url"; url: string }
+	| { kind: "chunk"; chunkId: number; isDocsChunk: boolean };
+
+/** Output of `parseTextWithCitations` — interleaved text + citation tokens. */
+export type ParsedSegment = string | CitationToken;
+
+/** Per-call URL placeholder map; key is `urlciteN`, value is the original URL. */
+export type CitationUrlMap = Map<string, string>;
+
+/** Result of preprocessing raw markdown for downstream parsing. */
+export interface PreprocessedCitations {
+	/** Markdown with `[citation:URL]` tokens rewritten to `[citation:urlciteN]`. */
+	content: string;
+	/** Lookup table to recover the original URL from each placeholder. */
+	urlMap: CitationUrlMap;
+}
+
+/**
+ * Pattern matching only URL-form citations (used during preprocessing).
+ *
+ * Accepts the same bracket variants (`[` / `【`, `]` / `】`) and optional
+ * zero-width spaces as `CITATION_REGEX`; capture group 1 is the URL.
+ */
+const URL_CITATION_REGEX =
+	/[[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+)\s*\u200B?[\]】]/g;
+
+/**
+ * Replace `[citation:URL]` tokens with `[citation:urlciteN]` placeholders so
+ * GFM autolinks don't split the URL out of the brackets during markdown
+ * parsing. Returns both the rewritten content and a map for later lookup.
+ *
+ * Code-fence aware: skips fenced (```…```) and inline (`…`) code
+ * regions so citation-shaped strings inside example code remain literal.
+ *
+ * Known limitations: `~~~` fences, 4-space indented code, and LaTeX math
+ * blocks are not skipped. Citation tokens inside those regions are rare in
+ * practice; documented in the plan.
+ *
+ * Placeholders are numbered from 0 in order of appearance (each occurrence
+ * gets its own key), and the rewritten token always uses ASCII brackets
+ * with no zero-width spaces, whatever bracket style the original used.
+ *
+ * @param content Raw markdown that may contain URL-form citation tokens.
+ * @returns The rewritten markdown plus a fresh per-call `urlMap` from each
+ *   `urlciteN` key to its trimmed original URL.
+ */
+export function preprocessCitationMarkdown(content: string): PreprocessedCitations {
+	const urlMap: CitationUrlMap = new Map();
+	let counter = 0; // next placeholder index, shared across all non-code parts
+
+	// Splitting on a regex with one capture group puts code regions at odd
+	// indexes (matched delimiters) and the surrounding text at even indexes.
+	// Only transform the even-indexed parts.
+	const parts = content.split(FENCED_OR_INLINE_CODE);
+	const transformed = parts.map((part, index) => {
+		if (index % 2 === 1) return part; // code region — keep verbatim
+		return part.replace(URL_CITATION_REGEX, (_match, url: string) => {
+			const key = `urlcite${counter++}`;
+			urlMap.set(key, url.trim()); // the capture can include whitespace before the closing bracket
+			return `[citation:${key}]`;
+		});
+	});
+
+	return { content: transformed.join(""), urlMap };
+}
+
+/**
+ * Parse a string into an array of plain text segments and citation tokens.
+ *
+ * Pure data — no React. The renderer module is responsible for mapping
+ * tokens to JSX. Negative chunk IDs are forwarded as-is so the consumer
+ * can decide how to render anonymous documents.
+ *
+ * Comma-separated ID lists expand to one `chunk` token per ID, in order.
+ *
+ * NOTE(review): a `urlciteN` placeholder with no (or an empty) entry in
+ * `urlMap` yields no segment, so the token text is dropped from the
+ * output — except when that leaves the result empty, in which case the
+ * raw `text` is returned unchanged. Confirm this asymmetry is intended.
+ *
+ * @param text Text to scan for citation tokens.
+ * @param urlMap Lookup from `urlciteN` placeholder to the original URL.
+ * @returns Segments in source order; never empty — `[text]` is returned
+ *   when nothing was collected.
+ */
+export function parseTextWithCitations(
+	text: string,
+	urlMap: CitationUrlMap
+): ParsedSegment[] {
+	const segments: ParsedSegment[] = [];
+	let lastIndex = 0; // end offset of the previous match within `text`
+	let match: RegExpExecArray | null;
+
+	CITATION_REGEX.lastIndex = 0; // shared /g regex is stateful; rewind before scanning
+	match = CITATION_REGEX.exec(text);
+	while (match !== null) {
+		if (match.index > lastIndex) {
+			segments.push(text.substring(lastIndex, match.index));
+		}
+
+		const captured = match[1];
+
+		if (captured.startsWith("http://") || captured.startsWith("https://")) {
+			segments.push({ kind: "url", url: captured.trim() });
+		} else if (captured.startsWith("urlcite")) {
+			const url = urlMap.get(captured);
+			if (url) {
+				segments.push({ kind: "url", url });
+			}
+		} else {
+			const rawIds = captured.split(",").map((s) => s.trim());
+			for (const rawId of rawIds) {
+				const isDocsChunk = rawId.startsWith("doc-");
+				const chunkId = Number.parseInt(isDocsChunk ? rawId.slice(4) : rawId, 10);
+				if (!Number.isNaN(chunkId)) {
+					segments.push({ kind: "chunk", chunkId, isDocsChunk });
+				}
+			}
+		}
+
+		lastIndex = match.index + match[0].length;
+		match = CITATION_REGEX.exec(text);
+	}
+
+	if (lastIndex < text.length) {
+		segments.push(text.substring(lastIndex));
+	}
+
+	return segments.length > 0 ? segments : [text];
+}
+
+/**
+ * Type guard for the citation branch of `ParsedSegment`.
+ *
+ * @param segment A segment as produced by `parseTextWithCitations`.
+ * @returns `true` for any non-string segment, which per the union is a
+ *   `CitationToken`.
+ */
+export function isCitationToken(segment: ParsedSegment): segment is CitationToken {
+	return typeof segment !== "string";
+}
--- a/surfsense_web/lib/markdown/code-regions.ts
+++ b/surfsense_web/lib/markdown/code-regions.ts
@ -0,0 +1,8 @@
+// Matches fenced (```...```) and inline (`...`) code regions. Used by MDX
+// escaping and citation preprocessing — single source of truth so future
+// edits stay in sync. A fence needs a closing ``` to match; inline code must
+// be single-backtick delimited, non-empty, and contain no newline.
+// String.split() with this capturing pattern places non-code parts at even
+// indexes and matched code regions at odd indexes — preserve odd-indexed
+// segments verbatim when transforming markdown.
+export const FENCED_OR_INLINE_CODE = /(```[\s\S]*?```|`[^`\n]+`)/g;