SurfSense/surfsense_web/lib/citations/citation-parser.ts
DESKTOP-RTLN3BA\$punk c644f02d05
Some checks are pending
Build and Push Docker Images / tag_release (push) Waiting to run
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (backend, surfsense-backend) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (web, surfsense-web) (push) Blocked by required conditions
chore: linting
2026-04-30 18:42:38 -07:00

130 lines
4.8 KiB
TypeScript

// Pure citation parsing for `[citation:...]` tokens emitted by SurfSense
// agents. No React imports — consumed by both the React renderer
// (markdown surfaces) and the Plate value transform (document viewer).
//
// The same logic previously lived inline in
// `components/assistant-ui/markdown-text.tsx` with module-level mutable
// state. This module exposes a per-call URL map so multiple concurrent
// renderers / SSR contexts can't race each other.
import { FENCED_OR_INLINE_CODE } from "@/lib/markdown/code-regions";
/**
 * Matches one citation token of the form `[citation:<payload>]`, where
 * `<payload>` (capture group 1) is one of:
 *  - an `http://` or `https://` URL, running up to the closing bracket or a
 *    zero-width space (trailing whitespace may be included in the capture),
 *  - a `urlciteN` placeholder produced by `preprocessCitationMarkdown`,
 *  - one or more comma-separated numeric IDs, each optionally prefixed with
 *    `doc-` and optionally negative.
 *
 * Full-width brackets 【】 are accepted in place of the ASCII brackets, and a
 * single zero-width space (U+200B) is tolerated just inside either bracket,
 * because LLMs sometimes emit both.
 *
 * NOTE: the regex carries the `g` flag, so it keeps `lastIndex` state between
 * calls. Reset `lastIndex` to 0 before scanning a new string with it.
 */
export const CITATION_REGEX =
/[[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+|urlcite\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*)\s*\u200B?[\]】]/g;
/**
 * A single parsed citation reference, discriminated by `kind`:
 *  - `"url"`   — a URL citation carrying the cited `url`.
 *  - `"chunk"` — a numeric chunk reference. `isDocsChunk` is true when the ID
 *    was written with the `doc-` prefix; the prefix itself is not part of
 *    `chunkId`. `chunkId` may be negative.
 */
export type CitationToken =
| { kind: "url"; url: string }
| { kind: "chunk"; chunkId: number; isDocsChunk: boolean };
/**
 * One element of the array returned by `parseTextWithCitations`: either a run
 * of plain text (`string`) or a `CitationToken`, in the order they occur in
 * the input.
 */
export type ParsedSegment = string | CitationToken;
/**
 * Per-call URL placeholder map: key is the `urlciteN` placeholder, value is
 * the original URL it stands for. Built by `preprocessCitationMarkdown` and
 * read by `parseTextWithCitations`.
 */
export type CitationUrlMap = Map<string, string>;
/** Result of `preprocessCitationMarkdown`: rewritten markdown plus the placeholder lookup. */
export interface PreprocessedCitations {
/** Markdown with URL-form `[citation:URL]` tokens rewritten to `[citation:urlciteN]`. */
content: string;
/** Maps each `urlciteN` placeholder back to its original (trimmed) URL. */
urlMap: CitationUrlMap;
}
/**
 * Pattern matching only URL-form citations (`[citation:https://...]`), used
 * during preprocessing. Capture group 1 is the URL, possibly with trailing
 * whitespace. Accepts the same 【】 brackets and zero-width spaces as the URL
 * branch of `CITATION_REGEX`.
 */
const URL_CITATION_REGEX = /[[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+)\s*\u200B?[\]】]/g;
/**
 * Swap every URL-form citation (`[citation:https://...]`) for a short
 * `[citation:urlciteN]` placeholder, so GFM autolinking cannot pull the URL
 * out of its brackets while the markdown is parsed.
 *
 * Fenced and inline code regions (as matched by `FENCED_OR_INLINE_CODE`) are
 * passed through verbatim, so citation-shaped text in code samples stays
 * literal.
 *
 * Known limitations: `~~~` fences, 4-space indented code, and LaTeX math
 * blocks are not skipped.
 *
 * @param content Raw markdown, possibly containing citation tokens.
 * @returns The rewritten markdown together with a fresh placeholder-to-URL map.
 */
export function preprocessCitationMarkdown(content: string): PreprocessedCitations {
	const urlMap: CitationUrlMap = new Map();
	let nextPlaceholder = 0;

	// Registers the URL under a new placeholder key and returns the token
	// that takes its place in the markdown.
	const toPlaceholder = (_token: string, url: string): string => {
		const key = `urlcite${nextPlaceholder}`;
		nextPlaceholder += 1;
		urlMap.set(key, url.trim());
		return `[citation:${key}]`;
	};

	// `split` with a single capture group interleaves its output: plain text
	// sits at even positions, the captured code regions at odd positions.
	const rebuilt: string[] = [];
	content.split(FENCED_OR_INLINE_CODE).forEach((piece, position) => {
		const isCodeRegion = position % 2 === 1;
		rebuilt.push(isCodeRegion ? piece : piece.replace(URL_CITATION_REGEX, toPlaceholder));
	});

	return { content: rebuilt.join(""), urlMap };
}
/**
 * Split `text` into an ordered list of plain-text runs and citation tokens.
 *
 * Pure data, no React: turning tokens into JSX is the renderer's job.
 *
 * - A literal `http(s)://` citation becomes a `url` token (URL trimmed).
 * - A `urlciteN` placeholder is resolved through `urlMap`; if the map has no
 *   (non-empty) entry for it, the citation produces no token.
 * - A numeric citation produces one `chunk` token per comma-separated ID.
 *   Negative IDs are forwarded unchanged so the consumer decides how to
 *   render them.
 *
 * @param text   Text that may contain `[citation:...]` tokens.
 * @param urlMap Placeholder lookup produced by `preprocessCitationMarkdown`.
 * @returns The interleaved segments, or `[text]` when nothing was produced.
 */
export function parseTextWithCitations(text: string, urlMap: CitationUrlMap): ParsedSegment[] {
	const out: ParsedSegment[] = [];
	let cursor = 0;

	// The shared regex has the `g` flag, so rewind it before scanning.
	CITATION_REGEX.lastIndex = 0;
	for (let hit = CITATION_REGEX.exec(text); hit !== null; hit = CITATION_REGEX.exec(text)) {
		const start = hit.index;
		// Emit the plain text sitting between the previous token and this one.
		if (start > cursor) {
			out.push(text.slice(cursor, start));
		}
		cursor = start + hit[0].length;

		const body = hit[1];
		const isLiteralUrl = body.startsWith("http://") || body.startsWith("https://");
		if (isLiteralUrl) {
			out.push({ kind: "url", url: body.trim() });
			continue;
		}

		if (body.startsWith("urlcite")) {
			const resolved = urlMap.get(body);
			if (resolved) {
				out.push({ kind: "url", url: resolved });
			}
			continue;
		}

		// Remaining form: one or more comma-separated chunk IDs.
		for (const piece of body.split(",")) {
			const id = piece.trim();
			const isDocsChunk = id.startsWith("doc-");
			const chunkId = Number.parseInt(isDocsChunk ? id.slice(4) : id, 10);
			if (Number.isNaN(chunkId)) continue;
			out.push({ kind: "chunk", chunkId, isDocsChunk });
		}
	}

	// Trailing plain text after the last token.
	if (cursor < text.length) {
		out.push(text.slice(cursor));
	}
	return out.length === 0 ? [text] : out;
}
/**
 * Type guard narrowing a `ParsedSegment` to `CitationToken`.
 * Any segment that is not a plain string counts as a citation token.
 */
export function isCitationToken(segment: ParsedSegment): segment is CitationToken {
	const isPlainText = typeof segment === "string";
	return !isPlainText;
}