SurfSense/surfsense_web/components/editor/plugins/citation-kit.tsx

199 lines
5.7 KiB
TypeScript

"use client";
import { type Descendant, KEYS } from "platejs";
import { createPlatePlugin, type PlateElementProps } from "platejs/react";
import type { FC } from "react";
import { InlineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
import {
CITATION_REGEX,
type CitationUrlMap,
parseTextWithCitations,
} from "@/lib/citations/citation-parser";
/**
* Plate inline-void node for one `[citation:...]` reference.
* Inline voids keep the citation chip atomic while preserving caret behavior
* around the surrounding text.
*/
export type CitationElementNode = {
type: "citation";
kind: "chunk" | "doc" | "url";
chunkId?: number;
url?: string;
/** Original literal token that produced this citation node. */
rawText: string;
children: [{ text: "" }];
};
const CITATION_TYPE = "citation";
const CitationElement: FC<PlateElementProps<CitationElementNode>> = ({
attributes,
children,
element,
}) => {
const isUrl = element.kind === "url";
return (
<span {...attributes} className="inline-flex align-baseline">
<span contentEditable={false}>
{isUrl && element.url ? (
<UrlCitation url={element.url} />
) : element.chunkId !== undefined ? (
<InlineCitation chunkId={element.chunkId} isDocsChunk={element.kind === "doc"} />
) : null}
</span>
{children}
</span>
);
};
const CitationPlugin = createPlatePlugin({
key: CITATION_TYPE,
node: {
isElement: true,
isInline: true,
isVoid: true,
type: CITATION_TYPE,
component: CitationElement,
},
});
export const CitationKit = [CitationPlugin];
// ---------------------------------------------------------------------------
// Slate value transform
// ---------------------------------------------------------------------------
// Local structural shapes keep the recursive walker readable without forcing
// Plate's broad Descendant union into narrower generic predicates.
type SlateText = { text: string } & Record<string, unknown>;
type SlateElement = { type?: string; children: Descendant[] } & Record<string, unknown>;
function isText(node: Descendant): boolean {
return typeof (node as { text?: unknown }).text === "string";
}
function asText(node: Descendant): SlateText {
return node as unknown as SlateText;
}
function asElement(node: Descendant): SlateElement {
return node as unknown as SlateElement;
}
/**
* Subtrees that should keep citation tokens as text:
* - Code nodes preserve source text and reject inline void children.
* - Link nodes already render as anchors; citation chips are interactive
* shadcn Button-based controls, so injecting them would nest interactions.
*/
const SKIP_SUBTREE_TYPES = new Set<string>([KEYS.codeBlock, "code_line", KEYS.link]);
/**
* Preserve text marks such as bold and italic when splitting around citations.
*/
function copyMarks(textNode: SlateText): Record<string, unknown> {
const { text: _text, ...marks } = textNode;
return marks;
}
function makeCitationElement(
rawText: string,
segment: { kind: "url"; url: string } | { kind: "chunk"; chunkId: number; isDocsChunk: boolean }
): CitationElementNode {
if (segment.kind === "url") {
return {
type: CITATION_TYPE,
kind: "url",
url: segment.url,
rawText,
children: [{ text: "" }],
};
}
return {
type: CITATION_TYPE,
kind: segment.isDocsChunk ? "doc" : "chunk",
chunkId: segment.chunkId,
rawText,
children: [{ text: "" }],
};
}
/**
* Keep each original citation token on the generated node for diagnostics.
*/
function extractRawCitationMatches(text: string): string[] {
const matches: string[] = [];
CITATION_REGEX.lastIndex = 0;
let m: RegExpExecArray | null = CITATION_REGEX.exec(text);
while (m !== null) {
matches.push(m[0]);
m = CITATION_REGEX.exec(text);
}
return matches;
}
function transformTextNode(node: SlateText, urlMap: CitationUrlMap): Descendant[] {
const segments = parseTextWithCitations(node.text, urlMap);
if (segments.length === 1 && typeof segments[0] === "string") {
return [node as unknown as Descendant];
}
const marks = copyMarks(node);
const rawMatches = extractRawCitationMatches(node.text);
const out: Descendant[] = [];
let citationIdx = 0;
let pendingText: string | null = null;
const flushText = () => {
// Inline voids need text siblings, even at text boundaries.
out.push({ ...marks, text: pendingText ?? "" } as unknown as Descendant);
pendingText = null;
};
for (const segment of segments) {
if (typeof segment === "string") {
pendingText = (pendingText ?? "") + segment;
} else {
flushText();
const raw = rawMatches[citationIdx] ?? "";
out.push(makeCitationElement(raw, segment) as unknown as Descendant);
citationIdx += 1;
// Ensure a trailing text sibling if the citation ends the node.
pendingText = "";
}
}
flushText();
return out;
}
function transformChildren(children: Descendant[], urlMap: CitationUrlMap): Descendant[] {
const out: Descendant[] = [];
for (const child of children) {
if (isText(child)) {
out.push(...transformTextNode(asText(child), urlMap));
continue;
}
const elementChild = asElement(child);
const elementType = (elementChild.type ?? "") as string;
if (elementType && SKIP_SUBTREE_TYPES.has(elementType)) {
out.push(child);
continue;
}
out.push({
...elementChild,
children: transformChildren(elementChild.children, urlMap),
} as unknown as Descendant);
}
return out;
}
/**
* Replace citation tokens in a deserialized Slate tree with citation inline
* void nodes. URL placeholders from `preprocessCitationMarkdown` are resolved
* through `urlMap`; skipped subtrees are returned unchanged.
*/
export function injectCitationNodes(value: Descendant[], urlMap: CitationUrlMap): Descendant[] {
return transformChildren(value, urlMap);
}