SurfSense/surfsense_web/lib/citation-search.ts
DESKTOP-RTLN3BA\$punk b9a66cb417 feat: various UI fixes, prompt optimizations, and allowing duplicate docs
- Updated `content_hash` in the `Document` model to remove global uniqueness, allowing identical content across different paths.
- Enhanced `_create_document` function to handle path uniqueness and prevent session-poisoning from `IntegrityError`.
- Added detailed comments for clarity on the changes and their implications.
- Introduced new citation handling in the editor for improved user experience with citation jumps.
- Updated package dependencies in the frontend for better functionality.
2026-04-28 21:30:53 -07:00

125 lines
5 KiB
TypeScript

/**
* Snippet generation for the citation-jump highlight, driven by Plate's
* `FindReplacePlugin`. The plugin runs `decorate` per-block and only matches
* within blocks whose children are all `Text` nodes (so it crosses inline
* marks like bold/italic but **not** block boundaries, and a block that
* contains even one inline element such as a link is silently skipped).
* That means a full chunk that spans heading + paragraph won't match as a
* single string — we have to pick a shorter snippet that fits inside one
* rendered block.
*
* `buildCitationSearchCandidates` returns search strings ordered from
* "most-specific anchor" to "broadest fallback":
* 1. First sentence of the chunk (capped at `FIRST_SENTENCE_MAX`).
* 2. First `FIRST_PHRASE_WORDS` words.
* 3. Each non-trivial line of the chunk, in source order — gives us a
* separate attempt for each rendered block, so a heading line with
* an inline link doesn't doom the whole jump.
* 4. Full chunk (only if it's already short enough to plausibly fit
* inside one block).
*
* The caller tries each candidate in turn — set the plugin's `search`
* option, `editor.api.redecorate()`, then check the editor DOM for a
* `.citation-highlight-leaf` element. First candidate that produces one
* wins; subsequent candidates are skipped.
*/
// Upper bound (in UTF-16 code units, as used by `String.prototype.slice`) on
// the first-sentence candidate; also the cut-off applied to the leading text
// when the chunk contains no `.`, `!` or `?` at all.
const FIRST_SENTENCE_MAX = 120;
// Number of leading words used for the "first phrase" candidate. The phrase
// is only emitted when the chunk has more words than this.
const FIRST_PHRASE_WORDS = 8;
// Candidates shorter than this (after whitespace normalization) are dropped,
// and a chunk that normalizes to fewer characters yields no candidates.
const MIN_SNIPPET_LENGTH = 6;
// The whole normalized chunk is offered as a candidate only when it is at
// most this long.
const FULL_CHUNK_MAX = FIRST_SENTENCE_MAX * 2;
// Maximum number of chunk lines that are turned into per-line candidates.
const MAX_LINE_CANDIDATES = 6;
// Per-line candidates are truncated to this many UTF-16 code units.
const LINE_CANDIDATE_MAX = FIRST_SENTENCE_MAX;
/**
 * Collapse every run of whitespace (spaces, tabs, line breaks, ...) into a
 * single space and drop leading/trailing whitespace.
 */
function normalizeWhitespace(input: string): string {
	// Splitting on whitespace runs leaves empty strings at the edges when the
	// input starts or ends with whitespace; filtering them out is the "trim".
	const pieces = input.split(/\s+/).filter((piece) => piece.length > 0);
	return pieces.join(" ");
}
/**
 * Strip the markdown syntax that won't survive into the rendered editor's
 * plain text, so the chunk text (which comes back from the indexer as raw
 * source markdown) can be matched against the literal text values stored
 * in Plate's Slate tree.
 *
 * Order matters:
 *  - multi-char and "container" syntax is handled before single-char
 *    emphasis, otherwise `**text**` collapses to `*text*` first;
 *  - thematic breaks (`* * *`, `- - -`, `_ _ _`) are removed before the
 *    emphasis and bullet rules, which would otherwise eat part of the rule
 *    and leave stray markers behind.
 *
 * Heuristic only — we don't aim to be a full markdown parser, just to
 * remove the common markers (`**bold**`, `[text](url)`, `# headings`,
 * `- list`, etc.) that show up in connector-doc chunks and would break
 * literal substring search.
 *
 * @param input Raw markdown text.
 * @returns The text with the recognised markers removed. Line breaks are
 *   preserved; whitespace is not otherwise normalized.
 */
export function stripMarkdownForMatch(input: string): string {
	// A line consisting solely of three or more `-`, `*` or `_` markers,
	// optionally separated by spaces/tabs.
	const thematicBreak = /^[ \t]{0,3}(?:[-*_])(?:[ \t]*[-*_]){2,}[ \t]*$/gm;

	let s = input;

	// Fenced code blocks: drop the fences and language tag, keep the body.
	s = s.replace(/```[a-z0-9_+-]*\n?([\s\S]*?)```/gi, (_, body: string) => body);
	// HTML comments become a single space.
	s = s.replace(/<!--[\s\S]*?-->/g, " ");
	// Images, inline and reference style: keep the alt text.
	s = s.replace(/!\[([^\]]*)\]\([^)]*\)/g, "$1");
	s = s.replace(/!\[([^\]]*)\]\[[^\]]*\]/g, "$1");
	// Links, inline and reference style: keep the link text.
	s = s.replace(/\[([^\]]+)\]\([^)]*\)/g, "$1");
	s = s.replace(/\[([^\]]+)\]\[[^\]]*\]/g, "$1");
	// Autolinks `<https://...>` / `<mailto:...>`: keep the target.
	s = s.replace(/<((?:https?|mailto):[^>\s]+)>/g, "$1");
	// Inline code spans: keep the code.
	s = s.replace(/`+([^`\n]+?)`+/g, "$1");

	// Early thematic-break pass. Must run before emphasis/bullet stripping:
	// the italic rule turns `* * *` into `  *` and the bullet rule turns
	// `- - -` into `- -`, neither of which is recognisable afterwards.
	s = s.replace(thematicBreak, "");

	// Strong emphasis first, then single-marker emphasis. The lookarounds
	// keep intra-word markers such as `snake_case` intact.
	s = s.replace(/(\*\*|__)([\s\S]+?)\1/g, "$2");
	s = s.replace(/(?<!\w)([*_])([^*_\n]+?)\1(?!\w)/g, "$2");
	// Strikethrough.
	s = s.replace(/~~([^~]+)~~/g, "$1");

	// ATX heading markers.
	s = s.replace(/^[ \t]{0,3}#{1,6}[ \t]+/gm, "");
	// Setext heading underlines (`===` / `---`).
	s = s.replace(/^[ \t]{0,3}(?:=+|-+)[ \t]*$/gm, "");
	// Blockquote markers, including spaced nested ones such as `> > text`.
	s = s.replace(/^[ \t]{0,3}(?:>[ \t]?)+/gm, "");
	// Bullet and ordered list markers.
	s = s.replace(/^[ \t]*[-*+][ \t]+/gm, "");
	s = s.replace(/^[ \t]*\d+\.[ \t]+/gm, "");

	// Late thematic-break pass: catches rules that only became a bare line
	// after a blockquote or list marker was stripped (e.g. `> ***`).
	s = s.replace(thematicBreak, "");

	// Table delimiter rows (`| --- | :-: |`).
	s = s.replace(/^[ \t]*\|?(?:[ \t]*:?-+:?[ \t]*\|)+[ \t]*:?-+:?[ \t]*\|?[ \t]*$/gm, "");
	// Backslash escapes: keep the escaped character. Done last so escaped
	// markers are not mistaken for syntax by the rules above.
	s = s.replace(/\\([\\`*_{}[\]()#+\-.!~>])/g, "$1");
	return s;
}
/**
 * Build the ordered list of search strings to try when locating a cited
 * chunk in the rendered editor.
 *
 * Candidates, in order (duplicates and anything shorter than
 * `MIN_SNIPPET_LENGTH` are skipped):
 *  1. the first sentence, truncated to `FIRST_SENTENCE_MAX` — or, when the
 *     text has no sentence terminator and is longer than that limit, its
 *     first `FIRST_SENTENCE_MAX` characters;
 *  2. the first `FIRST_PHRASE_WORDS` words, when the text has more words
 *     than that;
 *  3. up to `MAX_LINE_CANDIDATES` individual lines, each truncated to
 *     `LINE_CANDIDATE_MAX`;
 *  4. the whole text, when it is no longer than `FULL_CHUNK_MAX`.
 *
 * @param rawText Chunk text as raw markdown.
 * @returns Whitespace-normalized candidate strings; empty when `rawText` is
 *   empty or too short once markdown is stripped.
 */
export function buildCitationSearchCandidates(rawText: string): string[] {
	if (!rawText) return [];

	const plain = stripMarkdownForMatch(rawText);
	const flat = normalizeWhitespace(plain);
	if (flat.length < MIN_SNIPPET_LENGTH) return [];

	// A Set iterates in insertion order and ignores repeated adds, so it
	// serves as both the de-duplication check and the ordered result list.
	const candidates = new Set<string>();
	const addCandidate = (snippet: string): void => {
		const cleaned = normalizeWhitespace(snippet);
		if (cleaned.length < MIN_SNIPPET_LENGTH) return;
		candidates.add(cleaned);
	};

	// 1. First sentence (or a fixed-size prefix when there is none).
	const sentenceHit = /^[^.!?]+[.!?]/.exec(flat);
	if (sentenceHit !== null) {
		addCandidate(sentenceHit[0].slice(0, FIRST_SENTENCE_MAX));
	} else if (flat.length > FIRST_SENTENCE_MAX) {
		addCandidate(flat.slice(0, FIRST_SENTENCE_MAX));
	}

	// 2. Leading phrase.
	const tokens = flat.split(" ").filter((token) => token.length > 0);
	if (tokens.length > FIRST_PHRASE_WORDS) {
		addCandidate(tokens.slice(0, FIRST_PHRASE_WORDS).join(" "));
	}

	// 3. Per-line candidates: each chunk line is roughly one block in the
	// rendered editor, so every line gets its own decorate attempt. That
	// matters when the first line is a heading containing a link (Plate's
	// `FindReplacePlugin` skips any block whose children aren't all text
	// nodes). Lines that are too short don't count towards the limit.
	const usableLines = plain
		.split(/\r?\n/)
		.map((line) => normalizeWhitespace(line))
		.filter((line) => line.length >= MIN_SNIPPET_LENGTH)
		.slice(0, MAX_LINE_CANDIDATES);
	for (const line of usableLines) {
		addCandidate(line.slice(0, LINE_CANDIDATE_MAX));
	}

	// 4. Whole chunk, only when short enough to plausibly sit in one block.
	if (flat.length <= FULL_CHUNK_MAX) {
		addCandidate(flat);
	}

	return Array.from(candidates);
}