feat: various UI fixes, prompt optimizations, and allowing duplicate docs

- Updated `content_hash` in the `Document` model to remove global uniqueness, allowing identical content across different paths.
- Enhanced the `_create_document` function to handle path uniqueness and prevent session-poisoning from `IntegrityError`.
- Added detailed comments for clarity on the changes and their implications.
- Introduced new citation handling in the editor for improved user experience with citation jumps.
- Updated package dependencies in the frontend for better functionality.
2026-05-15 18:25:18 +02:00 · 2026-04-28 21:30:53 -07:00 · 2026-04-28 21:30:53 -07:00 · b9a66cb417
commit b9a66cb417
parent e6433f78c4
26 changed files with 1540 additions and 852 deletions
--- a/surfsense_web/lib/citation-search.ts
+++ b/surfsense_web/lib/citation-search.ts
@ -0,0 +1,125 @@
+/**
+ * Snippet generation for the citation-jump highlight, driven by Plate's
+ * `FindReplacePlugin`. The plugin runs `decorate` per-block and only matches
+ * within blocks whose children are all `Text` nodes (so it crosses inline
+ * marks like bold/italic but **not** block boundaries, and a block that
+ * contains even one inline element such as a link is silently skipped).
+ * That means a full chunk that spans heading + paragraph won't match as a
+ * single string — we have to pick a shorter snippet that fits inside one
+ * rendered block.
+ *
+ * `buildCitationSearchCandidates` returns search strings ordered from
+ * "most-specific anchor" to "broadest fallback":
+ *   1. First sentence of the chunk (capped at `FIRST_SENTENCE_MAX`).
+ *   2. First `FIRST_PHRASE_WORDS` words.
+ *   3. Each non-trivial line of the chunk, in source order — gives us a
+ *      separate attempt for each rendered block, so a heading line with
+ *      an inline link doesn't doom the whole jump.
+ *   4. Full chunk (only if it's already short enough to plausibly fit
+ *      inside one block).
+ *
+ * The caller tries each candidate in turn — set the plugin's `search`
+ * option, `editor.api.redecorate()`, then check the editor DOM for a
+ * `.citation-highlight-leaf` element. First candidate that produces one
+ * wins; subsequent candidates are skipped.
+ */
+
+const FIRST_SENTENCE_MAX = 120; // max characters kept from the first-sentence candidate (and from the leading-prefix fallback when no sentence terminator is found)
+const FIRST_PHRASE_WORDS = 8; // number of leading words used for the "first phrase" candidate
+const MIN_SNIPPET_LENGTH = 6; // candidates shorter than this after whitespace normalization are discarded
+const FULL_CHUNK_MAX = FIRST_SENTENCE_MAX * 2; // the whole chunk is offered as a candidate only when its normalized length is at most this
+const MAX_LINE_CANDIDATES = 6; // at most this many sufficiently long chunk lines are tried as per-line candidates
+const LINE_CANDIDATE_MAX = FIRST_SENTENCE_MAX; // max characters kept from each per-line candidate
+
+/**
+ * Collapse every run of whitespace (spaces, tabs, line breaks) into a single
+ * space and drop any leading or trailing whitespace.
+ *
+ * @param input - Text to normalize.
+ * @returns The single-spaced, trimmed text; an empty string when `input`
+ *   is empty or whitespace-only.
+ */
+function normalizeWhitespace(input: string): string {
+	// Trimming first means the split never yields empty edge segments, so
+	// re-joining with one space gives the collapsed form directly.
+	return input.trim().split(/\s+/).join(" ");
+}
+
+/**
+ * Remove common markdown syntax from `input` so the remaining text can be
+ * used for literal substring search against rendered plain text.
+ *
+ * This is a heuristic, not a markdown parser: it applies a fixed sequence
+ * of regex substitutions. The sequence is order-sensitive — fenced code,
+ * comments, images and links are unwrapped before emphasis markers, and
+ * two-character strong markers (`**`, `__`) are handled before the
+ * single-character ones so `**text**` is not first reduced to `*text*`.
+ *
+ * @param input - Raw markdown source text.
+ * @returns The text with the recognized markers removed. Whitespace and
+ *   line breaks are otherwise left as they were.
+ */
+export function stripMarkdownForMatch(input: string): string {
+	// Fenced code blocks: drop the fences (and optional language tag), keep the body.
+	const unfenced = input.replace(
+		/```[a-z0-9_+-]*\n?([\s\S]*?)```/gi,
+		(_match, body: string) => body
+	);
+
+	// Remaining substitutions, applied strictly in this order.
+	const rules: ReadonlyArray<readonly [RegExp, string]> = [
+		// HTML comments become a single space.
+		[/<!--[\s\S]*?-->/g, " "],
+		// Inline images `![alt](src)` -> alt text.
+		[/!\[([^\]]*)\]\([^)]*\)/g, "$1"],
+		// Reference-style images `![alt][ref]` -> alt text.
+		[/!\[([^\]]*)\]\[[^\]]*\]/g, "$1"],
+		// Inline links `[label](url)` -> label.
+		[/\[([^\]]+)\]\([^)]*\)/g, "$1"],
+		// Reference-style links `[label][ref]` -> label.
+		[/\[([^\]]+)\]\[[^\]]*\]/g, "$1"],
+		// Autolinks `<http(s):...>` / `<mailto:...>` -> bare target.
+		[/<((?:https?|mailto):[^>\s]+)>/g, "$1"],
+		// Inline code spans -> their content.
+		[/`+([^`\n]+?)`+/g, "$1"],
+		// Strong emphasis `**x**` / `__x__` -> x.
+		[/(\*\*|__)([\s\S]+?)\1/g, "$2"],
+		// Single-marker emphasis `*x*` / `_x_` not touching word characters -> x.
+		[/(?<!\w)([*_])([^*_\n]+?)\1(?!\w)/g, "$2"],
+		// Strikethrough `~~x~~` -> x.
+		[/~~([^~]+)~~/g, "$1"],
+		// Leading `#` heading markers.
+		[/^[ \t]{0,3}#{1,6}[ \t]+/gm, ""],
+		// Lines made only of `=` or `-` (heading underlines).
+		[/^[ \t]{0,3}(?:=+|-+)[ \t]*$/gm, ""],
+		// Leading `>` blockquote markers.
+		[/^[ \t]{0,3}>+[ \t]?/gm, ""],
+		// Leading `-`, `*`, `+` bullet markers.
+		[/^[ \t]*[-*+][ \t]+/gm, ""],
+		// Leading `1.`-style ordered-list markers.
+		[/^[ \t]*\d+\.[ \t]+/gm, ""],
+		// Lines of three or more `-`, `*`, `_` (optionally space-separated).
+		[/^[ \t]{0,3}(?:[-*_])(?:[ \t]*[-*_]){2,}[ \t]*$/gm, ""],
+		// Table delimiter rows such as `| --- | :-: |`.
+		[/^[ \t]*\|?(?:[ \t]*:?-+:?[ \t]*\|)+[ \t]*:?-+:?[ \t]*\|?[ \t]*$/gm, ""],
+		// Backslash escapes `\*`, `\#`, ... -> the escaped character.
+		[/\\([\\`*_{}[\]()#+\-.!~>])/g, "$1"],
+	];
+
+	return rules.reduce(
+		(text, [pattern, replacement]) => text.replace(pattern, replacement),
+		unfenced
+	);
+}
+
+/**
+ * Build the ordered list of search strings to try for a cited chunk.
+ *
+ * The chunk is stripped of markdown and whitespace-normalized, then
+ * candidates are collected in this order:
+ *   1. The leading sentence (text up to the first `.`, `!` or `?`), cut to
+ *      `FIRST_SENTENCE_MAX` characters; if there is no such terminator, the
+ *      first `FIRST_SENTENCE_MAX` characters, but only when the text is
+ *      longer than that.
+ *   2. The first `FIRST_PHRASE_WORDS` words, when the text has more words
+ *      than that.
+ *   3. The first `MAX_LINE_CANDIDATES` lines that are at least
+ *      `MIN_SNIPPET_LENGTH` long, each cut to `LINE_CANDIDATE_MAX` characters.
+ *   4. The whole normalized text, when it is at most `FULL_CHUNK_MAX` long.
+ *
+ * Every candidate is whitespace-normalized; candidates shorter than
+ * `MIN_SNIPPET_LENGTH` and exact duplicates are dropped.
+ *
+ * @param rawText - Chunk text, possibly containing markdown.
+ * @returns Candidate strings in try-order; empty when `rawText` is empty or
+ *   its stripped form is shorter than `MIN_SNIPPET_LENGTH`.
+ */
+export function buildCitationSearchCandidates(rawText: string): string[] {
+	if (!rawText) return [];
+
+	const plain = stripMarkdownForMatch(rawText);
+	const flat = normalizeWhitespace(plain);
+	if (flat.length < MIN_SNIPPET_LENGTH) return [];
+
+	const candidates: string[] = [];
+	const emitted = new Set<string>();
+
+	// Normalize, then keep the snippet only if it is long enough and new.
+	const offer = (snippet: string): void => {
+		const cleaned = normalizeWhitespace(snippet);
+		if (cleaned.length < MIN_SNIPPET_LENGTH || emitted.has(cleaned)) return;
+		emitted.add(cleaned);
+		candidates.push(cleaned);
+	};
+
+	// 1. Leading sentence, or a fixed-length prefix when no terminator exists.
+	const leadingSentence = /^[^.!?]+[.!?]/.exec(flat);
+	if (leadingSentence !== null) {
+		offer(leadingSentence[0].slice(0, FIRST_SENTENCE_MAX));
+	} else if (flat.length > FIRST_SENTENCE_MAX) {
+		offer(flat.slice(0, FIRST_SENTENCE_MAX));
+	}
+
+	// 2. Leading phrase of a fixed number of words.
+	const tokens = flat.split(" ").filter((word) => word.length > 0);
+	if (tokens.length > FIRST_PHRASE_WORDS) {
+		offer(tokens.slice(0, FIRST_PHRASE_WORDS).join(" "));
+	}
+
+	// 3. Per-line candidates. Lines are taken from the stripped (not yet
+	// flattened) text so each source line gets its own attempt; only lines
+	// that meet the minimum length count toward the cap.
+	const usableLines = plain
+		.split(/\r?\n/)
+		.map((line) => normalizeWhitespace(line))
+		.filter((line) => line.length >= MIN_SNIPPET_LENGTH)
+		.slice(0, MAX_LINE_CANDIDATES);
+	for (const line of usableLines) {
+		offer(line.slice(0, LINE_CANDIDATE_MAX));
+	}
+
+	// 4. Whole chunk, only when it is short.
+	if (flat.length <= FULL_CHUNK_MAX) {
+		offer(flat);
+	}
+
+	return candidates;
+}