feat: fixed live connectors citations

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-02-20 16:45:50 -08:00
parent ce46708064
commit 81dfc7102f
14 changed files with 189 additions and 92 deletions

View file

@ -523,6 +523,7 @@ CRITICAL CITATION REQUIREMENTS:
<document_structure_example>
The documents you receive are structured like this:
**Knowledge base documents (numeric chunk IDs):**
<document>
<document_metadata>
<document_id>42</document_id>
@ -538,7 +539,24 @@ The documents you receive are structured like this:
</document_content>
</document>
IMPORTANT: You MUST cite using the chunk ids (e.g. 123, 124, doc-45). Do NOT cite document_id.
**Live web search results (URL chunk IDs):**
<document>
<document_metadata>
<document_id>TAVILY_API::Some Title::https://example.com/article</document_id>
<document_type>TAVILY_API</document_type>
<title><![CDATA[Some web search result]]></title>
<url><![CDATA[https://example.com/article]]></url>
</document_metadata>
<document_content>
<chunk id='https://example.com/article'><![CDATA[Content from web search...]]></chunk>
</document_content>
</document>
IMPORTANT: You MUST cite using the EXACT chunk ids from the `<chunk id='...'>` tags.
- For knowledge base documents, chunk ids are numeric (e.g. 123, 124) or prefixed (e.g. doc-45).
- For live web search results, chunk ids are URLs (e.g. https://example.com/article).
Do NOT cite document_id. Always use the chunk id.
</document_structure_example>
<citation_format>
@ -550,13 +568,15 @@ IMPORTANT: You MUST cite using the chunk ids (e.g. 123, 124, doc-45). Do NOT cit
- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
- NEVER make up chunk IDs if you are unsure about the chunk_id. It is better to omit the citation than to guess
- Copy the EXACT chunk id from the XML - if it says `<chunk id='doc-123'>`, use [citation:doc-123]
- If the chunk id is a URL like `<chunk id='https://example.com/page'>`, use [citation:https://example.com/page]
</citation_format>
<citation_examples>
CORRECT citation formats:
- [citation:5]
- [citation:5] (numeric chunk ID from knowledge base)
- [citation:doc-123] (for Surfsense documentation chunks)
- [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
- [citation:https://example.com/article] (URL chunk ID from web search results)
- [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3] (multiple citations)
INCORRECT citation formats (DO NOT use):
- Using parentheses and markdown links: ([citation:5](https://github.com/MODSetter/SurfSense))
@ -571,7 +591,7 @@ INCORRECT citation formats (DO NOT use):
<citation_output_example>
Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [citation:5]. It's particularly useful for I/O-bound and high-level structured network code [citation:5].
The key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:12]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
According to web search results, the key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:https://docs.python.org/3/library/asyncio.html]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [citation:12]. For computationally intensive work, you'd want to use multiprocessing instead.
</citation_output_example>

View file

@ -210,6 +210,7 @@ def format_documents_for_context(documents: list[dict[str, Any]]) -> str:
source = (
(doc.get("source") if isinstance(doc, dict) else None)
or document_info.get("document_type")
or metadata.get("document_type")
or "UNKNOWN"
)
@ -268,10 +269,20 @@ def format_documents_for_context(documents: list[dict[str, Any]]) -> str:
continue
grouped[doc_key]["chunks"].append({"chunk_id": chunk_id, "content": content})
# Live search connectors whose results should be cited by URL rather than
# a numeric chunk_id (the numeric IDs are meaningless auto-incremented counters).
_LIVE_SEARCH_CONNECTORS = {
"TAVILY_API",
"SEARXNG_API",
"LINKUP_API",
"BAIDU_SEARCH_API",
}
# Render XML expected by citation instructions
parts: list[str] = []
for g in grouped.values():
metadata_json = json.dumps(g["metadata"], ensure_ascii=False)
is_live_search = g["document_type"] in _LIVE_SEARCH_CONNECTORS
parts.append("<document>")
parts.append("<document_metadata>")
@ -286,7 +297,10 @@ def format_documents_for_context(documents: list[dict[str, Any]]) -> str:
for ch in g["chunks"]:
ch_content = ch["content"]
ch_id = ch["chunk_id"]
# For live search connectors, use the document URL as the chunk id
# so the LLM outputs [citation:https://...] which the frontend
# renders as a clickable link.
ch_id = g["url"] if (is_live_search and g["url"]) else ch["chunk_id"]
if ch_id is None:
parts.append(f" <chunk><![CDATA[{ch_content}]]></chunk>")
else:

View file

@ -418,7 +418,7 @@ export const ConnectorIndicator: FC<{ hideTrigger?: boolean }> = ({ hideTrigger
</div>
</div>
{/* Bottom fade shadow */}
<div className="absolute bottom-0 left-0 right-0 h-7 bg-gradient-to-t from-muted via-muted/80 to-transparent pointer-events-none z-10" />
<div className="absolute bottom-0 left-0 right-0 h-7 bg-linear-to-t from-muted via-muted/80 to-transparent pointer-events-none z-10" />
</div>
</Tabs>
)}

View file

@ -57,6 +57,7 @@ export const BaiduSearchApiConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSu
BAIDU_API_KEY: values.api_key,
},
is_indexable: false,
is_active: true,
last_indexed_at: null,
periodic_indexing_enabled: false,
indexing_frequency_minutes: null,

View file

@ -51,6 +51,7 @@ export const CirclebackConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmit
connector_type: EnumConnectorName.CIRCLEBACK_CONNECTOR,
config: {},
is_indexable: false,
is_active: true,
last_indexed_at: null,
periodic_indexing_enabled: false,
indexing_frequency_minutes: null,

View file

@ -155,6 +155,7 @@ export const ElasticsearchConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSub
connector_type: EnumConnectorName.ELASTICSEARCH_CONNECTOR,
config,
is_indexable: true,
is_active: true,
last_indexed_at: null,
periodic_indexing_enabled: periodicEnabled,
indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null,

View file

@ -57,6 +57,7 @@ export const LinkupApiConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitt
LINKUP_API_KEY: values.api_key,
},
is_indexable: false,
is_active: true,
last_indexed_at: null,
periodic_indexing_enabled: false,
indexing_frequency_minutes: null,

View file

@ -71,6 +71,7 @@ export const LumaConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting }
LUMA_API_KEY: values.api_key,
},
is_indexable: true,
is_active: true,
last_indexed_at: null,
periodic_indexing_enabled: periodicEnabled,
indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null,

View file

@ -110,6 +110,7 @@ export const SearxngConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmittin
connector_type: EnumConnectorName.SEARXNG_API,
config,
is_indexable: false,
is_active: true,
last_indexed_at: null,
periodic_indexing_enabled: false,
indexing_frequency_minutes: null,

View file

@ -57,6 +57,7 @@ export const TavilyApiConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitt
TAVILY_API_KEY: values.api_key,
},
is_indexable: false,
is_active: true,
last_indexed_at: null,
periodic_indexing_enabled: false,
indexing_frequency_minutes: null,

View file

@ -1,7 +1,7 @@
"use client";
import { ArrowLeft } from "lucide-react";
import { type FC, useMemo } from "react";
import { type FC, useMemo, useRef } from "react";
import { Button } from "@/components/ui/button";
import { Spinner } from "@/components/ui/spinner";
import type { EnumConnectorName } from "@/contracts/enums/connector";
@ -9,6 +9,20 @@ import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
import { getConnectorTypeDisplay } from "@/lib/connectors/utils";
import { getConnectFormComponent } from "../../connect-forms";
const FORM_ID_MAP: Record<string, string> = {
TAVILY_API: "tavily-connect-form",
SEARXNG_API: "searxng-connect-form",
LINKUP_API: "linkup-api-connect-form",
BAIDU_SEARCH_API: "baidu-search-api-connect-form",
ELASTICSEARCH_CONNECTOR: "elasticsearch-connect-form",
BOOKSTACK_CONNECTOR: "bookstack-connect-form",
GITHUB_CONNECTOR: "github-connect-form",
LUMA_CONNECTOR: "luma-connect-form",
CIRCLEBACK_CONNECTOR: "circleback-connect-form",
MCP_CONNECTOR: "mcp-connect-form",
OBSIDIAN_CONNECTOR: "obsidian-connect-form",
};
interface ConnectorConnectViewProps {
connectorType: string;
onSubmit: (data: {
@ -35,6 +49,7 @@ export const ConnectorConnectView: FC<ConnectorConnectViewProps> = ({
onBack,
isSubmitting,
}) => {
const formContainerRef = useRef<HTMLDivElement | null>(null);
// Get connector-specific form component
const ConnectFormComponent = useMemo(
() => getConnectFormComponent(connectorType),
@ -46,27 +61,19 @@ export const ConnectorConnectView: FC<ConnectorConnectViewProps> = ({
if (isSubmitting) {
return;
}
// Map connector types to their form IDs
const formIdMap: Record<string, string> = {
TAVILY_API: "tavily-connect-form",
SEARXNG_API: "searxng-connect-form",
LINKUP_API: "linkup-api-connect-form",
BAIDU_SEARCH_API: "baidu-search-api-connect-form",
ELASTICSEARCH_CONNECTOR: "elasticsearch-connect-form",
BOOKSTACK_CONNECTOR: "bookstack-connect-form",
GITHUB_CONNECTOR: "github-connect-form",
LUMA_CONNECTOR: "luma-connect-form",
CIRCLEBACK_CONNECTOR: "circleback-connect-form",
MCP_CONNECTOR: "mcp-connect-form",
OBSIDIAN_CONNECTOR: "obsidian-connect-form",
};
const formId = formIdMap[connectorType];
if (formId) {
const form = document.getElementById(formId) as HTMLFormElement;
const formId = FORM_ID_MAP[connectorType];
const root = formContainerRef.current;
const mappedForm =
root && formId
? (root.querySelector(`[id="${formId}"]`) as HTMLFormElement | null)
: null;
// Fallback to currently rendered form to avoid silent no-op
// when a connector type or form id mapping drifts.
const fallbackForm = root?.querySelector("form") as HTMLFormElement | null;
const form = mappedForm ?? fallbackForm;
if (form) {
form.requestSubmit();
}
}
};
if (!ConnectFormComponent) {
@ -114,7 +121,10 @@ export const ConnectorConnectView: FC<ConnectorConnectViewProps> = ({
</div>
{/* Form Content - Scrollable */}
<div className="flex-1 min-h-0 overflow-y-auto px-6 sm:px-12">
<div
ref={formContainerRef}
className="connector-connect-form-root flex-1 min-h-0 overflow-y-auto px-6 sm:px-12"
>
<ConnectFormComponent
onSubmit={onSubmit}
onBack={onBack}
@ -134,6 +144,7 @@ export const ConnectorConnectView: FC<ConnectorConnectViewProps> = ({
Cancel
</Button>
<Button
type="button"
onClick={handleFormSubmit}
disabled={isSubmitting}
className="text-xs sm:text-sm min-w-[140px] disabled:opacity-50 disabled:cursor-not-allowed disabled:pointer-events-none"

View file

@ -558,7 +558,9 @@ export const useConnectorDialog = () => {
},
onIndexingStart?: (connectorId: number) => void
) => {
if (!searchSpaceId || !connectingConnectorType) return;
if (!searchSpaceId || !connectingConnectorType) {
return;
}
// Prevent multiple submissions using ref for immediate check
if (isCreatingConnectorRef.current) return;
@ -582,7 +584,6 @@ export const useConnectorDialog = () => {
search_space_id: searchSpaceId,
},
});
// Refetch connectors to get the new one
const result = await refetchAllConnectors();
if (result.data) {

View file

@ -2,22 +2,20 @@
import type { FC } from "react";
import { useState } from "react";
import { ExternalLink } from "lucide-react";
import { SourceDetailPanel } from "@/components/new-chat/source-detail-panel";
interface InlineCitationProps {
chunkId: number;
citationNumber: number;
isDocsChunk?: boolean;
}
/**
* Inline citation component for the new chat.
* Renders a clickable numbered badge that opens the SourceDetailPanel with document chunk details.
* Supports both regular knowledge base chunks and Surfsense documentation chunks.
* Inline citation for knowledge-base chunks (numeric chunk IDs).
* Renders a clickable badge showing the actual chunk ID that opens the SourceDetailPanel.
*/
export const InlineCitation: FC<InlineCitationProps> = ({
chunkId,
citationNumber,
isDocsChunk = false,
}) => {
const [isOpen, setIsOpen] = useState(false);
@ -37,12 +35,46 @@ export const InlineCitation: FC<InlineCitationProps> = ({
onClick={() => setIsOpen(true)}
onKeyDown={(e) => e.key === "Enter" && setIsOpen(true)}
className="text-[10px] font-bold bg-primary/80 hover:bg-primary text-primary-foreground rounded-full min-w-4 h-4 px-1 inline-flex items-center justify-center align-super cursor-pointer transition-colors ml-0.5"
title={`View source #${citationNumber}`}
title={`View source chunk #${chunkId}`}
role="button"
tabIndex={0}
>
{citationNumber}
{chunkId}
</span>
</SourceDetailPanel>
);
};
function extractDomain(url: string): string {
try {
const hostname = new URL(url).hostname;
return hostname.replace(/^www\./, "");
} catch {
return url;
}
}
interface UrlCitationProps {
url: string;
}
/**
* Inline citation for live web search results (URL-based chunk IDs).
* Renders a clickable badge showing the source domain that opens the URL in a new tab.
*/
export const UrlCitation: FC<UrlCitationProps> = ({ url }) => {
const domain = extractDomain(url);
return (
<a
href={url}
target="_blank"
rel="noopener noreferrer"
className="text-[10px] font-bold bg-primary/80 hover:bg-primary text-primary-foreground rounded-full h-4 px-1.5 inline-flex items-center gap-0.5 align-super cursor-pointer transition-colors ml-0.5 no-underline"
title={url}
>
<ExternalLink className="size-2.5 shrink-0" />
{domain}
</a>
);
};

View file

@ -14,17 +14,36 @@ import rehypeKatex from "rehype-katex";
import remarkGfm from "remark-gfm";
import remarkMath from "remark-math";
import "katex/dist/katex.min.css";
import { InlineCitation } from "@/components/assistant-ui/inline-citation";
import { InlineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button";
import { cn } from "@/lib/utils";
// Storage for URL citations replaced during preprocess to avoid GFM autolink interference.
// Populated in preprocessMarkdown, consumed in parseTextWithCitations.
let _pendingUrlCitations = new Map<string, string>();
let _urlCiteIdx = 0;
/**
* Convert all LaTeX delimiter styles to the dollar-sign syntax
* that remark-math understands. LLMs use various delimiters
* (\(...\), \[...\], \begin{equation}, etc.) and we need to
* normalise them all to $ / $$ before the markdown parser runs.
* Preprocess raw markdown before it reaches the remark/rehype pipeline.
* - Replaces URL-based citations with safe placeholders (prevents GFM autolinks)
* - Normalises LaTeX delimiters to dollar-sign syntax for remark-math
*/
function convertLatexDelimiters(content: string): string {
function preprocessMarkdown(content: string): string {
// Replace URL-based citations with safe placeholders BEFORE markdown parsing.
// GFM autolinks would otherwise convert the https://... inside [citation:URL]
// into an <a> element, splitting the text and preventing our citation regex
// from matching the full pattern.
_pendingUrlCitations = new Map();
_urlCiteIdx = 0;
content = content.replace(
/[[【]\u200B?citation:\s*(https?:\/\/[^\]\】\u200B]+)\s*\u200B?[\]】]/g,
(_, url) => {
const key = `urlcite${_urlCiteIdx++}`;
_pendingUrlCitations.set(key, url.trim());
return `[citation:${key}]`;
}
);
// 1. Block math: \[...\] → $$...$$
content = content.replace(/\\\[([\s\S]*?)\\\]/g, (_, inner) => `$$${inner}$$`);
// 2. Inline math: \(...\) → $...$
@ -50,40 +69,19 @@ function convertLatexDelimiters(content: string): string {
return content;
}
// Citation pattern: [citation:CHUNK_ID] or [citation:doc-CHUNK_ID]
// Also matches Chinese brackets 【】 and handles zero-width spaces that LLM sometimes inserts
const CITATION_REGEX = /[[【]\u200B?citation:(doc-)?(\d+)\u200B?[\]】]/g;
// Track chunk IDs to citation numbers mapping for consistent numbering
// This map is reset when a new message starts rendering
// Uses string keys to differentiate between doc and regular chunks (e.g., "doc-123" vs "123")
let chunkIdToCitationNumber: Map<string, number> = new Map();
let nextCitationNumber = 1;
// Matches [citation:...] with numeric IDs (incl. doc- prefix, comma-separated),
// URL-based IDs from live web search, or urlciteN placeholders from preprocess.
// Also matches Chinese brackets 【】 and handles zero-width spaces that LLM sometimes inserts.
const CITATION_REGEX =
/[[【]\u200B?citation:\s*(https?:\/\/[^\]\】\u200B]+|urlcite\d+|(?:doc-)?\d+(?:\s*,\s*(?:doc-)?\d+)*)\s*\u200B?[\]】]/g;
/**
* Resets the citation counter - should be called at the start of each message
*/
export function resetCitationCounter() {
chunkIdToCitationNumber = new Map();
nextCitationNumber = 1;
}
/**
* Gets or assigns a citation number for a chunk ID
* Uses string key to differentiate between doc and regular chunks
*/
function getCitationNumber(chunkId: number, isDocsChunk: boolean): number {
const key = isDocsChunk ? `doc-${chunkId}` : String(chunkId);
const existingNumber = chunkIdToCitationNumber.get(key);
if (existingNumber === undefined) {
chunkIdToCitationNumber.set(key, nextCitationNumber++);
}
return chunkIdToCitationNumber.get(key)!;
}
/**
* Parses text and replaces [citation:XXX] patterns with InlineCitation components
* Supports both regular chunks [citation:123] and docs chunks [citation:doc-123]
* Parses text and replaces [citation:XXX] patterns with citation components.
* Supports:
* - Numeric chunk IDs: [citation:123]
* - Doc-prefixed IDs: [citation:doc-123]
* - Comma-separated IDs: [citation:4149, 4150, 4151]
* - URL-based citations from live search: [citation:https://example.com/page]
*/
function parseTextWithCitations(text: string): ReactNode[] {
const parts: ReactNode[] = [];
@ -91,35 +89,49 @@ function parseTextWithCitations(text: string): ReactNode[] {
let match: RegExpExecArray | null;
let instanceIndex = 0;
// Reset regex state
CITATION_REGEX.lastIndex = 0;
match = CITATION_REGEX.exec(text);
while (match !== null) {
// Add text before the citation
if (match.index > lastIndex) {
parts.push(text.substring(lastIndex, match.index));
}
// Check if this is a docs chunk (has "doc-" prefix)
const isDocsChunk = match[1] === "doc-";
const chunkId = Number.parseInt(match[2], 10);
const citationNumber = getCitationNumber(chunkId, isDocsChunk);
const captured = match[1];
if (captured.startsWith("http://") || captured.startsWith("https://")) {
parts.push(
<UrlCitation key={`citation-url-${instanceIndex}`} url={captured.trim()} />
);
instanceIndex++;
} else if (captured.startsWith("urlcite")) {
const url = _pendingUrlCitations.get(captured);
if (url) {
parts.push(
<UrlCitation key={`citation-url-${instanceIndex}`} url={url} />
);
}
instanceIndex++;
} else {
const rawIds = captured.split(",").map((s) => s.trim());
for (const rawId of rawIds) {
const isDocsChunk = rawId.startsWith("doc-");
const chunkId = Number.parseInt(isDocsChunk ? rawId.slice(4) : rawId, 10);
parts.push(
<InlineCitation
key={`citation-${isDocsChunk ? "doc-" : ""}${chunkId}-${instanceIndex}`}
chunkId={chunkId}
citationNumber={citationNumber}
isDocsChunk={isDocsChunk}
/>
);
instanceIndex++;
}
}
lastIndex = match.index + match[0].length;
instanceIndex++;
match = CITATION_REGEX.exec(text);
}
// Add any remaining text after the last citation
if (lastIndex < text.length) {
parts.push(text.substring(lastIndex));
}
@ -134,7 +146,7 @@ const MarkdownTextImpl = () => {
rehypePlugins={[rehypeKatex]}
className="aui-md"
components={defaultComponents}
preprocess={convertLatexDelimiters}
preprocess={preprocessMarkdown}
/>
);
};