mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-25 19:15:18 +02:00
feat: fixed live connectors citations
This commit is contained in:
parent
ce46708064
commit
81dfc7102f
14 changed files with 189 additions and 92 deletions
|
|
@ -523,6 +523,7 @@ CRITICAL CITATION REQUIREMENTS:
|
|||
<document_structure_example>
|
||||
The documents you receive are structured like this:
|
||||
|
||||
**Knowledge base documents (numeric chunk IDs):**
|
||||
<document>
|
||||
<document_metadata>
|
||||
<document_id>42</document_id>
|
||||
|
|
@ -538,7 +539,24 @@ The documents you receive are structured like this:
|
|||
</document_content>
|
||||
</document>
|
||||
|
||||
IMPORTANT: You MUST cite using the chunk ids (e.g. 123, 124, doc-45). Do NOT cite document_id.
|
||||
**Live web search results (URL chunk IDs):**
|
||||
<document>
|
||||
<document_metadata>
|
||||
<document_id>TAVILY_API::Some Title::https://example.com/article</document_id>
|
||||
<document_type>TAVILY_API</document_type>
|
||||
<title><![CDATA[Some web search result]]></title>
|
||||
<url><![CDATA[https://example.com/article]]></url>
|
||||
</document_metadata>
|
||||
|
||||
<document_content>
|
||||
<chunk id='https://example.com/article'><![CDATA[Content from web search...]]></chunk>
|
||||
</document_content>
|
||||
</document>
|
||||
|
||||
IMPORTANT: You MUST cite using the EXACT chunk ids from the `<chunk id='...'>` tags.
|
||||
- For knowledge base documents, chunk ids are numeric (e.g. 123, 124) or prefixed (e.g. doc-45).
|
||||
- For live web search results, chunk ids are URLs (e.g. https://example.com/article).
|
||||
Do NOT cite document_id. Always use the chunk id.
|
||||
</document_structure_example>
|
||||
|
||||
<citation_format>
|
||||
|
|
@ -550,13 +568,15 @@ IMPORTANT: You MUST cite using the chunk ids (e.g. 123, 124, doc-45). Do NOT cit
|
|||
- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
|
||||
- NEVER make up chunk IDs if you are unsure about the chunk_id. It is better to omit the citation than to guess
|
||||
- Copy the EXACT chunk id from the XML - if it says `<chunk id='doc-123'>`, use [citation:doc-123]
|
||||
- If the chunk id is a URL like `<chunk id='https://example.com/page'>`, use [citation:https://example.com/page]
|
||||
</citation_format>
|
||||
|
||||
<citation_examples>
|
||||
CORRECT citation formats:
|
||||
- [citation:5]
|
||||
- [citation:5] (numeric chunk ID from knowledge base)
|
||||
- [citation:doc-123] (for Surfsense documentation chunks)
|
||||
- [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
|
||||
- [citation:https://example.com/article] (URL chunk ID from web search results)
|
||||
- [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3] (multiple citations)
|
||||
|
||||
INCORRECT citation formats (DO NOT use):
|
||||
- Using parentheses and markdown links: ([citation:5](https://github.com/MODSetter/SurfSense))
|
||||
|
|
@ -571,7 +591,7 @@ INCORRECT citation formats (DO NOT use):
|
|||
<citation_output_example>
|
||||
Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [citation:5]. It's particularly useful for I/O-bound and high-level structured network code [citation:5].
|
||||
|
||||
The key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:12]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
|
||||
According to web search results, the key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:https://docs.python.org/3/library/asyncio.html]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
|
||||
|
||||
However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [citation:12]. For computationally intensive work, you'd want to use multiprocessing instead.
|
||||
</citation_output_example>
|
||||
|
|
|
|||
|
|
@ -210,6 +210,7 @@ def format_documents_for_context(documents: list[dict[str, Any]]) -> str:
|
|||
|
||||
source = (
|
||||
(doc.get("source") if isinstance(doc, dict) else None)
|
||||
or document_info.get("document_type")
|
||||
or metadata.get("document_type")
|
||||
or "UNKNOWN"
|
||||
)
|
||||
|
|
@ -268,10 +269,20 @@ def format_documents_for_context(documents: list[dict[str, Any]]) -> str:
|
|||
continue
|
||||
grouped[doc_key]["chunks"].append({"chunk_id": chunk_id, "content": content})
|
||||
|
||||
# Live search connectors whose results should be cited by URL rather than
|
||||
# a numeric chunk_id (the numeric IDs are meaningless auto-incremented counters).
|
||||
_LIVE_SEARCH_CONNECTORS = {
|
||||
"TAVILY_API",
|
||||
"SEARXNG_API",
|
||||
"LINKUP_API",
|
||||
"BAIDU_SEARCH_API",
|
||||
}
|
||||
|
||||
# Render XML expected by citation instructions
|
||||
parts: list[str] = []
|
||||
for g in grouped.values():
|
||||
metadata_json = json.dumps(g["metadata"], ensure_ascii=False)
|
||||
is_live_search = g["document_type"] in _LIVE_SEARCH_CONNECTORS
|
||||
|
||||
parts.append("<document>")
|
||||
parts.append("<document_metadata>")
|
||||
|
|
@ -286,7 +297,10 @@ def format_documents_for_context(documents: list[dict[str, Any]]) -> str:
|
|||
|
||||
for ch in g["chunks"]:
|
||||
ch_content = ch["content"]
|
||||
ch_id = ch["chunk_id"]
|
||||
# For live search connectors, use the document URL as the chunk id
|
||||
# so the LLM outputs [citation:https://...] which the frontend
|
||||
# renders as a clickable link.
|
||||
ch_id = g["url"] if (is_live_search and g["url"]) else ch["chunk_id"]
|
||||
if ch_id is None:
|
||||
parts.append(f" <chunk><![CDATA[{ch_content}]]></chunk>")
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -418,7 +418,7 @@ export const ConnectorIndicator: FC<{ hideTrigger?: boolean }> = ({ hideTrigger
|
|||
</div>
|
||||
</div>
|
||||
{/* Bottom fade shadow */}
|
||||
<div className="absolute bottom-0 left-0 right-0 h-7 bg-gradient-to-t from-muted via-muted/80 to-transparent pointer-events-none z-10" />
|
||||
<div className="absolute bottom-0 left-0 right-0 h-7 bg-linear-to-t from-muted via-muted/80 to-transparent pointer-events-none z-10" />
|
||||
</div>
|
||||
</Tabs>
|
||||
)}
|
||||
|
|
|
|||
|
|
@ -57,6 +57,7 @@ export const BaiduSearchApiConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSu
|
|||
BAIDU_API_KEY: values.api_key,
|
||||
},
|
||||
is_indexable: false,
|
||||
is_active: true,
|
||||
last_indexed_at: null,
|
||||
periodic_indexing_enabled: false,
|
||||
indexing_frequency_minutes: null,
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ export const CirclebackConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmit
|
|||
connector_type: EnumConnectorName.CIRCLEBACK_CONNECTOR,
|
||||
config: {},
|
||||
is_indexable: false,
|
||||
is_active: true,
|
||||
last_indexed_at: null,
|
||||
periodic_indexing_enabled: false,
|
||||
indexing_frequency_minutes: null,
|
||||
|
|
|
|||
|
|
@ -155,6 +155,7 @@ export const ElasticsearchConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSub
|
|||
connector_type: EnumConnectorName.ELASTICSEARCH_CONNECTOR,
|
||||
config,
|
||||
is_indexable: true,
|
||||
is_active: true,
|
||||
last_indexed_at: null,
|
||||
periodic_indexing_enabled: periodicEnabled,
|
||||
indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null,
|
||||
|
|
|
|||
|
|
@ -57,6 +57,7 @@ export const LinkupApiConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitt
|
|||
LINKUP_API_KEY: values.api_key,
|
||||
},
|
||||
is_indexable: false,
|
||||
is_active: true,
|
||||
last_indexed_at: null,
|
||||
periodic_indexing_enabled: false,
|
||||
indexing_frequency_minutes: null,
|
||||
|
|
|
|||
|
|
@ -71,6 +71,7 @@ export const LumaConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting }
|
|||
LUMA_API_KEY: values.api_key,
|
||||
},
|
||||
is_indexable: true,
|
||||
is_active: true,
|
||||
last_indexed_at: null,
|
||||
periodic_indexing_enabled: periodicEnabled,
|
||||
indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null,
|
||||
|
|
|
|||
|
|
@ -110,6 +110,7 @@ export const SearxngConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmittin
|
|||
connector_type: EnumConnectorName.SEARXNG_API,
|
||||
config,
|
||||
is_indexable: false,
|
||||
is_active: true,
|
||||
last_indexed_at: null,
|
||||
periodic_indexing_enabled: false,
|
||||
indexing_frequency_minutes: null,
|
||||
|
|
|
|||
|
|
@ -57,6 +57,7 @@ export const TavilyApiConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitt
|
|||
TAVILY_API_KEY: values.api_key,
|
||||
},
|
||||
is_indexable: false,
|
||||
is_active: true,
|
||||
last_indexed_at: null,
|
||||
periodic_indexing_enabled: false,
|
||||
indexing_frequency_minutes: null,
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
"use client";
|
||||
|
||||
import { ArrowLeft } from "lucide-react";
|
||||
import { type FC, useMemo } from "react";
|
||||
import { type FC, useMemo, useRef } from "react";
|
||||
import { Button } from "@/components/ui/button";
|
||||
import { Spinner } from "@/components/ui/spinner";
|
||||
import type { EnumConnectorName } from "@/contracts/enums/connector";
|
||||
|
|
@ -9,6 +9,20 @@ import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
|
|||
import { getConnectorTypeDisplay } from "@/lib/connectors/utils";
|
||||
import { getConnectFormComponent } from "../../connect-forms";
|
||||
|
||||
// Connector type key -> DOM id of that connector's connect form.
// Looked up by `connectorType` at submit time to locate the matching form
// element by id inside the form container.
const FORM_ID_MAP: Record<string, string> = {
|
||||
TAVILY_API: "tavily-connect-form",
|
||||
SEARXNG_API: "searxng-connect-form",
|
||||
LINKUP_API: "linkup-api-connect-form",
|
||||
BAIDU_SEARCH_API: "baidu-search-api-connect-form",
|
||||
ELASTICSEARCH_CONNECTOR: "elasticsearch-connect-form",
|
||||
BOOKSTACK_CONNECTOR: "bookstack-connect-form",
|
||||
GITHUB_CONNECTOR: "github-connect-form",
|
||||
LUMA_CONNECTOR: "luma-connect-form",
|
||||
CIRCLEBACK_CONNECTOR: "circleback-connect-form",
|
||||
MCP_CONNECTOR: "mcp-connect-form",
|
||||
OBSIDIAN_CONNECTOR: "obsidian-connect-form",
|
||||
};
|
||||
|
||||
interface ConnectorConnectViewProps {
|
||||
connectorType: string;
|
||||
onSubmit: (data: {
|
||||
|
|
@ -35,6 +49,7 @@ export const ConnectorConnectView: FC<ConnectorConnectViewProps> = ({
|
|||
onBack,
|
||||
isSubmitting,
|
||||
}) => {
|
||||
const formContainerRef = useRef<HTMLDivElement | null>(null);
|
||||
// Get connector-specific form component
|
||||
const ConnectFormComponent = useMemo(
|
||||
() => getConnectFormComponent(connectorType),
|
||||
|
|
@ -46,26 +61,18 @@ export const ConnectorConnectView: FC<ConnectorConnectViewProps> = ({
|
|||
if (isSubmitting) {
|
||||
return;
|
||||
}
|
||||
// Map connector types to their form IDs
|
||||
const formIdMap: Record<string, string> = {
|
||||
TAVILY_API: "tavily-connect-form",
|
||||
SEARXNG_API: "searxng-connect-form",
|
||||
LINKUP_API: "linkup-api-connect-form",
|
||||
BAIDU_SEARCH_API: "baidu-search-api-connect-form",
|
||||
ELASTICSEARCH_CONNECTOR: "elasticsearch-connect-form",
|
||||
BOOKSTACK_CONNECTOR: "bookstack-connect-form",
|
||||
GITHUB_CONNECTOR: "github-connect-form",
|
||||
LUMA_CONNECTOR: "luma-connect-form",
|
||||
CIRCLEBACK_CONNECTOR: "circleback-connect-form",
|
||||
MCP_CONNECTOR: "mcp-connect-form",
|
||||
OBSIDIAN_CONNECTOR: "obsidian-connect-form",
|
||||
};
|
||||
const formId = formIdMap[connectorType];
|
||||
if (formId) {
|
||||
const form = document.getElementById(formId) as HTMLFormElement;
|
||||
if (form) {
|
||||
form.requestSubmit();
|
||||
}
|
||||
const formId = FORM_ID_MAP[connectorType];
|
||||
const root = formContainerRef.current;
|
||||
const mappedForm =
|
||||
root && formId
|
||||
? (root.querySelector(`[id="${formId}"]`) as HTMLFormElement | null)
|
||||
: null;
|
||||
// Fallback to currently rendered form to avoid silent no-op
|
||||
// when a connector type or form id mapping drifts.
|
||||
const fallbackForm = root?.querySelector("form") as HTMLFormElement | null;
|
||||
const form = mappedForm ?? fallbackForm;
|
||||
if (form) {
|
||||
form.requestSubmit();
|
||||
}
|
||||
};
|
||||
|
||||
|
|
@ -114,7 +121,10 @@ export const ConnectorConnectView: FC<ConnectorConnectViewProps> = ({
|
|||
</div>
|
||||
|
||||
{/* Form Content - Scrollable */}
|
||||
<div className="flex-1 min-h-0 overflow-y-auto px-6 sm:px-12">
|
||||
<div
|
||||
ref={formContainerRef}
|
||||
className="connector-connect-form-root flex-1 min-h-0 overflow-y-auto px-6 sm:px-12"
|
||||
>
|
||||
<ConnectFormComponent
|
||||
onSubmit={onSubmit}
|
||||
onBack={onBack}
|
||||
|
|
@ -134,6 +144,7 @@ export const ConnectorConnectView: FC<ConnectorConnectViewProps> = ({
|
|||
Cancel
|
||||
</Button>
|
||||
<Button
|
||||
type="button"
|
||||
onClick={handleFormSubmit}
|
||||
disabled={isSubmitting}
|
||||
className="text-xs sm:text-sm min-w-[140px] disabled:opacity-50 disabled:cursor-not-allowed disabled:pointer-events-none"
|
||||
|
|
|
|||
|
|
@ -558,7 +558,9 @@ export const useConnectorDialog = () => {
|
|||
},
|
||||
onIndexingStart?: (connectorId: number) => void
|
||||
) => {
|
||||
if (!searchSpaceId || !connectingConnectorType) return;
|
||||
if (!searchSpaceId || !connectingConnectorType) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Prevent multiple submissions using ref for immediate check
|
||||
if (isCreatingConnectorRef.current) return;
|
||||
|
|
@ -582,7 +584,6 @@ export const useConnectorDialog = () => {
|
|||
search_space_id: searchSpaceId,
|
||||
},
|
||||
});
|
||||
|
||||
// Refetch connectors to get the new one
|
||||
const result = await refetchAllConnectors();
|
||||
if (result.data) {
|
||||
|
|
|
|||
|
|
@ -2,22 +2,20 @@
|
|||
|
||||
import type { FC } from "react";
|
||||
import { useState } from "react";
|
||||
import { ExternalLink } from "lucide-react";
|
||||
import { SourceDetailPanel } from "@/components/new-chat/source-detail-panel";
|
||||
|
||||
interface InlineCitationProps {
|
||||
chunkId: number;
|
||||
citationNumber: number;
|
||||
isDocsChunk?: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
* Inline citation component for the new chat.
|
||||
* Renders a clickable numbered badge that opens the SourceDetailPanel with document chunk details.
|
||||
* Supports both regular knowledge base chunks and Surfsense documentation chunks.
|
||||
* Inline citation for knowledge-base chunks (numeric chunk IDs).
|
||||
* Renders a clickable badge showing the actual chunk ID that opens the SourceDetailPanel.
|
||||
*/
|
||||
export const InlineCitation: FC<InlineCitationProps> = ({
|
||||
chunkId,
|
||||
citationNumber,
|
||||
isDocsChunk = false,
|
||||
}) => {
|
||||
const [isOpen, setIsOpen] = useState(false);
|
||||
|
|
@ -37,12 +35,46 @@ export const InlineCitation: FC<InlineCitationProps> = ({
|
|||
onClick={() => setIsOpen(true)}
|
||||
onKeyDown={(e) => e.key === "Enter" && setIsOpen(true)}
|
||||
className="text-[10px] font-bold bg-primary/80 hover:bg-primary text-primary-foreground rounded-full min-w-4 h-4 px-1 inline-flex items-center justify-center align-super cursor-pointer transition-colors ml-0.5"
|
||||
title={`View source #${citationNumber}`}
|
||||
title={`View source chunk #${chunkId}`}
|
||||
role="button"
|
||||
tabIndex={0}
|
||||
>
|
||||
{citationNumber}
|
||||
{chunkId}
|
||||
</span>
|
||||
</SourceDetailPanel>
|
||||
);
|
||||
};
|
||||
|
||||
/**
 * Returns the hostname of `url` with a leading "www." removed.
 * If `url` cannot be parsed by the `URL` constructor, the input string is
 * returned unchanged instead of throwing.
 */
function extractDomain(url: string): string {
|
||||
try {
|
||||
const hostname = new URL(url).hostname;
|
||||
// Strip only a leading "www." so the label stays short.
return hostname.replace(/^www\./, "");
|
||||
} catch {
|
||||
// Unparseable input: fall back to the raw string.
return url;
|
||||
}
|
||||
}
|
||||
|
||||
interface UrlCitationProps {
|
||||
// Full URL used as the link target and tooltip; only its domain is shown as the label.
url: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Inline citation for live web search results (URL-based chunk IDs).
|
||||
* Renders a clickable badge showing the source domain that opens the URL in a new tab.
|
||||
*/
|
||||
export const UrlCitation: FC<UrlCitationProps> = ({ url }) => {
|
||||
// Short badge label; the full URL stays in `href` and the `title` tooltip.
const domain = extractDomain(url);
|
||||
|
||||
return (
|
||||
<a
|
||||
href={url}
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="text-[10px] font-bold bg-primary/80 hover:bg-primary text-primary-foreground rounded-full h-4 px-1.5 inline-flex items-center gap-0.5 align-super cursor-pointer transition-colors ml-0.5 no-underline"
|
||||
title={url}
|
||||
>
|
||||
<ExternalLink className="size-2.5 shrink-0" />
|
||||
{domain}
|
||||
</a>
|
||||
);
|
||||
};
|
||||
|
|
|
|||
|
|
@ -14,17 +14,36 @@ import rehypeKatex from "rehype-katex";
|
|||
import remarkGfm from "remark-gfm";
|
||||
import remarkMath from "remark-math";
|
||||
import "katex/dist/katex.min.css";
|
||||
import { InlineCitation } from "@/components/assistant-ui/inline-citation";
|
||||
import { InlineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
|
||||
import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button";
|
||||
import { cn } from "@/lib/utils";
|
||||
|
||||
// Storage for URL citations replaced during preprocess to avoid GFM autolink interference.
|
||||
// Populated in preprocessMarkdown, consumed in parseTextWithCitations.
|
||||
let _pendingUrlCitations = new Map<string, string>();
|
||||
let _urlCiteIdx = 0;
|
||||
|
||||
/**
|
||||
* Convert all LaTeX delimiter styles to the dollar-sign syntax
|
||||
* that remark-math understands. LLMs use various delimiters
|
||||
* (\(...\), \[...\], \begin{equation}, etc.) and we need to
|
||||
* normalise them all to $ / $$ before the markdown parser runs.
|
||||
* Preprocess raw markdown before it reaches the remark/rehype pipeline.
|
||||
* - Replaces URL-based citations with safe placeholders (prevents GFM autolinks)
|
||||
* - Normalises LaTeX delimiters to dollar-sign syntax for remark-math
|
||||
*/
|
||||
function convertLatexDelimiters(content: string): string {
|
||||
function preprocessMarkdown(content: string): string {
|
||||
// Replace URL-based citations with safe placeholders BEFORE markdown parsing.
|
||||
// GFM autolinks would otherwise convert the https://... inside [citation:URL]
|
||||
// into an <a> element, splitting the text and preventing our citation regex
|
||||
// from matching the full pattern.
|
||||
_pendingUrlCitations = new Map();
|
||||
_urlCiteIdx = 0;
|
||||
content = content.replace(
|
||||
/[[【]\u200B?citation:\s*(https?:\/\/[^\]\】\u200B]+)\s*\u200B?[\]】]/g,
|
||||
(_, url) => {
|
||||
const key = `urlcite${_urlCiteIdx++}`;
|
||||
_pendingUrlCitations.set(key, url.trim());
|
||||
return `[citation:${key}]`;
|
||||
}
|
||||
);
|
||||
|
||||
// 1. Block math: \[...\] → $$...$$
|
||||
content = content.replace(/\\\[([\s\S]*?)\\\]/g, (_, inner) => `$$${inner}$$`);
|
||||
// 2. Inline math: \(...\) → $...$
|
||||
|
|
@ -50,40 +69,19 @@ function convertLatexDelimiters(content: string): string {
|
|||
return content;
|
||||
}
|
||||
|
||||
// Citation pattern: [citation:CHUNK_ID] or [citation:doc-CHUNK_ID]
|
||||
// Also matches Chinese brackets 【】 and handles zero-width spaces that LLM sometimes inserts
|
||||
const CITATION_REGEX = /[[【]\u200B?citation:(doc-)?(\d+)\u200B?[\]】]/g;
|
||||
|
||||
// Track chunk IDs to citation numbers mapping for consistent numbering
|
||||
// This map is reset when a new message starts rendering
|
||||
// Uses string keys to differentiate between doc and regular chunks (e.g., "doc-123" vs "123")
|
||||
let chunkIdToCitationNumber: Map<string, number> = new Map();
|
||||
let nextCitationNumber = 1;
|
||||
// Matches [citation:...] with numeric IDs (incl. doc- prefix, comma-separated),
|
||||
// URL-based IDs from live web search, or urlciteN placeholders from preprocess.
|
||||
// Also matches Chinese brackets 【】 and handles zero-width spaces that LLM sometimes inserts.
|
||||
const CITATION_REGEX =
|
||||
/[[【]\u200B?citation:\s*(https?:\/\/[^\]\】\u200B]+|urlcite\d+|(?:doc-)?\d+(?:\s*,\s*(?:doc-)?\d+)*)\s*\u200B?[\]】]/g;
|
||||
|
||||
/**
|
||||
* Resets the citation counter - should be called at the start of each message
|
||||
*/
|
||||
export function resetCitationCounter() {
|
||||
chunkIdToCitationNumber = new Map();
|
||||
nextCitationNumber = 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets or assigns a citation number for a chunk ID
|
||||
* Uses string key to differentiate between doc and regular chunks
|
||||
*/
|
||||
function getCitationNumber(chunkId: number, isDocsChunk: boolean): number {
|
||||
const key = isDocsChunk ? `doc-${chunkId}` : String(chunkId);
|
||||
const existingNumber = chunkIdToCitationNumber.get(key);
|
||||
if (existingNumber === undefined) {
|
||||
chunkIdToCitationNumber.set(key, nextCitationNumber++);
|
||||
}
|
||||
return chunkIdToCitationNumber.get(key)!;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses text and replaces [citation:XXX] patterns with InlineCitation components
|
||||
* Supports both regular chunks [citation:123] and docs chunks [citation:doc-123]
|
||||
* Parses text and replaces [citation:XXX] patterns with citation components.
|
||||
* Supports:
|
||||
* - Numeric chunk IDs: [citation:123]
|
||||
* - Doc-prefixed IDs: [citation:doc-123]
|
||||
* - Comma-separated IDs: [citation:4149, 4150, 4151]
|
||||
* - URL-based citations from live search: [citation:https://example.com/page]
|
||||
*/
|
||||
function parseTextWithCitations(text: string): ReactNode[] {
|
||||
const parts: ReactNode[] = [];
|
||||
|
|
@ -91,35 +89,49 @@ function parseTextWithCitations(text: string): ReactNode[] {
|
|||
let match: RegExpExecArray | null;
|
||||
let instanceIndex = 0;
|
||||
|
||||
// Reset regex state
|
||||
CITATION_REGEX.lastIndex = 0;
|
||||
|
||||
match = CITATION_REGEX.exec(text);
|
||||
while (match !== null) {
|
||||
// Add text before the citation
|
||||
if (match.index > lastIndex) {
|
||||
parts.push(text.substring(lastIndex, match.index));
|
||||
}
|
||||
|
||||
// Check if this is a docs chunk (has "doc-" prefix)
|
||||
const isDocsChunk = match[1] === "doc-";
|
||||
const chunkId = Number.parseInt(match[2], 10);
|
||||
const citationNumber = getCitationNumber(chunkId, isDocsChunk);
|
||||
parts.push(
|
||||
<InlineCitation
|
||||
key={`citation-${isDocsChunk ? "doc-" : ""}${chunkId}-${instanceIndex}`}
|
||||
chunkId={chunkId}
|
||||
citationNumber={citationNumber}
|
||||
isDocsChunk={isDocsChunk}
|
||||
/>
|
||||
);
|
||||
const captured = match[1];
|
||||
|
||||
if (captured.startsWith("http://") || captured.startsWith("https://")) {
|
||||
parts.push(
|
||||
<UrlCitation key={`citation-url-${instanceIndex}`} url={captured.trim()} />
|
||||
);
|
||||
instanceIndex++;
|
||||
} else if (captured.startsWith("urlcite")) {
|
||||
const url = _pendingUrlCitations.get(captured);
|
||||
if (url) {
|
||||
parts.push(
|
||||
<UrlCitation key={`citation-url-${instanceIndex}`} url={url} />
|
||||
);
|
||||
}
|
||||
instanceIndex++;
|
||||
} else {
|
||||
const rawIds = captured.split(",").map((s) => s.trim());
|
||||
for (const rawId of rawIds) {
|
||||
const isDocsChunk = rawId.startsWith("doc-");
|
||||
const chunkId = Number.parseInt(isDocsChunk ? rawId.slice(4) : rawId, 10);
|
||||
parts.push(
|
||||
<InlineCitation
|
||||
key={`citation-${isDocsChunk ? "doc-" : ""}${chunkId}-${instanceIndex}`}
|
||||
chunkId={chunkId}
|
||||
isDocsChunk={isDocsChunk}
|
||||
/>
|
||||
);
|
||||
instanceIndex++;
|
||||
}
|
||||
}
|
||||
|
||||
lastIndex = match.index + match[0].length;
|
||||
instanceIndex++;
|
||||
match = CITATION_REGEX.exec(text);
|
||||
}
|
||||
|
||||
// Add any remaining text after the last citation
|
||||
if (lastIndex < text.length) {
|
||||
parts.push(text.substring(lastIndex));
|
||||
}
|
||||
|
|
@ -134,7 +146,7 @@ const MarkdownTextImpl = () => {
|
|||
rehypePlugins={[rehypeKatex]}
|
||||
className="aui-md"
|
||||
components={defaultComponents}
|
||||
preprocess={convertLatexDelimiters}
|
||||
preprocess={preprocessMarkdown}
|
||||
/>
|
||||
);
|
||||
};
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue