diff --git a/surfsense_backend/app/tasks/chat/stream_new_chat.py b/surfsense_backend/app/tasks/chat/stream_new_chat.py
index a49c244eb..8dfff4895 100644
--- a/surfsense_backend/app/tasks/chat/stream_new_chat.py
+++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py
@@ -54,21 +54,64 @@ def format_attachments_as_context(attachments: list[ChatAttachment]) -> str:
def format_mentioned_documents_as_context(documents: list[Document]) -> str:
- """Format mentioned documents as context for the agent."""
+ """
+ Format mentioned documents as context for the agent.
+
+ Uses the same XML structure as knowledge_base.format_documents_for_context
+ to ensure citations work properly with chunk IDs.
+ """
if not documents:
return ""
context_parts = [""]
context_parts.append(
"The user has explicitly mentioned the following documents from their knowledge base. "
- "These documents are directly relevant to the query and should be prioritized as primary sources."
+ "These documents are directly relevant to the query and should be prioritized as primary sources. "
+ "Use [citation:CHUNK_ID] format for citations (e.g., [citation:123])."
)
- for i, doc in enumerate(documents, 1):
- context_parts.append(
- f""
+ context_parts.append("")
+
+ for doc in documents:
+ # Build metadata JSON
+ metadata = doc.document_metadata or {}
+ metadata_json = json.dumps(metadata, ensure_ascii=False)
+
+ # Get URL from metadata
+ url = (
+ metadata.get("url")
+ or metadata.get("source")
+ or metadata.get("page_url")
+ or ""
)
- context_parts.append(f"")
+
+ context_parts.append("")
+ context_parts.append("")
+ context_parts.append(f" {doc.id}")
+ context_parts.append(f" {doc.document_type.value}")
+ context_parts.append(f" ")
+ context_parts.append(f" ")
+ context_parts.append(f" ")
+ context_parts.append("")
+ context_parts.append("")
+ context_parts.append("")
+
+ # Use chunks if available (preferred for proper citations)
+ if hasattr(doc, "chunks") and doc.chunks:
+ for chunk in doc.chunks:
+ context_parts.append(
+ f" "
+ )
+ else:
+ # Fallback to document content if chunks not loaded
+ # Use document ID as chunk ID prefix for consistency
+ context_parts.append(
+ f" "
+ )
+
+ context_parts.append("")
context_parts.append("")
+ context_parts.append("")
+
context_parts.append("")
return "\n".join(context_parts)
@@ -81,8 +124,6 @@ def format_mentioned_surfsense_docs_as_context(
if not documents:
return ""
- import json
-
context_parts = [""]
context_parts.append(
"The user has explicitly mentioned the following SurfSense documentation pages. "
@@ -262,11 +303,15 @@ async def stream_new_chat(
# Build input with message history from frontend
langchain_messages = []
- # Fetch mentioned documents if any
+ # Fetch mentioned documents if any (with chunks for proper citations)
mentioned_documents: list[Document] = []
if mentioned_document_ids:
+ from sqlalchemy.orm import selectinload as doc_selectinload
+
result = await session.execute(
- select(Document).filter(
+ select(Document)
+ .options(doc_selectinload(Document.chunks))
+ .filter(
Document.id.in_(mentioned_document_ids),
Document.search_space_id == search_space_id,
)