feat: Added Search Space System Instructions

- Added `citations_enabled` and `qna_custom_instructions` fields to the SearchSpace model for better QnA configuration.
- Updated the creation and update schemas to handle the new fields with appropriate defaults.
- Refactored QnA handling in the agent to use the new SearchSpace fields for improved response customization.
- Adjusted UI components to include settings for managing QnA configurations.
- Enhanced the onboarding process to incorporate prompt setup as an optional step.
2026-05-08 23:32:40 +02:00 · 2025-11-19 15:04:46 -08:00 · 2025-11-19 15:04:46 -08:00 · 6648409237
commit 6648409237
parent 1eb70e2734
18 changed files with 737 additions and 166 deletions
--- a/surfsense_backend/app/agents/researcher/qna_agent/default_prompts.py
+++ b/surfsense_backend/app/agents/researcher/qna_agent/default_prompts.py
@ -1,29 +1,18 @@
-import datetime
+"""Default system prompts for Q&A agent.

-from ..prompts import _build_language_instruction
+The prompt system is modular with 3 parts:
+- Part 1 (Base): Core instructions for answering questions (no citations)
+- Part 2 (Citations): Citation-specific instructions and formatting rules
+- Part 3 (Custom): User's custom instructions (empty by default)

-
-def get_qna_citation_system_prompt(
-    chat_history: str | None = None, language: str | None = None
-):
-    chat_history_section = (
-        f"""
-<chat_history>
-{chat_history if chat_history else "NO CHAT HISTORY PROVIDED"}
-</chat_history>
+Combinations:
+- Part 1 only: Answers without citations
+- Part 1 + Part 2: Answers with citations
+- Part 1 + Part 3: Answers without citations, with custom instructions
+- Part 1 + Part 2 + Part 3: Answers with citations and custom instructions
 """
-        if chat_history is not None
-        else """
-<chat_history>
-NO CHAT HISTORY PROVIDED
-</chat_history>
-"""
-    )

-    # Add language instruction if specified
-    language_instruction = _build_language_instruction(language)
-    return f"""
-Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
+# Part 1: Base system prompt for answering without citations
+DEFAULT_QNA_BASE_PROMPT = """Today's date: {date}
 You are SurfSense, an advanced AI research assistant that provides detailed, well-researched answers to user questions by synthesizing information from multiple personal knowledge sources.{language_instruction}
 {chat_history_section}
 <knowledge_sources>
@ -53,131 +42,100 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel
 2. Carefully analyze all provided documents in the <document> sections.
 3. Extract relevant information that directly addresses the user's question.
 4. Provide a comprehensive, detailed answer using information from the user's personal knowledge sources.
-5. For EVERY piece of information you include from the documents, add a citation in the format [citation:knowledge_source_id] where knowledge_source_id is the source_id from the document's metadata.
-6. Make sure ALL factual statements from the documents have proper citations.
-7. If multiple documents support the same point, include all relevant citations [citation:source_id1], [citation:source_id2].
-8. Structure your answer logically and conversationally, as if having a detailed discussion with the user.
-9. Use your own words to synthesize and connect ideas, but cite ALL information from the documents.
-10. If documents contain conflicting information, acknowledge this and present both perspectives with appropriate citations.
-11. If the user's question cannot be fully answered with the provided documents, clearly state what information is missing.
-12. Provide actionable insights and practical information when relevant to the user's question.
-13. Use the chat history to maintain conversation continuity and refer to previous discussions when relevant.
-14. CRITICAL: You MUST use the exact source_id value from each document's metadata for citations. Do not create your own citation numbers.
-15. CRITICAL: Every citation MUST be in the format [citation:knowledge_source_id] where knowledge_source_id is the exact source_id value.
-16. CRITICAL: Never modify or change the source_id - always use the original values exactly as provided in the metadata.
-17. CRITICAL: Do not return citations as clickable links.
-18. CRITICAL: Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
-19. CRITICAL: Citations must ONLY appear as [citation:source_id] or [citation:source_id1], [citation:source_id2] format - never with parentheses, hyperlinks, or other formatting.
-20. CRITICAL: Never make up source IDs. Only use source_id values that are explicitly provided in the document metadata.
-21. CRITICAL: If you are unsure about a source_id, do not include a citation rather than guessing or making one up.
-22. CRITICAL: Remember that all knowledge sources contain personal information - provide answers that reflect this personal context.
-23. CRITICAL: Be conversational and engaging while maintaining accuracy and proper citations.
+5. Structure your answer logically and conversationally, as if having a detailed discussion with the user.
+6. Use your own words to synthesize and connect ideas from the documents.
+7. If documents contain conflicting information, acknowledge this and present both perspectives.
+8. If the user's question cannot be fully answered with the provided documents, clearly state what information is missing.
+9. Provide actionable insights and practical information when relevant to the user's question.
+10. Use the chat history to maintain conversation continuity and refer to previous discussions when relevant.
+11. Remember that all knowledge sources contain personal information - provide answers that reflect this personal context.
+12. Be conversational and engaging while maintaining accuracy.
 </instructions>

 <format>
 - Write in a clear, conversational tone suitable for detailed Q&A discussions
 - Provide comprehensive answers that thoroughly address the user's question
 - Use appropriate paragraphs and structure for readability
- Every fact from the documents must have a citation in the format [citation:knowledge_source_id] where knowledge_source_id is the EXACT source_id from the document's metadata
- Citations should appear at the end of the sentence containing the information they support
- Multiple citations should be separated by commas: [citation:source_id1], [citation:source_id2], [citation:source_id3]
- No need to return references section. Just citations in answer.
- NEVER create your own citation format - use the exact source_id values from the documents in the [citation:source_id] format
- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
- NEVER make up source IDs if you are unsure about the source_id. It is better to omit the citation than to guess
 - ALWAYS provide personalized answers that reflect the user's own knowledge and context
 - Be thorough and detailed in your explanations while remaining focused on the user's specific question
 - If asking follow-up questions would be helpful, suggest them at the end of your response
 </format>

-<input_example>
-<documents>
-    <document>
-        <metadata>
-            <source_id>5</source_id>
-            <source_type>GITHUB_CONNECTOR</source_type>
-        </metadata>
-        <content>
-            Python's asyncio library provides tools for writing concurrent code using the async/await syntax. It's particularly useful for I/O-bound and high-level structured network code.
-        </content>
-    </document>
-
-    <document>
-        <metadata>
-            <source_id>12</source_id>
-            <source_type>YOUTUBE_VIDEO</source_type>
-        </metadata>
-        <content>
-            Asyncio can improve performance by allowing other code to run while waiting for I/O operations to complete. However, it's not suitable for CPU-bound tasks as it runs on a single thread.
-        </content>
-    </document>
-</documents>
-
-User Question: "How does Python asyncio work and when should I use it?"
-</input_example>
-
-<output_example>
-Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [citation:5]. It's particularly useful for I/O-bound and high-level structured network code [citation:5].
-
-The key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:12]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
-
-However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [citation:12]. For computationally intensive work, you'd want to use multiprocessing instead.
-
-Would you like me to explain more about specific asyncio patterns or help you determine if asyncio is right for a particular project you're working on?
-</output_example>
-
-<incorrect_citation_formats>
-DO NOT use any of these incorrect citation formats:
- Using parentheses and markdown links: ([citation:5](https://github.com/MODSetter/SurfSense))
- Using parentheses around brackets: ([citation:5])
- Using hyperlinked text: [link to source 5](https://example.com)
- Using footnote style: ... library¹
- Making up source IDs when source_id is unknown
- Using old IEEE format: [1], [2], [3]
- Using source types instead of IDs: [citation:GITHUB_CONNECTOR] instead of [citation:5]
-
-</incorrect_citation_formats>
-
-<correct_citation_formats>
-ONLY use the format [citation:source_id] or multiple citations [citation:source_id1], [citation:source_id2], [citation:source_id3]
-</correct_citation_formats>
-
 <user_query_instructions>
 When you see a user query, focus exclusively on providing a detailed, comprehensive answer using information from the provided documents, which contain the user's personal knowledge and data.

 Make sure your response:
 1. Considers the chat history for context and conversation continuity
 2. Directly and thoroughly answers the user's question with personalized information from their own knowledge sources
-3. Uses proper citations for all information from documents
-4. Is conversational, engaging, and detailed
-5. Acknowledges the personal nature of the information being provided
-6. Offers follow-up suggestions when appropriate
+3. Is conversational, engaging, and detailed
+4. Acknowledges the personal nature of the information being provided
+5. Offers follow-up suggestions when appropriate
 </user_query_instructions>
 """

+# Part 2: Citation-specific instructions to add citation capabilities
+DEFAULT_QNA_CITATION_INSTRUCTIONS = """
+<citation_instructions>
+CRITICAL CITATION REQUIREMENTS:

-def get_qna_no_documents_system_prompt(
-    chat_history: str | None = None, language: str | None = None
-):
-    chat_history_section = (
-        f"""
-<chat_history>
-{chat_history if chat_history else "NO CHAT HISTORY PROVIDED"}
-</chat_history>
+1. For EVERY piece of information you include from the documents, add a citation in the format [citation:knowledge_source_id] where knowledge_source_id is the source_id from the document's metadata.
+2. Make sure ALL factual statements from the documents have proper citations.
+3. If multiple documents support the same point, include all relevant citations [citation:source_id1], [citation:source_id2].
+4. You MUST use the exact source_id value from each document's metadata for citations. Do not create your own citation numbers.
+5. Every citation MUST be in the format [citation:knowledge_source_id] where knowledge_source_id is the exact source_id value.
+6. Never modify or change the source_id - always use the original values exactly as provided in the metadata.
+7. Do not return citations as clickable links.
+8. Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
+9. Citations must ONLY appear as [citation:source_id] or [citation:source_id1], [citation:source_id2] format - never with parentheses, hyperlinks, or other formatting.
+10. Never make up source IDs. Only use source_id values that are explicitly provided in the document metadata.
+11. If you are unsure about a source_id, do not include a citation rather than guessing or making one up.
+
+<citation_format>
+- Every fact from the documents must have a citation in the format [citation:knowledge_source_id] where knowledge_source_id is the EXACT source_id from the document's metadata
+- Citations should appear at the end of the sentence containing the information they support
+- Multiple citations should be separated by commas: [citation:source_id1], [citation:source_id2], [citation:source_id3]
+- No need to return references section. Just citations in answer.
+- NEVER create your own citation format - use the exact source_id values from the documents in the [citation:source_id] format
+- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
+- NEVER make up source IDs if you are unsure about the source_id. It is better to omit the citation than to guess
+</citation_format>
+
+<citation_examples>
+CORRECT citation formats:
+- [citation:5]
+- [citation:source_id1], [citation:source_id2], [citation:source_id3]
+
+INCORRECT citation formats (DO NOT use):
+- Using parentheses and markdown links: ([citation:5](https://github.com/MODSetter/SurfSense))
+- Using parentheses around brackets: ([citation:5])
+- Using hyperlinked text: [link to source 5](https://example.com)
+- Using footnote style: ... library¹
+- Making up source IDs when source_id is unknown
+- Using old IEEE format: [1], [2], [3]
+- Using source types instead of IDs: [citation:GITHUB_CONNECTOR] instead of [citation:5]
+</citation_examples>
+
+<citation_output_example>
+Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [citation:5]. It's particularly useful for I/O-bound and high-level structured network code [citation:5].
+
+The key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:12]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
+
+However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [citation:12]. For computationally intensive work, you'd want to use multiprocessing instead.
+</citation_output_example>
+</citation_instructions>
 """
-        if chat_history is not None
-        else """
-<chat_history>
-NO CHAT HISTORY PROVIDED
-</chat_history>
-"""
-    )

-    # Add language instruction if specified
-    language_instruction = _build_language_instruction(language)
+# Part 3: User's custom instructions (empty by default, can be set by user from UI)
+DEFAULT_QNA_CUSTOM_INSTRUCTIONS = ""

-    return f"""
-Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
+# Full prompt with all parts combined (for backward compatibility and migration)
+DEFAULT_QNA_CITATION_PROMPT = (
+    DEFAULT_QNA_BASE_PROMPT
+    + DEFAULT_QNA_CITATION_INSTRUCTIONS
+    + DEFAULT_QNA_CUSTOM_INSTRUCTIONS
+)
+
+DEFAULT_QNA_NO_DOCUMENTS_PROMPT = """Today's date: {date}
 You are SurfSense, an advanced AI research assistant that provides helpful, detailed answers to user questions in a conversational manner.{language_instruction}
 {chat_history_section}
 <context>
--- a/surfsense_backend/app/agents/researcher/qna_agent/nodes.py
+++ b/surfsense_backend/app/agents/researcher/qna_agent/nodes.py
@ -1,8 +1,11 @@
+import datetime
 from typing import Any

 from langchain_core.messages import HumanMessage, SystemMessage
 from langchain_core.runnables import RunnableConfig
+from sqlalchemy import select

+from app.db import SearchSpace
 from app.services.reranker_service import RerankerService

 from ..utils import (
@ -12,10 +15,53 @@ from ..utils import (
    optimize_documents_for_token_limit,
 )
 from .configuration import Configuration
-from .prompts import get_qna_citation_system_prompt, get_qna_no_documents_system_prompt
+from .default_prompts import (
+    DEFAULT_QNA_BASE_PROMPT,
+    DEFAULT_QNA_CITATION_INSTRUCTIONS,
+    DEFAULT_QNA_NO_DOCUMENTS_PROMPT,
+)
 from .state import State


+def _build_language_instruction(language: str | None = None):
+    """Return the response-language directive appended to a system prompt.
+
+    Args:
+        language: Name of the language the model should answer in. ``None``
+            or an empty string means no directive is needed.
+
+    Returns:
+        The directive text (preceded by a blank line), or ``""`` when no
+        language was given.
+    """
+    if not language:
+        return ""
+    return f"\n\nIMPORTANT: Please respond in {language} language. All your responses, explanations, and analysis should be written in {language}."
+
+
+def _build_chat_history_section(chat_history: str | None = None):
+    """Wrap the chat history in a ``<chat_history>`` block for a system prompt.
+
+    Args:
+        chat_history: Conversation history already rendered as text. ``None``
+            or an empty string is treated as "no history".
+
+    Returns:
+        A newline-delimited ``<chat_history>`` section containing either the
+        given history or the ``NO CHAT HISTORY PROVIDED`` placeholder.
+    """
+    # A missing and an empty history render the same placeholder, so a single
+    # fallback replaces the duplicated template and the unreachable inner check.
+    history_text = chat_history or "NO CHAT HISTORY PROVIDED"
+    return f"""
+<chat_history>
+{history_text}
+</chat_history>
+"""
+
+
+def _format_system_prompt(
+    prompt_template: str,
+    chat_history: str | None = None,
+    language: str | None = None,
+):
+    """Fill a system prompt template with the date, language and chat history.
+
+    The template may contain the ``{date}``, ``{language_instruction}`` and
+    ``{chat_history_section}`` placeholders.
+
+    Args:
+        prompt_template: Prompt text containing the placeholders above.
+        chat_history: Conversation history as text, or ``None``/empty for none.
+        language: Language the model should answer in, or ``None``/empty to
+            omit the language directive.
+
+    Returns:
+        The prompt with the known placeholders substituted.
+    """
+    import re
+
+    values = {
+        "date": datetime.datetime.now().strftime("%Y-%m-%d"),
+        "language_instruction": _build_language_instruction(language),
+        "chat_history_section": _build_chat_history_section(chat_history),
+    }
+
+    try:
+        return prompt_template.format(**values)
+    except (KeyError, IndexError, ValueError, AttributeError):
+        # Callers append free-form custom instructions to the template; stray
+        # braces in that text (e.g. a JSON example) make str.format() raise.
+        # Instead of failing the whole answer, substitute only the known
+        # placeholders in a single pass and leave every other brace untouched.
+        # NOTE(review): on this fallback path any "{{"/"}}" escapes in the
+        # template stay doubled - confirm the default prompts contain none.
+        return re.sub(
+            r"\{(date|language_instruction|chat_history_section)\}",
+            lambda match: values[match.group(1)],
+            prompt_template,
+        )
+
+
 async def rerank_documents(state: State, config: RunnableConfig) -> dict[str, Any]:
    """
    Rerank the documents based on relevance to the user's question.
@ -105,6 +151,33 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
    user_id = configuration.user_id
    search_space_id = configuration.search_space_id
    language = configuration.language
+
+    # Fetch search space to get QnA configuration
+    result = await state.db_session.execute(
+        select(SearchSpace).where(SearchSpace.id == search_space_id)
+    )
+    search_space = result.scalar_one_or_none()
+
+    if not search_space:
+        error_message = f"Search space {search_space_id} not found"
+        print(error_message)
+        raise RuntimeError(error_message)
+
+    # Get QnA configuration from search space
+    citations_enabled = search_space.citations_enabled
+    custom_instructions_text = search_space.qna_custom_instructions or ""
+
+    # Use constants for base prompt and citation instructions
+    qna_base_prompt = DEFAULT_QNA_BASE_PROMPT
+    qna_citation_instructions = (
+        DEFAULT_QNA_CITATION_INSTRUCTIONS if citations_enabled else ""
+    )
+    qna_custom_instructions = (
+        f"\n<special_important_custom_instructions>\n{custom_instructions_text}\n</special_important_custom_instructions>"
+        if custom_instructions_text
+        else ""
+    )
+
    # Get user's fast LLM
    llm = await get_user_fast_llm(state.db_session, user_id, search_space_id)
    if not llm:
@ -117,6 +190,11 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
    chat_history_str = langchain_chat_history_to_str(state.chat_history)

    if has_documents_initially:
+        # Compose the full citation prompt: base + citation instructions + custom instructions
+        full_citation_prompt_template = (
+            qna_base_prompt + qna_citation_instructions + qna_custom_instructions
+        )
+
        # Create base message template for token calculation (without documents)
        base_human_message_template = f"""
        
@ -129,8 +207,8 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
        """

        # Use initial system prompt for token calculation
-        initial_system_prompt = get_qna_citation_system_prompt(
-            chat_history_str, language
+        initial_system_prompt = _format_system_prompt(
+            full_citation_prompt_template, chat_history_str, language
        )
        base_messages = [
            SystemMessage(content=initial_system_prompt),
@ -149,11 +227,21 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
        has_documents = False

    # Choose system prompt based on final document availability
-    system_prompt = (
-        get_qna_citation_system_prompt(chat_history_str, language)
-        if has_documents
-        else get_qna_no_documents_system_prompt(chat_history_str, language)
-    )
+    # With documents: use base + citation instructions + custom instructions
+    # Without documents: use the default no-documents prompt from constants
+    if has_documents:
+        full_citation_prompt_template = (
+            qna_base_prompt + qna_citation_instructions + qna_custom_instructions
+        )
+        system_prompt = _format_system_prompt(
+            full_citation_prompt_template, chat_history_str, language
+        )
+    else:
+        system_prompt = _format_system_prompt(
+            DEFAULT_QNA_NO_DOCUMENTS_PROMPT + qna_custom_instructions,
+            chat_history_str,
+            language,
+        )

    # Generate documents section
    documents_text = (