feat: Added Search Space System Instructions

- Added `citations_enabled` and `qna_custom_instructions` fields to the SearchSpace model for better QnA configuration.
- Updated the creation and update schemas to handle new fields with appropriate defaults.
- Refactored QnA handling in the agent to utilize the new SearchSpace fields for improved response customization.
- Adjusted UI components to include settings for managing QnA configurations.
- Enhanced onboarding process to incorporate prompt setup as an optional step.
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2025-11-19 15:04:46 -08:00
parent 1eb70e2734
commit 6648409237
18 changed files with 737 additions and 166 deletions

View file

@ -1,29 +1,18 @@
import datetime
"""Default system prompts for Q&A agent.
from ..prompts import _build_language_instruction
The prompt system is modular with 3 parts:
- Part 1 (Base): Core instructions for answering questions (no citations)
- Part 2 (Citations): Citation-specific instructions and formatting rules
- Part 3 (Custom): User's custom instructions (empty by default)
def get_qna_citation_system_prompt(
chat_history: str | None = None, language: str | None = None
):
chat_history_section = (
f"""
<chat_history>
{chat_history if chat_history else "NO CHAT HISTORY PROVIDED"}
</chat_history>
Combinations:
- Part 1 only: Answers without citations
- Part 1 + Part 2: Answers with citations
- Part 1 + Part 3: Answers without citations, with custom instructions
- Part 1 + Part 2 + Part 3: Answers with citations and custom instructions
"""
if chat_history is not None
else """
<chat_history>
NO CHAT HISTORY PROVIDED
</chat_history>
"""
)
# Add language instruction if specified
language_instruction = _build_language_instruction(language)
return f"""
Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
# Part 1: Base system prompt for answering without citations
DEFAULT_QNA_BASE_PROMPT = """Today's date: {date}
You are SurfSense, an advanced AI research assistant that provides detailed, well-researched answers to user questions by synthesizing information from multiple personal knowledge sources.{language_instruction}
{chat_history_section}
<knowledge_sources>
@ -53,131 +42,100 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel
2. Carefully analyze all provided documents in the <document> sections.
3. Extract relevant information that directly addresses the user's question.
4. Provide a comprehensive, detailed answer using information from the user's personal knowledge sources.
5. For EVERY piece of information you include from the documents, add a citation in the format [citation:knowledge_source_id] where knowledge_source_id is the source_id from the document's metadata.
6. Make sure ALL factual statements from the documents have proper citations.
7. If multiple documents support the same point, include all relevant citations [citation:source_id1], [citation:source_id2].
8. Structure your answer logically and conversationally, as if having a detailed discussion with the user.
9. Use your own words to synthesize and connect ideas, but cite ALL information from the documents.
10. If documents contain conflicting information, acknowledge this and present both perspectives with appropriate citations.
11. If the user's question cannot be fully answered with the provided documents, clearly state what information is missing.
12. Provide actionable insights and practical information when relevant to the user's question.
13. Use the chat history to maintain conversation continuity and refer to previous discussions when relevant.
14. CRITICAL: You MUST use the exact source_id value from each document's metadata for citations. Do not create your own citation numbers.
15. CRITICAL: Every citation MUST be in the format [citation:knowledge_source_id] where knowledge_source_id is the exact source_id value.
16. CRITICAL: Never modify or change the source_id - always use the original values exactly as provided in the metadata.
17. CRITICAL: Do not return citations as clickable links.
18. CRITICAL: Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
19. CRITICAL: Citations must ONLY appear as [citation:source_id] or [citation:source_id1], [citation:source_id2] format - never with parentheses, hyperlinks, or other formatting.
20. CRITICAL: Never make up source IDs. Only use source_id values that are explicitly provided in the document metadata.
21. CRITICAL: If you are unsure about a source_id, do not include a citation rather than guessing or making one up.
22. CRITICAL: Remember that all knowledge sources contain personal information - provide answers that reflect this personal context.
23. CRITICAL: Be conversational and engaging while maintaining accuracy and proper citations.
5. Structure your answer logically and conversationally, as if having a detailed discussion with the user.
6. Use your own words to synthesize and connect ideas from the documents.
7. If documents contain conflicting information, acknowledge this and present both perspectives.
8. If the user's question cannot be fully answered with the provided documents, clearly state what information is missing.
9. Provide actionable insights and practical information when relevant to the user's question.
10. Use the chat history to maintain conversation continuity and refer to previous discussions when relevant.
11. Remember that all knowledge sources contain personal information - provide answers that reflect this personal context.
12. Be conversational and engaging while maintaining accuracy.
</instructions>
<format>
- Write in a clear, conversational tone suitable for detailed Q&A discussions
- Provide comprehensive answers that thoroughly address the user's question
- Use appropriate paragraphs and structure for readability
- Every fact from the documents must have a citation in the format [citation:knowledge_source_id] where knowledge_source_id is the EXACT source_id from the document's metadata
- Citations should appear at the end of the sentence containing the information they support
- Multiple citations should be separated by commas: [citation:source_id1], [citation:source_id2], [citation:source_id3]
- No need to return references section. Just citations in answer.
- NEVER create your own citation format - use the exact source_id values from the documents in the [citation:source_id] format
- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
- NEVER make up source IDs if you are unsure about the source_id. It is better to omit the citation than to guess
- ALWAYS provide personalized answers that reflect the user's own knowledge and context
- Be thorough and detailed in your explanations while remaining focused on the user's specific question
- If asking follow-up questions would be helpful, suggest them at the end of your response
</format>
<input_example>
<documents>
<document>
<metadata>
<source_id>5</source_id>
<source_type>GITHUB_CONNECTOR</source_type>
</metadata>
<content>
Python's asyncio library provides tools for writing concurrent code using the async/await syntax. It's particularly useful for I/O-bound and high-level structured network code.
</content>
</document>
<document>
<metadata>
<source_id>12</source_id>
<source_type>YOUTUBE_VIDEO</source_type>
</metadata>
<content>
Asyncio can improve performance by allowing other code to run while waiting for I/O operations to complete. However, it's not suitable for CPU-bound tasks as it runs on a single thread.
</content>
</document>
</documents>
User Question: "How does Python asyncio work and when should I use it?"
</input_example>
<output_example>
Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [citation:5]. It's particularly useful for I/O-bound and high-level structured network code [citation:5].
The key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:12]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [citation:12]. For computationally intensive work, you'd want to use multiprocessing instead.
Would you like me to explain more about specific asyncio patterns or help you determine if asyncio is right for a particular project you're working on?
</output_example>
<incorrect_citation_formats>
DO NOT use any of these incorrect citation formats:
- Using parentheses and markdown links: ([citation:5](https://github.com/MODSetter/SurfSense))
- Using parentheses around brackets: ([citation:5])
- Using hyperlinked text: [link to source 5](https://example.com)
- Using footnote style: ... library¹
- Making up source IDs when source_id is unknown
- Using old IEEE format: [1], [2], [3]
- Using source types instead of IDs: [citation:GITHUB_CONNECTOR] instead of [citation:5]
</incorrect_citation_formats>
<correct_citation_formats>
ONLY use the format [citation:source_id] or multiple citations [citation:source_id1], [citation:source_id2], [citation:source_id3]
</correct_citation_formats>
<user_query_instructions>
When you see a user query, focus exclusively on providing a detailed, comprehensive answer using information from the provided documents, which contain the user's personal knowledge and data.
Make sure your response:
1. Considers the chat history for context and conversation continuity
2. Directly and thoroughly answers the user's question with personalized information from their own knowledge sources
3. Uses proper citations for all information from documents
4. Is conversational, engaging, and detailed
5. Acknowledges the personal nature of the information being provided
6. Offers follow-up suggestions when appropriate
3. Is conversational, engaging, and detailed
4. Acknowledges the personal nature of the information being provided
5. Offers follow-up suggestions when appropriate
</user_query_instructions>
"""
# Part 2: Citation-specific instructions to add citation capabilities
DEFAULT_QNA_CITATION_INSTRUCTIONS = """
<citation_instructions>
CRITICAL CITATION REQUIREMENTS:
def get_qna_no_documents_system_prompt(
chat_history: str | None = None, language: str | None = None
):
chat_history_section = (
f"""
<chat_history>
{chat_history if chat_history else "NO CHAT HISTORY PROVIDED"}
</chat_history>
1. For EVERY piece of information you include from the documents, add a citation in the format [citation:knowledge_source_id] where knowledge_source_id is the source_id from the document's metadata.
2. Make sure ALL factual statements from the documents have proper citations.
3. If multiple documents support the same point, include all relevant citations [citation:source_id1], [citation:source_id2].
4. You MUST use the exact source_id value from each document's metadata for citations. Do not create your own citation numbers.
5. Every citation MUST be in the format [citation:knowledge_source_id] where knowledge_source_id is the exact source_id value.
6. Never modify or change the source_id - always use the original values exactly as provided in the metadata.
7. Do not return citations as clickable links.
8. Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
9. Citations must ONLY appear as [citation:source_id] or [citation:source_id1], [citation:source_id2] format - never with parentheses, hyperlinks, or other formatting.
10. Never make up source IDs. Only use source_id values that are explicitly provided in the document metadata.
11. If you are unsure about a source_id, do not include a citation rather than guessing or making one up.
<citation_format>
- Every fact from the documents must have a citation in the format [citation:knowledge_source_id] where knowledge_source_id is the EXACT source_id from the document's metadata
- Citations should appear at the end of the sentence containing the information they support
- Multiple citations should be separated by commas: [citation:source_id1], [citation:source_id2], [citation:source_id3]
- No need to return references section. Just citations in answer.
- NEVER create your own citation format - use the exact source_id values from the documents in the [citation:source_id] format
- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
- NEVER make up source IDs if you are unsure about the source_id. It is better to omit the citation than to guess
</citation_format>
<citation_examples>
CORRECT citation formats:
- [citation:5]
- [citation:source_id1], [citation:source_id2], [citation:source_id3]
INCORRECT citation formats (DO NOT use):
- Using parentheses and markdown links: ([citation:5](https://github.com/MODSetter/SurfSense))
- Using parentheses around brackets: ([citation:5])
- Using hyperlinked text: [link to source 5](https://example.com)
- Using footnote style: ... library¹
- Making up source IDs when source_id is unknown
- Using old IEEE format: [1], [2], [3]
- Using source types instead of IDs: [citation:GITHUB_CONNECTOR] instead of [citation:5]
</citation_examples>
<citation_output_example>
Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [citation:5]. It's particularly useful for I/O-bound and high-level structured network code [citation:5].
The key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:12]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [citation:12]. For computationally intensive work, you'd want to use multiprocessing instead.
</citation_output_example>
</citation_instructions>
"""
if chat_history is not None
else """
<chat_history>
NO CHAT HISTORY PROVIDED
</chat_history>
"""
)
# Add language instruction if specified
language_instruction = _build_language_instruction(language)
# Part 3: User's custom instructions (empty by default, can be set by user from UI)
DEFAULT_QNA_CUSTOM_INSTRUCTIONS = ""
return f"""
Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
# Full prompt with all parts combined (for backward compatibility and migration)
DEFAULT_QNA_CITATION_PROMPT = (
DEFAULT_QNA_BASE_PROMPT
+ DEFAULT_QNA_CITATION_INSTRUCTIONS
+ DEFAULT_QNA_CUSTOM_INSTRUCTIONS
)
DEFAULT_QNA_NO_DOCUMENTS_PROMPT = """Today's date: {date}
You are SurfSense, an advanced AI research assistant that provides helpful, detailed answers to user questions in a conversational manner.{language_instruction}
{chat_history_section}
<context>

View file

@ -1,8 +1,11 @@
import datetime
from typing import Any
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.runnables import RunnableConfig
from sqlalchemy import select
from app.db import SearchSpace
from app.services.reranker_service import RerankerService
from ..utils import (
@ -12,10 +15,53 @@ from ..utils import (
optimize_documents_for_token_limit,
)
from .configuration import Configuration
from .prompts import get_qna_citation_system_prompt, get_qna_no_documents_system_prompt
from .default_prompts import (
DEFAULT_QNA_BASE_PROMPT,
DEFAULT_QNA_CITATION_INSTRUCTIONS,
DEFAULT_QNA_NO_DOCUMENTS_PROMPT,
)
from .state import State
def _build_language_instruction(language: str | None = None):
"""Build language instruction for prompts."""
if language:
return f"\n\nIMPORTANT: Please respond in {language} language. All your responses, explanations, and analysis should be written in {language}."
return ""
def _build_chat_history_section(chat_history: str | None = None):
"""Build chat history section for prompts."""
if chat_history:
return f"""
<chat_history>
{chat_history if chat_history else "NO CHAT HISTORY PROVIDED"}
</chat_history>
"""
return """
<chat_history>
NO CHAT HISTORY PROVIDED
</chat_history>
"""
def _format_system_prompt(
    prompt_template: str,
    chat_history: str | None = None,
    language: str | None = None,
) -> str:
    """Fill the dynamic placeholders of a system prompt template.

    Only the placeholders ``{date}``, ``{language_instruction}`` and
    ``{chat_history_section}`` are substituted. The escapes ``{{`` and ``}}``
    collapse to single braces, as ``str.format`` would do. Every other brace
    is left untouched.

    Plain ``str.format`` is deliberately not used: callers concatenate
    user-supplied custom instructions into ``prompt_template``, and any stray
    ``{`` or ``}`` in that text (for example a JSON snippet) would make
    ``str.format`` raise ``KeyError``/``ValueError``/``IndexError``.

    Args:
        prompt_template: Template text containing the placeholders above.
        chat_history: Chat history text, passed to
            ``_build_chat_history_section``.
        language: Response language, passed to
            ``_build_language_instruction``.

    Returns:
        The template with the known placeholders replaced.
    """
    # Local import keeps this helper self-contained without touching the
    # module's import block.
    import re

    substitutions = {
        "date": datetime.datetime.now().strftime("%Y-%m-%d"),
        "language_instruction": _build_language_instruction(language),
        "chat_history_section": _build_chat_history_section(chat_history),
    }

    def _substitute(match):
        token = match.group(0)
        if token == "{{":
            return "{"
        if token == "}}":
            return "}"
        return substitutions[match.group(1)]

    # A single pass means substituted values are never re-scanned, so chat
    # history that happens to contain "{date}" is inserted verbatim.
    return re.sub(
        r"\{\{|\}\}|\{(date|language_instruction|chat_history_section)\}",
        _substitute,
        prompt_template,
    )
async def rerank_documents(state: State, config: RunnableConfig) -> dict[str, Any]:
"""
Rerank the documents based on relevance to the user's question.
@ -105,6 +151,33 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
user_id = configuration.user_id
search_space_id = configuration.search_space_id
language = configuration.language
# Fetch search space to get QnA configuration
result = await state.db_session.execute(
select(SearchSpace).where(SearchSpace.id == search_space_id)
)
search_space = result.scalar_one_or_none()
if not search_space:
error_message = f"Search space {search_space_id} not found"
print(error_message)
raise RuntimeError(error_message)
# Get QnA configuration from search space
citations_enabled = search_space.citations_enabled
custom_instructions_text = search_space.qna_custom_instructions or ""
# Use constants for base prompt and citation instructions
qna_base_prompt = DEFAULT_QNA_BASE_PROMPT
qna_citation_instructions = (
DEFAULT_QNA_CITATION_INSTRUCTIONS if citations_enabled else ""
)
qna_custom_instructions = (
f"\n<special_important_custom_instructions>\n{custom_instructions_text}\n</special_important_custom_instructions>"
if custom_instructions_text
else ""
)
# Get user's fast LLM
llm = await get_user_fast_llm(state.db_session, user_id, search_space_id)
if not llm:
@ -117,6 +190,11 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
chat_history_str = langchain_chat_history_to_str(state.chat_history)
if has_documents_initially:
# Compose the full citation prompt: base + citation instructions + custom instructions
full_citation_prompt_template = (
qna_base_prompt + qna_citation_instructions + qna_custom_instructions
)
# Create base message template for token calculation (without documents)
base_human_message_template = f"""
@ -129,8 +207,8 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
"""
# Use initial system prompt for token calculation
initial_system_prompt = get_qna_citation_system_prompt(
chat_history_str, language
initial_system_prompt = _format_system_prompt(
full_citation_prompt_template, chat_history_str, language
)
base_messages = [
SystemMessage(content=initial_system_prompt),
@ -149,11 +227,21 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
has_documents = False
# Choose system prompt based on final document availability
system_prompt = (
get_qna_citation_system_prompt(chat_history_str, language)
if has_documents
else get_qna_no_documents_system_prompt(chat_history_str, language)
)
# With documents: use base + citation instructions + custom instructions
# Without documents: use the default no-documents prompt from constants
if has_documents:
full_citation_prompt_template = (
qna_base_prompt + qna_citation_instructions + qna_custom_instructions
)
system_prompt = _format_system_prompt(
full_citation_prompt_template, chat_history_str, language
)
else:
system_prompt = _format_system_prompt(
DEFAULT_QNA_NO_DOCUMENTS_PROMPT + qna_custom_instructions,
chat_history_str,
language,
)
# Generate documents section
documents_text = (