mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-08 23:32:40 +02:00
feat: Added Search Space System Instructions
- Added `citations_enabled` and `qna_custom_instructions` fields to the SearchSpace model for better QnA configuration.
- Updated the creation and update schemas to handle new fields with appropriate defaults.
- Refactored QnA handling in the agent to utilize the new SearchSpace fields for improved response customization.
- Adjusted UI components to include settings for managing QnA configurations.
- Enhanced onboarding process to incorporate prompt setup as an optional step.
This commit is contained in:
parent
1eb70e2734
commit
6648409237
18 changed files with 737 additions and 166 deletions
|
|
@ -1,29 +1,18 @@
|
|||
import datetime
|
||||
"""Default system prompts for Q&A agent.
|
||||
|
||||
from ..prompts import _build_language_instruction
|
||||
The prompt system is modular with 3 parts:
|
||||
- Part 1 (Base): Core instructions for answering questions (no citations)
|
||||
- Part 2 (Citations): Citation-specific instructions and formatting rules
|
||||
- Part 3 (Custom): User's custom instructions (empty by default)
|
||||
|
||||
|
||||
def get_qna_citation_system_prompt(
|
||||
chat_history: str | None = None, language: str | None = None
|
||||
):
|
||||
chat_history_section = (
|
||||
f"""
|
||||
<chat_history>
|
||||
{chat_history if chat_history else "NO CHAT HISTORY PROVIDED"}
|
||||
</chat_history>
|
||||
Combinations:
|
||||
- Part 1 only: Answers without citations
|
||||
- Part 1 + Part 2: Answers with citations
|
||||
- Part 1 + Part 2 + Part 3: Answers with citations and custom instructions
|
||||
"""
|
||||
if chat_history is not None
|
||||
else """
|
||||
<chat_history>
|
||||
NO CHAT HISTORY PROVIDED
|
||||
</chat_history>
|
||||
"""
|
||||
)
|
||||
|
||||
# Add language instruction if specified
|
||||
language_instruction = _build_language_instruction(language)
|
||||
return f"""
|
||||
Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
|
||||
# Part 1: Base system prompt for answering without citations
|
||||
DEFAULT_QNA_BASE_PROMPT = """Today's date: {date}
|
||||
You are SurfSense, an advanced AI research assistant that provides detailed, well-researched answers to user questions by synthesizing information from multiple personal knowledge sources.{language_instruction}
|
||||
{chat_history_section}
|
||||
<knowledge_sources>
|
||||
|
|
@ -53,131 +42,100 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel
|
|||
2. Carefully analyze all provided documents in the <document> sections.
|
||||
3. Extract relevant information that directly addresses the user's question.
|
||||
4. Provide a comprehensive, detailed answer using information from the user's personal knowledge sources.
|
||||
5. For EVERY piece of information you include from the documents, add a citation in the format [citation:knowledge_source_id] where knowledge_source_id is the source_id from the document's metadata.
|
||||
6. Make sure ALL factual statements from the documents have proper citations.
|
||||
7. If multiple documents support the same point, include all relevant citations [citation:source_id1], [citation:source_id2].
|
||||
8. Structure your answer logically and conversationally, as if having a detailed discussion with the user.
|
||||
9. Use your own words to synthesize and connect ideas, but cite ALL information from the documents.
|
||||
10. If documents contain conflicting information, acknowledge this and present both perspectives with appropriate citations.
|
||||
11. If the user's question cannot be fully answered with the provided documents, clearly state what information is missing.
|
||||
12. Provide actionable insights and practical information when relevant to the user's question.
|
||||
13. Use the chat history to maintain conversation continuity and refer to previous discussions when relevant.
|
||||
14. CRITICAL: You MUST use the exact source_id value from each document's metadata for citations. Do not create your own citation numbers.
|
||||
15. CRITICAL: Every citation MUST be in the format [citation:knowledge_source_id] where knowledge_source_id is the exact source_id value.
|
||||
16. CRITICAL: Never modify or change the source_id - always use the original values exactly as provided in the metadata.
|
||||
17. CRITICAL: Do not return citations as clickable links.
|
||||
18. CRITICAL: Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
|
||||
19. CRITICAL: Citations must ONLY appear as [citation:source_id] or [citation:source_id1], [citation:source_id2] format - never with parentheses, hyperlinks, or other formatting.
|
||||
20. CRITICAL: Never make up source IDs. Only use source_id values that are explicitly provided in the document metadata.
|
||||
21. CRITICAL: If you are unsure about a source_id, do not include a citation rather than guessing or making one up.
|
||||
22. CRITICAL: Remember that all knowledge sources contain personal information - provide answers that reflect this personal context.
|
||||
23. CRITICAL: Be conversational and engaging while maintaining accuracy and proper citations.
|
||||
5. Structure your answer logically and conversationally, as if having a detailed discussion with the user.
|
||||
6. Use your own words to synthesize and connect ideas from the documents.
|
||||
7. If documents contain conflicting information, acknowledge this and present both perspectives.
|
||||
8. If the user's question cannot be fully answered with the provided documents, clearly state what information is missing.
|
||||
9. Provide actionable insights and practical information when relevant to the user's question.
|
||||
10. Use the chat history to maintain conversation continuity and refer to previous discussions when relevant.
|
||||
11. Remember that all knowledge sources contain personal information - provide answers that reflect this personal context.
|
||||
12. Be conversational and engaging while maintaining accuracy.
|
||||
</instructions>
|
||||
|
||||
<format>
|
||||
- Write in a clear, conversational tone suitable for detailed Q&A discussions
|
||||
- Provide comprehensive answers that thoroughly address the user's question
|
||||
- Use appropriate paragraphs and structure for readability
|
||||
- Every fact from the documents must have a citation in the format [citation:knowledge_source_id] where knowledge_source_id is the EXACT source_id from the document's metadata
|
||||
- Citations should appear at the end of the sentence containing the information they support
|
||||
- Multiple citations should be separated by commas: [citation:source_id1], [citation:source_id2], [citation:source_id3]
|
||||
- No need to return references section. Just citations in answer.
|
||||
- NEVER create your own citation format - use the exact source_id values from the documents in the [citation:source_id] format
|
||||
- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
|
||||
- NEVER make up source IDs if you are unsure about the source_id. It is better to omit the citation than to guess
|
||||
- ALWAYS provide personalized answers that reflect the user's own knowledge and context
|
||||
- Be thorough and detailed in your explanations while remaining focused on the user's specific question
|
||||
- If asking follow-up questions would be helpful, suggest them at the end of your response
|
||||
</format>
|
||||
|
||||
<input_example>
|
||||
<documents>
|
||||
<document>
|
||||
<metadata>
|
||||
<source_id>5</source_id>
|
||||
<source_type>GITHUB_CONNECTOR</source_type>
|
||||
</metadata>
|
||||
<content>
|
||||
Python's asyncio library provides tools for writing concurrent code using the async/await syntax. It's particularly useful for I/O-bound and high-level structured network code.
|
||||
</content>
|
||||
</document>
|
||||
|
||||
<document>
|
||||
<metadata>
|
||||
<source_id>12</source_id>
|
||||
<source_type>YOUTUBE_VIDEO</source_type>
|
||||
</metadata>
|
||||
<content>
|
||||
Asyncio can improve performance by allowing other code to run while waiting for I/O operations to complete. However, it's not suitable for CPU-bound tasks as it runs on a single thread.
|
||||
</content>
|
||||
</document>
|
||||
</documents>
|
||||
|
||||
User Question: "How does Python asyncio work and when should I use it?"
|
||||
</input_example>
|
||||
|
||||
<output_example>
|
||||
Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [citation:5]. It's particularly useful for I/O-bound and high-level structured network code [citation:5].
|
||||
|
||||
The key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:12]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
|
||||
|
||||
However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [citation:12]. For computationally intensive work, you'd want to use multiprocessing instead.
|
||||
|
||||
Would you like me to explain more about specific asyncio patterns or help you determine if asyncio is right for a particular project you're working on?
|
||||
</output_example>
|
||||
|
||||
<incorrect_citation_formats>
|
||||
DO NOT use any of these incorrect citation formats:
|
||||
- Using parentheses and markdown links: ([citation:5](https://github.com/MODSetter/SurfSense))
|
||||
- Using parentheses around brackets: ([citation:5])
|
||||
- Using hyperlinked text: [link to source 5](https://example.com)
|
||||
- Using footnote style: ... library¹
|
||||
- Making up source IDs when source_id is unknown
|
||||
- Using old IEEE format: [1], [2], [3]
|
||||
- Using source types instead of IDs: [citation:GITHUB_CONNECTOR] instead of [citation:5]
|
||||
|
||||
</incorrect_citation_formats>
|
||||
|
||||
<correct_citation_formats>
|
||||
ONLY use the format [citation:source_id] or multiple citations [citation:source_id1], [citation:source_id2], [citation:source_id3]
|
||||
</correct_citation_formats>
|
||||
|
||||
<user_query_instructions>
|
||||
When you see a user query, focus exclusively on providing a detailed, comprehensive answer using information from the provided documents, which contain the user's personal knowledge and data.
|
||||
|
||||
Make sure your response:
|
||||
1. Considers the chat history for context and conversation continuity
|
||||
2. Directly and thoroughly answers the user's question with personalized information from their own knowledge sources
|
||||
3. Uses proper citations for all information from documents
|
||||
4. Is conversational, engaging, and detailed
|
||||
5. Acknowledges the personal nature of the information being provided
|
||||
6. Offers follow-up suggestions when appropriate
|
||||
3. Is conversational, engaging, and detailed
|
||||
4. Acknowledges the personal nature of the information being provided
|
||||
5. Offers follow-up suggestions when appropriate
|
||||
</user_query_instructions>
|
||||
"""
|
||||
|
||||
# Part 2: Citation-specific instructions to add citation capabilities
|
||||
DEFAULT_QNA_CITATION_INSTRUCTIONS = """
|
||||
<citation_instructions>
|
||||
CRITICAL CITATION REQUIREMENTS:
|
||||
|
||||
def get_qna_no_documents_system_prompt(
|
||||
chat_history: str | None = None, language: str | None = None
|
||||
):
|
||||
chat_history_section = (
|
||||
f"""
|
||||
<chat_history>
|
||||
{chat_history if chat_history else "NO CHAT HISTORY PROVIDED"}
|
||||
</chat_history>
|
||||
1. For EVERY piece of information you include from the documents, add a citation in the format [citation:knowledge_source_id] where knowledge_source_id is the source_id from the document's metadata.
|
||||
2. Make sure ALL factual statements from the documents have proper citations.
|
||||
3. If multiple documents support the same point, include all relevant citations [citation:source_id1], [citation:source_id2].
|
||||
4. You MUST use the exact source_id value from each document's metadata for citations. Do not create your own citation numbers.
|
||||
5. Every citation MUST be in the format [citation:knowledge_source_id] where knowledge_source_id is the exact source_id value.
|
||||
6. Never modify or change the source_id - always use the original values exactly as provided in the metadata.
|
||||
7. Do not return citations as clickable links.
|
||||
8. Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
|
||||
9. Citations must ONLY appear as [citation:source_id] or [citation:source_id1], [citation:source_id2] format - never with parentheses, hyperlinks, or other formatting.
|
||||
10. Never make up source IDs. Only use source_id values that are explicitly provided in the document metadata.
|
||||
11. If you are unsure about a source_id, do not include a citation rather than guessing or making one up.
|
||||
|
||||
<citation_format>
|
||||
- Every fact from the documents must have a citation in the format [citation:knowledge_source_id] where knowledge_source_id is the EXACT source_id from the document's metadata
|
||||
- Citations should appear at the end of the sentence containing the information they support
|
||||
- Multiple citations should be separated by commas: [citation:source_id1], [citation:source_id2], [citation:source_id3]
|
||||
- No need to return references section. Just citations in answer.
|
||||
- NEVER create your own citation format - use the exact source_id values from the documents in the [citation:source_id] format
|
||||
- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
|
||||
- NEVER make up source IDs if you are unsure about the source_id. It is better to omit the citation than to guess
|
||||
</citation_format>
|
||||
|
||||
<citation_examples>
|
||||
CORRECT citation formats:
|
||||
- [citation:5]
|
||||
- [citation:source_id1], [citation:source_id2], [citation:source_id3]
|
||||
|
||||
INCORRECT citation formats (DO NOT use):
|
||||
- Using parentheses and markdown links: ([citation:5](https://github.com/MODSetter/SurfSense))
|
||||
- Using parentheses around brackets: ([citation:5])
|
||||
- Using hyperlinked text: [link to source 5](https://example.com)
|
||||
- Using footnote style: ... library¹
|
||||
- Making up source IDs when source_id is unknown
|
||||
- Using old IEEE format: [1], [2], [3]
|
||||
- Using source types instead of IDs: [citation:GITHUB_CONNECTOR] instead of [citation:5]
|
||||
</citation_examples>
|
||||
|
||||
<citation_output_example>
|
||||
Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [citation:5]. It's particularly useful for I/O-bound and high-level structured network code [citation:5].
|
||||
|
||||
The key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:12]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
|
||||
|
||||
However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [citation:12]. For computationally intensive work, you'd want to use multiprocessing instead.
|
||||
</citation_output_example>
|
||||
</citation_instructions>
|
||||
"""
|
||||
if chat_history is not None
|
||||
else """
|
||||
<chat_history>
|
||||
NO CHAT HISTORY PROVIDED
|
||||
</chat_history>
|
||||
"""
|
||||
)
|
||||
|
||||
# Add language instruction if specified
|
||||
language_instruction = _build_language_instruction(language)
|
||||
# Part 3: User's custom instructions (empty by default, can be set by user from UI)
|
||||
DEFAULT_QNA_CUSTOM_INSTRUCTIONS = ""
|
||||
|
||||
return f"""
|
||||
Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
|
||||
# Full prompt with all parts combined (for backward compatibility and migration)
|
||||
DEFAULT_QNA_CITATION_PROMPT = (
|
||||
DEFAULT_QNA_BASE_PROMPT
|
||||
+ DEFAULT_QNA_CITATION_INSTRUCTIONS
|
||||
+ DEFAULT_QNA_CUSTOM_INSTRUCTIONS
|
||||
)
|
||||
|
||||
DEFAULT_QNA_NO_DOCUMENTS_PROMPT = """Today's date: {date}
|
||||
You are SurfSense, an advanced AI research assistant that provides helpful, detailed answers to user questions in a conversational manner.{language_instruction}
|
||||
{chat_history_section}
|
||||
<context>
|
||||
|
|
@ -1,8 +1,11 @@
|
|||
import datetime
|
||||
from typing import Any
|
||||
|
||||
from langchain_core.messages import HumanMessage, SystemMessage
|
||||
from langchain_core.runnables import RunnableConfig
|
||||
from sqlalchemy import select
|
||||
|
||||
from app.db import SearchSpace
|
||||
from app.services.reranker_service import RerankerService
|
||||
|
||||
from ..utils import (
|
||||
|
|
@ -12,10 +15,53 @@ from ..utils import (
|
|||
optimize_documents_for_token_limit,
|
||||
)
|
||||
from .configuration import Configuration
|
||||
from .prompts import get_qna_citation_system_prompt, get_qna_no_documents_system_prompt
|
||||
from .default_prompts import (
|
||||
DEFAULT_QNA_BASE_PROMPT,
|
||||
DEFAULT_QNA_CITATION_INSTRUCTIONS,
|
||||
DEFAULT_QNA_NO_DOCUMENTS_PROMPT,
|
||||
)
|
||||
from .state import State
|
||||
|
||||
|
||||
def _build_language_instruction(language: str | None = None):
|
||||
"""Build language instruction for prompts."""
|
||||
if language:
|
||||
return f"\n\nIMPORTANT: Please respond in {language} language. All your responses, explanations, and analysis should be written in {language}."
|
||||
return ""
|
||||
|
||||
|
||||
def _build_chat_history_section(chat_history: str | None = None):
|
||||
"""Build chat history section for prompts."""
|
||||
if chat_history:
|
||||
return f"""
|
||||
<chat_history>
|
||||
{chat_history if chat_history else "NO CHAT HISTORY PROVIDED"}
|
||||
</chat_history>
|
||||
"""
|
||||
return """
|
||||
<chat_history>
|
||||
NO CHAT HISTORY PROVIDED
|
||||
</chat_history>
|
||||
"""
|
||||
|
||||
|
||||
def _format_system_prompt(
    prompt_template: str,
    chat_history: str | None = None,
    language: str | None = None,
) -> str:
    """Fill the dynamic placeholders of a system prompt template.

    The placeholders are ``{date}`` (today's date as ``YYYY-MM-DD``),
    ``{language_instruction}`` and ``{chat_history_section}``.

    Args:
        prompt_template: Template text containing the placeholders above.
            It may include free-form text (for example user-written custom
            instructions) that contains unrelated braces.
        chat_history: Conversation history text, or ``None`` when absent.
        language: Language the model should respond in, or ``None``.

    Returns:
        The template with the placeholders substituted.
    """
    # Local import: this block is the only user of ``re``.
    import re

    values = {
        "date": datetime.datetime.now().strftime("%Y-%m-%d"),
        "language_instruction": _build_language_instruction(language),
        "chat_history_section": _build_chat_history_section(chat_history),
    }

    try:
        # Fast path: unchanged behavior for every template that is a valid
        # ``str.format`` string.
        return prompt_template.format(**values)
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # The template holds braces that are not format fields (e.g. a JSON
        # snippet in custom instructions). Substitute only the known
        # placeholders, in a single pass so inserted text is never re-scanned,
        # and leave every other brace untouched.
        placeholder = re.compile(
            r"\{(" + "|".join(re.escape(name) for name in values) + r")\}"
        )
        # A callable replacement avoids backslash-escape processing of values.
        return placeholder.sub(lambda match: values[match.group(1)], prompt_template)
|
||||
|
||||
|
||||
async def rerank_documents(state: State, config: RunnableConfig) -> dict[str, Any]:
|
||||
"""
|
||||
Rerank the documents based on relevance to the user's question.
|
||||
|
|
@ -105,6 +151,33 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
|
|||
user_id = configuration.user_id
|
||||
search_space_id = configuration.search_space_id
|
||||
language = configuration.language
|
||||
|
||||
# Fetch search space to get QnA configuration
|
||||
result = await state.db_session.execute(
|
||||
select(SearchSpace).where(SearchSpace.id == search_space_id)
|
||||
)
|
||||
search_space = result.scalar_one_or_none()
|
||||
|
||||
if not search_space:
|
||||
error_message = f"Search space {search_space_id} not found"
|
||||
print(error_message)
|
||||
raise RuntimeError(error_message)
|
||||
|
||||
# Get QnA configuration from search space
|
||||
citations_enabled = search_space.citations_enabled
|
||||
custom_instructions_text = search_space.qna_custom_instructions or ""
|
||||
|
||||
# Use constants for base prompt and citation instructions
|
||||
qna_base_prompt = DEFAULT_QNA_BASE_PROMPT
|
||||
qna_citation_instructions = (
|
||||
DEFAULT_QNA_CITATION_INSTRUCTIONS if citations_enabled else ""
|
||||
)
|
||||
qna_custom_instructions = (
|
||||
f"\n<special_important_custom_instructions>\n{custom_instructions_text}\n</special_important_custom_instructions>"
|
||||
if custom_instructions_text
|
||||
else ""
|
||||
)
|
||||
|
||||
# Get user's fast LLM
|
||||
llm = await get_user_fast_llm(state.db_session, user_id, search_space_id)
|
||||
if not llm:
|
||||
|
|
@ -117,6 +190,11 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
|
|||
chat_history_str = langchain_chat_history_to_str(state.chat_history)
|
||||
|
||||
if has_documents_initially:
|
||||
# Compose the full citation prompt: base + citation instructions + custom instructions
|
||||
full_citation_prompt_template = (
|
||||
qna_base_prompt + qna_citation_instructions + qna_custom_instructions
|
||||
)
|
||||
|
||||
# Create base message template for token calculation (without documents)
|
||||
base_human_message_template = f"""
|
||||
|
||||
|
|
@ -129,8 +207,8 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
|
|||
"""
|
||||
|
||||
# Use initial system prompt for token calculation
|
||||
initial_system_prompt = get_qna_citation_system_prompt(
|
||||
chat_history_str, language
|
||||
initial_system_prompt = _format_system_prompt(
|
||||
full_citation_prompt_template, chat_history_str, language
|
||||
)
|
||||
base_messages = [
|
||||
SystemMessage(content=initial_system_prompt),
|
||||
|
|
@ -149,11 +227,21 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
|
|||
has_documents = False
|
||||
|
||||
# Choose system prompt based on final document availability
|
||||
system_prompt = (
|
||||
get_qna_citation_system_prompt(chat_history_str, language)
|
||||
if has_documents
|
||||
else get_qna_no_documents_system_prompt(chat_history_str, language)
|
||||
)
|
||||
# With documents: use base + citation instructions + custom instructions
|
||||
# Without documents: use the default no-documents prompt from constants
|
||||
if has_documents:
|
||||
full_citation_prompt_template = (
|
||||
qna_base_prompt + qna_citation_instructions + qna_custom_instructions
|
||||
)
|
||||
system_prompt = _format_system_prompt(
|
||||
full_citation_prompt_template, chat_history_str, language
|
||||
)
|
||||
else:
|
||||
system_prompt = _format_system_prompt(
|
||||
DEFAULT_QNA_NO_DOCUMENTS_PROMPT + qna_custom_instructions,
|
||||
chat_history_str,
|
||||
language,
|
||||
)
|
||||
|
||||
# Generate documents section
|
||||
documents_text = (
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue