feat: Added Q/A Mode in Research Agent

2026-05-08 23:32:40 +02:00 · 2025-06-03 00:10:35 -07:00 · 2025-06-03 00:10:35 -07:00 · 0c07898f4a
commit 0c07898f4a
parent 4820caf901
18 changed files with 792 additions and 42 deletions
--- a/surfsense_backend/app/agents/researcher/qna_agent/__init__.py
+++ b/surfsense_backend/app/agents/researcher/qna_agent/__init__.py
@ -0,0 +1,6 @@
+"""QnA Agent.
+"""
+
+from .graph import graph
+
+__all__ = ["graph"]
--- a/surfsense_backend/app/agents/researcher/qna_agent/configuration.py
+++ b/surfsense_backend/app/agents/researcher/qna_agent/configuration.py
@ -0,0 +1,28 @@
+"""Define the configurable parameters for the agent."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, fields
+from typing import Optional, List, Any
+
+from langchain_core.runnables import RunnableConfig
+
+
+@dataclass(kw_only=True)
+class Configuration:
+    """Per-run settings read by the Q&A agent's nodes.
+
+    All fields are keyword-only and none has a default, so every one of
+    them must be supplied when an instance is constructed.
+    """
+
+    # The user's question to answer
+    user_query: str
+    # Documents provided directly to the agent for answering
+    relevant_documents: List[Any]
+    # User identifier
+    user_id: str
+    # Search space identifier
+    search_space_id: int
+
+    @classmethod
+    def from_runnable_config(
+        cls, config: Optional[RunnableConfig] = None
+    ) -> Configuration:
+        """Build a Configuration from the "configurable" entry of ``config``.
+
+        A missing or empty ``config`` (or "configurable" entry) is treated as
+        an empty mapping. Keys that do not name an init field of this
+        dataclass are dropped before the instance is constructed.
+        """
+        settings = {}
+        if config:
+            settings = config.get("configurable") or {}
+
+        accepted = {spec.name for spec in fields(cls) if spec.init}
+
+        kwargs = {}
+        for key, value in settings.items():
+            if key in accepted:
+                kwargs[key] = value
+        return cls(**kwargs)
--- a/surfsense_backend/app/agents/researcher/qna_agent/graph.py
+++ b/surfsense_backend/app/agents/researcher/qna_agent/graph.py
@ -0,0 +1,20 @@
+from langgraph.graph import StateGraph
+from .state import State
+from .nodes import rerank_documents, answer_question
+from .configuration import Configuration
+
+# Define a new graph over State; Configuration is passed as its config schema
+workflow = StateGraph(State, config_schema=Configuration)
+
+# Add the nodes to the graph, registered under the names the edges below use
+workflow.add_node("rerank_documents", rerank_documents)
+workflow.add_node("answer_question", answer_question)
+
+# Connect the nodes as a linear pipeline:
+# __start__ -> rerank_documents -> answer_question -> __end__
+workflow.add_edge("__start__", "rerank_documents")
+workflow.add_edge("rerank_documents", "answer_question")
+workflow.add_edge("answer_question", "__end__")
+
+# Compile the workflow into an executable graph
+graph = workflow.compile()
+graph.name = "SurfSense QnA Agent"  # This defines the custom name in LangSmith
--- a/surfsense_backend/app/agents/researcher/qna_agent/nodes.py
+++ b/surfsense_backend/app/agents/researcher/qna_agent/nodes.py
@ -0,0 +1,148 @@
+from .configuration import Configuration
+from langchain_core.runnables import RunnableConfig
+from .state import State
+from typing import Any, Dict
+from app.config import config as app_config
+from .prompts import get_qna_citation_system_prompt
+from langchain_core.messages import HumanMessage, SystemMessage
+
+async def rerank_documents(state: State, config: RunnableConfig) -> Dict[str, Any]:
+    """
+    Rerank the configured documents by relevance to the user's question.
+
+    Reads ``relevant_documents`` and ``user_query`` from the run
+    configuration. When ``app_config`` exposes a ``reranker_service``, the
+    documents are normalised into plain dicts, handed to the service and
+    ordered by descending "score". Reranking is best-effort: if no service
+    is configured, or any step of the reranking raises, the documents are
+    returned unchanged in their original order.
+
+    Args:
+        state: Current graph state (not read by this node).
+        config: Runnable config carrying the agent's "configurable" values.
+
+    Returns:
+        Dict with a single "reranked_documents" key holding the resulting
+        list (empty when no documents were provided).
+    """
+    # Get configuration and relevant documents
+    configuration = Configuration.from_runnable_config(config)
+    documents = configuration.relevant_documents
+    user_query = configuration.user_query
+
+    # If no documents were provided, return empty list
+    if not documents:
+        return {"reranked_documents": []}
+
+    # The reranker is optional; without one the documents pass through as is
+    reranker_service = getattr(app_config, "reranker_service", None)
+    if not reranker_service:
+        return {"reranked_documents": documents}
+
+    # Fallback result, only replaced once reranking AND sorting both succeeded
+    reranked_docs = documents
+
+    try:
+        # Convert documents to the format expected by the reranker
+        reranker_input_docs = []
+        for i, doc in enumerate(documents):
+            # Tolerate a missing or None "document" entry
+            doc_info = doc.get("document") or {}
+            reranker_input_docs.append(
+                {
+                    "chunk_id": doc.get("chunk_id", f"chunk_{i}"),
+                    "content": doc.get("content", ""),
+                    "score": doc.get("score", 0.0),
+                    "document": {
+                        "id": doc_info.get("id", ""),
+                        "title": doc_info.get("title", ""),
+                        "document_type": doc_info.get("document_type", ""),
+                        "metadata": doc_info.get("metadata", {}),
+                    },
+                }
+            )
+
+        # Rerank documents using the user's query
+        ranked = reranker_service.rerank_documents(user_query, reranker_input_docs)
+
+        # sorted() builds a new list, so a failure here (e.g. the service
+        # returned None) cannot leave a half-processed value behind; the
+        # original documents stay in place as the fallback.
+        reranked_docs = sorted(ranked, key=lambda x: x.get("score", 0), reverse=True)
+
+        print(f"Reranked {len(reranked_docs)} documents for Q&A query: {user_query}")
+    except Exception as e:
+        # Deliberately broad: reranking is an optional refinement and must
+        # never prevent the question from being answered.
+        print(f"Error during reranking: {str(e)}")
+
+    return {"reranked_documents": reranked_docs}
+
+async def answer_question(state: State, config: RunnableConfig) -> Dict[str, Any]:
+    """
+    Answer the user's question using the reranked documents.
+
+    Uses the documents stored in ``state.reranked_documents`` (written by
+    the ``rerank_documents`` node). If that field is ``None``, the
+    configuration's ``relevant_documents`` are used instead. Each document
+    is wrapped in the <document> markup described by the citation system
+    prompt, and the fast LLM is asked for an answer whose citations use the
+    documents' source IDs.
+
+    Args:
+        state: Graph state; ``reranked_documents`` and ``chat_history`` are read.
+        config: Runnable config carrying the agent's "configurable" values.
+
+    Returns:
+        Dict containing the final answer in the "final_answer" key.
+    """
+    # Get configuration
+    configuration = Configuration.from_runnable_config(config)
+    user_query = configuration.user_query
+
+    # Prefer the output of the rerank node so its ordering is actually used.
+    # NOTE(review): assumes the reranker returns dicts that keep the
+    # "content" / "document" keys it was given - confirm against the service.
+    documents = state.reranked_documents
+    if documents is None:
+        documents = configuration.relevant_documents
+
+    # If no documents were provided, return a message indicating this
+    if not documents:
+        return {
+            "final_answer": "I don't have any relevant documents in your personal knowledge base to answer this question. Please try asking about topics covered in your saved content, or add more documents to your knowledge base."
+        }
+
+    # Initialize LLM (only needed once there is something to answer from)
+    llm = app_config.fast_llm_instance
+
+    # Prepare documents for citation formatting
+    formatted_documents = []
+    for doc in documents:
+        # Extract content and metadata; tolerate a missing/None "document"
+        content = doc.get("content", "")
+        doc_info = doc.get("document") or {}
+        document_id = doc_info.get("id")  # Use document ID
+        # An absent or empty type falls back to CRAWLED_URL
+        source_type = doc_info.get("document_type") or "CRAWLED_URL"
+
+        # Format document according to the citation system prompt's expected format
+        formatted_doc = f"""
+        <document>
+            <metadata>
+                <source_id>{document_id}</source_id>
+                <source_type>{source_type}</source_type>
+            </metadata>
+            <content>
+                {content}
+            </content>
+        </document>
+        """
+        formatted_documents.append(formatted_doc)
+
+    # Create the formatted documents text
+    documents_text = "\n".join(formatted_documents)
+
+    # Construct a clear, structured query for the LLM
+    human_message_content = f"""
+    Source material from your personal knowledge base:
+    <documents>
+        {documents_text}
+    </documents>
+    
+    User's question:
+    <user_query>
+        {user_query}
+    </user_query>
+    
+    Please provide a detailed, comprehensive answer to the user's question using the information from their personal knowledge sources. Make sure to cite all information appropriately and engage in a conversational manner.
+    """
+
+    # Create messages for the LLM, including chat history for context.
+    # chat_history is Optional, so treat None as "no history".
+    messages_with_chat_history = list(state.chat_history or []) + [
+        SystemMessage(content=get_qna_citation_system_prompt()),
+        HumanMessage(content=human_message_content)
+    ]
+
+    # Call the LLM and get the response
+    response = await llm.ainvoke(messages_with_chat_history)
+
+    return {"final_answer": response.content}
--- a/surfsense_backend/app/agents/researcher/qna_agent/prompts.py
+++ b/surfsense_backend/app/agents/researcher/qna_agent/prompts.py
@ -0,0 +1,120 @@
+import datetime
+
+
+def get_qna_citation_system_prompt():
+    """Return the Q&A citation system prompt, stamped with today's date.
+
+    The date comes from ``datetime.datetime.now()`` at call time and is
+    formatted as YYYY-MM-DD; the remainder of the prompt is fixed text.
+    """
+    today = datetime.datetime.now().strftime("%Y-%m-%d")
+    return f"""
+Today's date: {today}
+You are SurfSense, an advanced AI research assistant that provides detailed, well-researched answers to user questions by synthesizing information from multiple personal knowledge sources.
+
+<knowledge_sources>
+- EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
+- CRAWLED_URL: "Webpages indexed by SurfSense web crawler" (personally selected websites)
+- FILE: "User-uploaded documents (PDFs, Word, etc.)" (personal files)
+- SLACK_CONNECTOR: "Slack conversations and shared content" (personal workspace communications)
+- NOTION_CONNECTOR: "Notion workspace pages and databases" (personal knowledge management)
+- YOUTUBE_VIDEO: "YouTube video transcripts and metadata" (personally saved videos)
+- GITHUB_CONNECTOR: "GitHub repository content and issues" (personal repositories and interactions)
+- LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project management)
+- DISCORD_CONNECTOR: "Discord server messages and channels" (personal community interactions)
+- TAVILY_API: "Tavily search API results" (personalized search results)
+- LINKUP_API: "Linkup search API results" (personalized search results)
+</knowledge_sources>
+
+<instructions>
+1. Carefully analyze all provided documents in the <document> sections.
+2. Extract relevant information that directly addresses the user's question.
+3. Provide a comprehensive, detailed answer using information from the user's personal knowledge sources.
+4. For EVERY piece of information you include from the documents, add an IEEE-style citation in square brackets [X] where X is the source_id from the document's metadata.
+5. Make sure ALL factual statements from the documents have proper citations.
+6. If multiple documents support the same point, include all relevant citations [X], [Y].
+7. Structure your answer logically and conversationally, as if having a detailed discussion with the user.
+8. Use your own words to synthesize and connect ideas, but cite ALL information from the documents.
+9. If documents contain conflicting information, acknowledge this and present both perspectives with appropriate citations.
+10. If the user's question cannot be fully answered with the provided documents, clearly state what information is missing.
+11. Provide actionable insights and practical information when relevant to the user's question.
+12. CRITICAL: You MUST use the exact source_id value from each document's metadata for citations. Do not create your own citation numbers.
+13. CRITICAL: Every citation MUST be in the IEEE format [X] where X is the exact source_id value.
+14. CRITICAL: Never renumber or reorder citations - always use the original source_id values.
+15. CRITICAL: Do not return citations as clickable links.
+16. CRITICAL: Never format citations as markdown links like "([1](https://example.com))". Always use plain square brackets only.
+17. CRITICAL: Citations must ONLY appear as [X] or [X], [Y], [Z] format - never with parentheses, hyperlinks, or other formatting.
+18. CRITICAL: Never make up citation numbers. Only use source_id values that are explicitly provided in the document metadata.
+19. CRITICAL: If you are unsure about a source_id, do not include a citation rather than guessing or making one up.
+20. CRITICAL: Remember that all knowledge sources contain personal information - provide answers that reflect this personal context.
+21. CRITICAL: Be conversational and engaging while maintaining accuracy and proper citations.
+</instructions>
+
+<format>
+- Write in a clear, conversational tone suitable for detailed Q&A discussions
+- Provide comprehensive answers that thoroughly address the user's question
+- Use appropriate paragraphs and structure for readability
+- Every fact from the documents must have an IEEE-style citation in square brackets [X] where X is the EXACT source_id from the document's metadata
+- Citations should appear at the end of the sentence containing the information they support
+- Multiple citations should be separated by commas: [X], [Y], [Z]
+- No need to return references section. Just citation numbers in answer.
+- NEVER create your own citation numbering system - use the exact source_id values from the documents
+- NEVER format citations as clickable links or as markdown links like "([1](https://example.com))". Always use plain square brackets only
+- NEVER make up citation numbers if you are unsure about the source_id. It is better to omit the citation than to guess
+- ALWAYS provide personalized answers that reflect the user's own knowledge and context
+- Be thorough and detailed in your explanations while remaining focused on the user's specific question
+- If asking follow-up questions would be helpful, suggest them at the end of your response
+</format>
+
+<input_example>
+<documents>
+    <document>
+        <metadata>
+            <source_id>5</source_id>
+            <source_type>GITHUB_CONNECTOR</source_type>
+        </metadata>
+        <content>
+            Python's asyncio library provides tools for writing concurrent code using the async/await syntax. It's particularly useful for I/O-bound and high-level structured network code.
+        </content>
+    </document>
+    
+    <document>
+        <metadata>
+            <source_id>12</source_id>
+            <source_type>YOUTUBE_VIDEO</source_type>
+        </metadata>
+        <content>
+            Asyncio can improve performance by allowing other code to run while waiting for I/O operations to complete. However, it's not suitable for CPU-bound tasks as it runs on a single thread.
+        </content>
+    </document>
+</documents>
+
+User Question: "How does Python asyncio work and when should I use it?"
+</input_example>
+
+<output_example>
+Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [5]. It's particularly useful for I/O-bound and high-level structured network code [5].
+
+The key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [12]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
+
+However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [12]. For computationally intensive work, you'd want to use multiprocessing instead.
+
+Would you like me to explain more about specific asyncio patterns or help you determine if asyncio is right for a particular project you're working on?
+</output_example>
+
+<incorrect_citation_formats>
+DO NOT use any of these incorrect citation formats:
+- Using parentheses and markdown links: ([1](https://github.com/MODSetter/SurfSense))
+- Using parentheses around brackets: ([1])
+- Using hyperlinked text: [link to source 1](https://example.com)
+- Using footnote style: ... library¹
+- Making up citation numbers when source_id is unknown
+
+ONLY use plain square brackets [1] or multiple citations [1], [2], [3]
+</incorrect_citation_formats>
+
+<user_query_instructions>
+When you see a user query, focus exclusively on providing a detailed, comprehensive answer using information from the provided documents, which contain the user's personal knowledge and data.
+
+Make sure your response:
+1. Directly and thoroughly answers the user's question with personalized information from their own knowledge sources
+2. Uses proper citations for all information from documents
+3. Is conversational, engaging, and detailed
+4. Acknowledges the personal nature of the information being provided
+5. Offers follow-up suggestions when appropriate
+</user_query_instructions>
+"""
--- a/surfsense_backend/app/agents/researcher/qna_agent/state.py
+++ b/surfsense_backend/app/agents/researcher/qna_agent/state.py
@ -0,0 +1,25 @@
+"""Define the state structures for the agent."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import List, Optional, Any
+from sqlalchemy.ext.asyncio import AsyncSession
+
+@dataclass
+class State:
+    """Defines the dynamic state for the Q&A agent during execution.
+
+    This state tracks the database session, chat history, and the outputs 
+    generated by the agent's nodes during question answering.
+    See: https://langchain-ai.github.io/langgraph/concepts/low_level/#state
+    for more information.
+    """
+
+    # Runtime context
+    # Async SQLAlchemy session; required, since it has no default.
+    db_session: AsyncSession
+    
+    # Messages placed ahead of the system and human messages when the LLM is
+    # called; defaults to a fresh empty list per instance.
+    chat_history: Optional[List[Any]] = field(default_factory=list)
+    # OUTPUT: Populated by agent nodes
+    # Set from the "reranked_documents" key returned by rerank_documents.
+    reranked_documents: Optional[List[Any]] = None
+    # Set from the "final_answer" key returned by answer_question.
+    final_answer: Optional[str] = None