Merge remote-tracking branch 'upstream/dev' into feat/ui

Author: Anish Sarkar
Date:   2026-01-25 16:19:18 +05:30
Commit: c7c9eb3eb2
13 changed files with 825 additions and 97 deletions

View file

@@ -29,8 +29,7 @@ SurfSense is a highly customizable AI research agent, connected to external sour
# Video
-https://github.com/user-attachments/assets/42a29ea1-d4d8-4213-9c69-972b5b806d58
+https://github.com/user-attachments/assets/cc0c84d3-1f2f-4f7a-b519-2ecce22310b1
## Podcast Sample
@@ -52,8 +51,10 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7
- Interact in Natural Language and get cited answers.
### 📄 **Cited Answers**
- Get Cited answers just like Perplexity.
+### 🧩 **Universal Compatibility**
+- Connect virtually any inference provider via the OpenAI spec and LiteLLM.
### 🔔 **Privacy & Local LLM Support**
-- Works Flawlessly with Ollama local LLMs.
+- Works Flawlessly with local LLMs like vLLM and Ollama.
### 🏠 **Self Hostable**
- Open source and easy to deploy locally.
### 👥 **Team Collaboration with RBAC**
@@ -61,6 +62,7 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7
- Invite team members with customizable roles (Owner, Admin, Editor, Viewer)
- Granular permissions for documents, chats, connectors, and settings
- Share knowledge bases securely within your organization
+- Team chats update in real-time and "Chat about the chat" in comment threads
### 🎙️ Podcasts
- Blazingly fast podcast generation agent. (Creates a 3-minute podcast in under 20 seconds.)
- Convert your chat conversations into engaging audio content
@@ -237,6 +239,8 @@ Before self-hosting installation, make sure to complete the [prerequisite setup
### **BackEnd**
+- **LiteLLM**: Universal LLM integration supporting 100+ models (OpenAI, Anthropic, Ollama, etc.)
- **FastAPI**: Modern, fast web framework for building APIs with Python
- **PostgreSQL with pgvector**: Database with vector search capabilities for similarity searches
@@ -253,8 +257,6 @@ Before self-hosting installation, make sure to complete the [prerequisite setup
- **LangChain**: Framework for developing AI-powered applications.
-- **LiteLLM**: Universal LLM integration supporting 100+ models (OpenAI, Anthropic, Ollama, etc.)
- **Rerankers**: Advanced result ranking for improved search relevance
- **Hybrid Search**: Combines vector similarity and full-text search for optimal results using Reciprocal Rank Fusion (RRF)
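
For reference, Reciprocal Rank Fusion merges ranked lists by scoring each document as the sum of 1/(k + rank) across every list it appears in. A minimal sketch of the idea (illustrative only; k = 60 is the conventional constant, not necessarily what SurfSense uses):

```python
def reciprocal_rank_fusion(ranked_lists: list[list[str]], k: int = 60) -> list[str]:
    """Fuse ranked result lists: score(d) = sum over lists of 1 / (k + rank(d))."""
    scores: dict[str, float] = {}
    for results in ranked_lists:
        for rank, doc_id in enumerate(results, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    return sorted(scores, key=lambda d: scores[d], reverse=True)

# Fuse vector-similarity hits with full-text hits:
print(reciprocal_rank_fusion([["d2", "d1", "d3"], ["d1", "d3"]]))  # ['d1', 'd3', 'd2']
```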

View file

@@ -7,6 +7,7 @@ via NewLLMConfig.
"""
from collections.abc import Sequence
+from typing import Any

from deepagents import create_deep_agent
from langchain_core.tools import BaseTool
@@ -23,6 +24,90 @@ from app.agents.new_chat.system_prompt import (
from app.agents.new_chat.tools.registry import build_tools_async
from app.services.connector_service import ConnectorService
+# =============================================================================
+# Connector Type Mapping
+# =============================================================================
+# Maps SearchSourceConnectorType enum values to the searchable document/connector types
+# used by the knowledge_base tool. Some connectors map to different document types.
+_CONNECTOR_TYPE_TO_SEARCHABLE: dict[str, str] = {
+    # Direct mappings (connector type == searchable type)
+    "TAVILY_API": "TAVILY_API",
+    "SEARXNG_API": "SEARXNG_API",
+    "LINKUP_API": "LINKUP_API",
+    "BAIDU_SEARCH_API": "BAIDU_SEARCH_API",
+    "SLACK_CONNECTOR": "SLACK_CONNECTOR",
+    "TEAMS_CONNECTOR": "TEAMS_CONNECTOR",
+    "NOTION_CONNECTOR": "NOTION_CONNECTOR",
+    "GITHUB_CONNECTOR": "GITHUB_CONNECTOR",
+    "LINEAR_CONNECTOR": "LINEAR_CONNECTOR",
+    "DISCORD_CONNECTOR": "DISCORD_CONNECTOR",
+    "JIRA_CONNECTOR": "JIRA_CONNECTOR",
+    "CONFLUENCE_CONNECTOR": "CONFLUENCE_CONNECTOR",
+    "CLICKUP_CONNECTOR": "CLICKUP_CONNECTOR",
+    "GOOGLE_CALENDAR_CONNECTOR": "GOOGLE_CALENDAR_CONNECTOR",
+    "GOOGLE_GMAIL_CONNECTOR": "GOOGLE_GMAIL_CONNECTOR",
+    "GOOGLE_DRIVE_CONNECTOR": "GOOGLE_DRIVE_FILE",  # Connector type differs from document type
+    "AIRTABLE_CONNECTOR": "AIRTABLE_CONNECTOR",
+    "LUMA_CONNECTOR": "LUMA_CONNECTOR",
+    "ELASTICSEARCH_CONNECTOR": "ELASTICSEARCH_CONNECTOR",
+    "WEBCRAWLER_CONNECTOR": "CRAWLED_URL",  # Maps to document type
+    "BOOKSTACK_CONNECTOR": "BOOKSTACK_CONNECTOR",
+    "CIRCLEBACK_CONNECTOR": "CIRCLEBACK",  # Connector type differs from document type
+    "OBSIDIAN_CONNECTOR": "OBSIDIAN_CONNECTOR",
+    # Composio connectors
+    "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
+    "COMPOSIO_GMAIL_CONNECTOR": "COMPOSIO_GMAIL_CONNECTOR",
+    "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
+}
+
+# Document types that don't come from SearchSourceConnector but should always be searchable
+_ALWAYS_AVAILABLE_DOC_TYPES: list[str] = [
+    "EXTENSION",  # Browser extension data
+    "FILE",  # Uploaded files
+    "NOTE",  # User notes
+    "YOUTUBE_VIDEO",  # YouTube videos
+]
+def _map_connectors_to_searchable_types(
+    connector_types: list[Any],
+) -> list[str]:
+    """
+    Map SearchSourceConnectorType enums to searchable document/connector types.
+
+    This function:
+    1. Converts connector type enums to their searchable counterparts
+    2. Includes always-available document types (EXTENSION, FILE, NOTE, YOUTUBE_VIDEO)
+    3. Deduplicates while preserving order
+
+    Args:
+        connector_types: List of SearchSourceConnectorType enum values
+
+    Returns:
+        List of searchable connector/document type strings
+    """
+    result_set: set[str] = set()
+    result_list: list[str] = []
+
+    # Add always-available document types first
+    for doc_type in _ALWAYS_AVAILABLE_DOC_TYPES:
+        if doc_type not in result_set:
+            result_set.add(doc_type)
+            result_list.append(doc_type)
+
+    # Map each connector type to its searchable equivalent
+    for ct in connector_types:
+        # Handle both enum and string types
+        ct_str = ct.value if hasattr(ct, "value") else str(ct)
+        searchable = _CONNECTOR_TYPE_TO_SEARCHABLE.get(ct_str)
+        if searchable and searchable not in result_set:
+            result_set.add(searchable)
+            result_list.append(searchable)
+
+    return result_list
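
To illustrate the mapping above (plain strings stand in for SearchSourceConnectorType enum members, which the helper also accepts via its `hasattr(ct, "value")` check):

```python
types = _map_connectors_to_searchable_types(
    ["WEBCRAWLER_CONNECTOR", "GOOGLE_DRIVE_CONNECTOR", "SLACK_CONNECTOR"]
)
# Always-available doc types come first, then the mapped connector types:
# ['EXTENSION', 'FILE', 'NOTE', 'YOUTUBE_VIDEO',
#  'CRAWLED_URL', 'GOOGLE_DRIVE_FILE', 'SLACK_CONNECTOR']
```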
# =============================================================================
# Deep Agent Factory
# =============================================================================
@@ -116,6 +201,30 @@ async def create_surfsense_deep_agent(
            additional_tools=[my_custom_tool]
        )
    """
+    # Discover available connectors and document types for this search space
+    # This enables dynamic tool docstrings that inform the LLM about what's actually available
+    available_connectors: list[str] | None = None
+    available_document_types: list[str] | None = None
+    try:
+        # Get enabled search source connectors for this search space
+        connector_types = await connector_service.get_available_connectors(
+            search_space_id
+        )
+        if connector_types:
+            # Convert enum values to strings and also include mapped document types
+            available_connectors = _map_connectors_to_searchable_types(connector_types)
+
+        # Get document types that have at least one document indexed
+        available_document_types = await connector_service.get_available_document_types(
+            search_space_id
+        )
+    except Exception as e:
+        # Log but don't fail - fall back to all connectors if discovery fails
+        import logging
+
+        logging.warning(f"Failed to discover available connectors/document types: {e}")
    # Build dependencies dict for the tools registry
    dependencies = {
        "search_space_id": search_space_id,
@@ -123,6 +232,9 @@ async def create_surfsense_deep_agent(
        "connector_service": connector_service,
        "firecrawl_api_key": firecrawl_api_key,
        "user_id": user_id,  # Required for memory tools
+        # Dynamic connector/document type discovery for knowledge base tool
+        "available_connectors": available_connectors,
+        "available_document_types": available_document_types,
    }
    # Build tools using the async registry (includes MCP tools)

View file

@@ -19,6 +19,7 @@ Available tools:
# Tool factory exports (for direct use)
from .display_image import create_display_image_tool
from .knowledge_base import (
+    CONNECTOR_DESCRIPTIONS,
    create_search_knowledge_base_tool,
    format_documents_for_context,
    search_knowledge_base_async,
@@ -40,6 +41,8 @@ from .user_memory import create_recall_memory_tool, create_save_memory_tool
__all__ = [
    # Registry
    "BUILTIN_TOOLS",
+    # Knowledge base utilities
+    "CONNECTOR_DESCRIPTIONS",
    "ToolDefinition",
    "build_tools",
    # Tool factories
@@ -51,7 +54,6 @@ __all__ = [
    "create_scrape_webpage_tool",
    "create_search_knowledge_base_tool",
    "create_search_surfsense_docs_tool",
-    # Knowledge base utilities
    "format_documents_for_context",
    "get_all_tool_names",
    "get_default_enabled_tools",

View file

@@ -12,7 +12,8 @@ import json
from datetime import datetime
from typing import Any

-from langchain_core.tools import tool
+from langchain_core.tools import StructuredTool
+from pydantic import BaseModel, Field
from sqlalchemy.ext.asyncio import AsyncSession

from app.services.connector_service import ConnectorService
@@ -22,6 +23,7 @@ from app.services.connector_service import ConnectorService
# =============================================================================
# Canonical connector values used internally by ConnectorService
+# Includes all document types and search source connectors
_ALL_CONNECTORS: list[str] = [
    "EXTENSION",
    "FILE",
@@ -50,41 +52,117 @@ _ALL_CONNECTORS: list[str] = [
    "CRAWLED_URL",
    "CIRCLEBACK",
    "OBSIDIAN_CONNECTOR",
+    # Composio connectors
+    "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
+    "COMPOSIO_GMAIL_CONNECTOR",
+    "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
]
+# Human-readable descriptions for each connector type
+# Used for generating dynamic docstrings and informing the LLM
+CONNECTOR_DESCRIPTIONS: dict[str, str] = {
+    "EXTENSION": "Web content saved via SurfSense browser extension (personal browsing history)",
+    "FILE": "User-uploaded documents (PDFs, Word, etc.) (personal files)",
+    "NOTE": "SurfSense Notes (notes created inside SurfSense)",
+    "SLACK_CONNECTOR": "Slack conversations and shared content (personal workspace communications)",
+    "TEAMS_CONNECTOR": "Microsoft Teams messages and conversations (personal Teams communications)",
+    "NOTION_CONNECTOR": "Notion workspace pages and databases (personal knowledge management)",
+    "YOUTUBE_VIDEO": "YouTube video transcripts and metadata (personally saved videos)",
+    "GITHUB_CONNECTOR": "GitHub repository content and issues (personal repositories and interactions)",
+    "ELASTICSEARCH_CONNECTOR": "Elasticsearch indexed documents and data (personal Elasticsearch instances)",
+    "LINEAR_CONNECTOR": "Linear project issues and discussions (personal project management)",
+    "JIRA_CONNECTOR": "Jira project issues, tickets, and comments (personal project tracking)",
+    "CONFLUENCE_CONNECTOR": "Confluence pages and comments (personal project documentation)",
+    "CLICKUP_CONNECTOR": "ClickUp tasks and project data (personal task management)",
+    "GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events, meetings, and schedules (personal calendar)",
+    "GOOGLE_GMAIL_CONNECTOR": "Google Gmail emails and conversations (personal emails)",
+    "GOOGLE_DRIVE_FILE": "Google Drive files and documents (personal cloud storage)",
+    "DISCORD_CONNECTOR": "Discord server conversations and shared content (personal community)",
+    "AIRTABLE_CONNECTOR": "Airtable records, tables, and database content (personal data)",
+    "TAVILY_API": "Tavily web search API results (real-time web search)",
+    "SEARXNG_API": "SearxNG search API results (privacy-focused web search)",
+    "LINKUP_API": "Linkup search API results (web search)",
+    "BAIDU_SEARCH_API": "Baidu search API results (Chinese web search)",
+    "LUMA_CONNECTOR": "Luma events and meetings",
+    "WEBCRAWLER_CONNECTOR": "Webpages indexed by SurfSense (personally selected websites)",
+    "CRAWLED_URL": "Webpages indexed by SurfSense (personally selected websites)",
+    "BOOKSTACK_CONNECTOR": "BookStack pages (personal documentation)",
+    "CIRCLEBACK": "Circleback meeting notes, transcripts, and action items",
+    "OBSIDIAN_CONNECTOR": "Obsidian vault notes and markdown files (personal notes)",
+    # Composio connectors
+    "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "Google Drive files via Composio (personal cloud storage)",
+    "COMPOSIO_GMAIL_CONNECTOR": "Gmail emails via Composio (personal emails)",
+    "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events via Composio (personal calendar)",
+}
-def _normalize_connectors(connectors_to_search: list[str] | None) -> list[str]:
+def _normalize_connectors(
+    connectors_to_search: list[str] | None,
+    available_connectors: list[str] | None = None,
+) -> list[str]:
    """
    Normalize connectors provided by the model.
    - Accepts user-facing enums like WEBCRAWLER_CONNECTOR and maps them to canonical
      ConnectorService types.
    - Drops unknown values.
-    - If None/empty, defaults to searching across all known connectors.
+    - If available_connectors is provided, only includes connectors from that list.
+    - If connectors_to_search is None/empty, defaults to available_connectors or all.
+
+    Args:
+        connectors_to_search: List of connectors requested by the model
+        available_connectors: List of connectors actually available in the search space
+
+    Returns:
+        List of normalized connector strings to search
    """
+    # Determine the set of valid connectors to consider
+    valid_set = (
+        set(available_connectors) if available_connectors else set(_ALL_CONNECTORS)
+    )
+
    if not connectors_to_search:
-        return list(_ALL_CONNECTORS)
+        # Search all available connectors if none specified
+        return (
+            list(available_connectors)
+            if available_connectors
+            else list(_ALL_CONNECTORS)
+        )

    normalized: list[str] = []
    for raw in connectors_to_search:
        c = (raw or "").strip().upper()
        if not c:
            continue
+        # Map user-facing aliases to canonical names
        if c == "WEBCRAWLER_CONNECTOR":
            c = "CRAWLED_URL"
        normalized.append(c)

-    # de-dupe while preserving order + filter unknown
+    # de-dupe while preserving order + filter to valid connectors
    seen: set[str] = set()
    out: list[str] = []
    for c in normalized:
        if c in seen:
            continue
+        # Only include if it's a known connector AND available
        if c not in _ALL_CONNECTORS:
            continue
+        if c not in valid_set:
+            continue
        seen.add(c)
        out.append(c)

-    return out if out else list(_ALL_CONNECTORS)
+    # Fallback to all available if nothing matched
+    return (
+        out
+        if out
+        else (
+            list(available_connectors)
+            if available_connectors
+            else list(_ALL_CONNECTORS)
+        )
+    )
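
The behavior the new code implies, sketched with hypothetical inputs:

```python
# Model asked for web pages plus an unknown value; only NOTE and CRAWLED_URL exist here
print(_normalize_connectors(
    ["webcrawler_connector", "FOO", "NOTE"],
    available_connectors=["NOTE", "CRAWLED_URL"],
))
# -> ['CRAWLED_URL', 'NOTE']  (alias mapped, unknown dropped, availability enforced)
```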
# =============================================================================
@@ -233,6 +311,7 @@ async def search_knowledge_base_async(
    top_k: int = 10,
    start_date: datetime | None = None,
    end_date: datetime | None = None,
+    available_connectors: list[str] | None = None,
) -> str:
    """
    Search the user's knowledge base for relevant documents.
@@ -248,6 +327,8 @@ async def search_knowledge_base_async(
        top_k: Number of results per connector
        start_date: Optional start datetime (UTC) for filtering documents
        end_date: Optional end datetime (UTC) for filtering documents
+        available_connectors: Optional list of connectors actually available in the search space.
+            If provided, only these connectors will be searched.

    Returns:
        Formatted string with search results
@@ -262,7 +343,7 @@ async def search_knowledge_base_async(
        end_date=end_date,
    )

-    connectors = _normalize_connectors(connectors_to_search)
+    connectors = _normalize_connectors(connectors_to_search, available_connectors)

    for connector in connectors:
        try:
@@ -316,6 +397,16 @@ async def search_knowledge_base_async(
                )
                all_documents.extend(chunks)

+            elif connector == "TEAMS_CONNECTOR":
+                _, chunks = await connector_service.search_teams(
+                    user_query=query,
+                    search_space_id=search_space_id,
+                    top_k=top_k,
+                    start_date=resolved_start_date,
+                    end_date=resolved_end_date,
+                )
+                all_documents.extend(chunks)
+
            elif connector == "NOTION_CONNECTOR":
                _, chunks = await connector_service.search_notion(
                    user_query=query,
@@ -519,6 +610,39 @@ async def search_knowledge_base_async(
                )
                all_documents.extend(chunks)

+            # =========================================================
+            # Composio Connectors
+            # =========================================================
+            elif connector == "COMPOSIO_GOOGLE_DRIVE_CONNECTOR":
+                _, chunks = await connector_service.search_composio_google_drive(
+                    user_query=query,
+                    search_space_id=search_space_id,
+                    top_k=top_k,
+                    start_date=resolved_start_date,
+                    end_date=resolved_end_date,
+                )
+                all_documents.extend(chunks)
+
+            elif connector == "COMPOSIO_GMAIL_CONNECTOR":
+                _, chunks = await connector_service.search_composio_gmail(
+                    user_query=query,
+                    search_space_id=search_space_id,
+                    top_k=top_k,
+                    start_date=resolved_start_date,
+                    end_date=resolved_end_date,
+                )
+                all_documents.extend(chunks)
+
+            elif connector == "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR":
+                _, chunks = await connector_service.search_composio_google_calendar(
+                    user_query=query,
+                    search_space_id=search_space_id,
+                    top_k=top_k,
+                    start_date=resolved_start_date,
+                    end_date=resolved_end_date,
+                )
+                all_documents.extend(chunks)
+
        except Exception as e:
            print(f"Error searching connector {connector}: {e}")
            continue
@@ -543,11 +667,68 @@ async def search_knowledge_base_async(
    return format_documents_for_context(deduplicated)
+def _build_connector_docstring(available_connectors: list[str] | None) -> str:
+    """
+    Build the connector documentation section for the tool docstring.
+
+    Args:
+        available_connectors: List of available connector types, or None for all
+
+    Returns:
+        Formatted docstring section listing available connectors
+    """
+    connectors = available_connectors if available_connectors else list(_ALL_CONNECTORS)
+    lines = []
+    for connector in connectors:
+        # Skip internal names, prefer user-facing aliases
+        if connector == "CRAWLED_URL":
+            # Show as WEBCRAWLER_CONNECTOR for user-facing docs
+            description = CONNECTOR_DESCRIPTIONS.get(connector, connector)
+            lines.append(f"- WEBCRAWLER_CONNECTOR: {description}")
+        else:
+            description = CONNECTOR_DESCRIPTIONS.get(connector, connector)
+            lines.append(f"- {connector}: {description}")
+    return "\n".join(lines)
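
For example, with only notes and crawled pages available, the generated section would read (per the CONNECTOR_DESCRIPTIONS table above):

```python
print(_build_connector_docstring(["NOTE", "CRAWLED_URL"]))
# - NOTE: SurfSense Notes (notes created inside SurfSense)
# - WEBCRAWLER_CONNECTOR: Webpages indexed by SurfSense (personally selected websites)
```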
+# =============================================================================
+# Tool Input Schema
+# =============================================================================
+
+class SearchKnowledgeBaseInput(BaseModel):
+    """Input schema for the search_knowledge_base tool."""
+
+    query: str = Field(
+        description="The search query - be specific and include key terms"
+    )
+    top_k: int = Field(
+        default=10,
+        description="Number of results to retrieve (default: 10)",
+    )
+    start_date: str | None = Field(
+        default=None,
+        description="Optional ISO date/datetime (e.g. '2025-12-12' or '2025-12-12T00:00:00+00:00')",
+    )
+    end_date: str | None = Field(
+        default=None,
+        description="Optional ISO date/datetime (e.g. '2025-12-19' or '2025-12-19T23:59:59+00:00')",
+    )
+    connectors_to_search: list[str] | None = Field(
+        default=None,
+        description="Optional list of connector enums to search. If omitted, searches all available.",
+    )
def create_search_knowledge_base_tool(
    search_space_id: int,
    db_session: AsyncSession,
    connector_service: ConnectorService,
-):
+    available_connectors: list[str] | None = None,
+    available_document_types: list[str] | None = None,
+) -> StructuredTool:
    """
    Factory function to create the search_knowledge_base tool with injected dependencies.
@@ -555,72 +736,57 @@ def create_search_knowledge_base_tool(
        search_space_id: The user's search space ID
        db_session: Database session
        connector_service: Initialized connector service
+        available_connectors: Optional list of connector types available in the search space.
+            Used to dynamically generate the tool docstring.
+        available_document_types: Optional list of document types that have data in the search space.
+            Used to inform the LLM about what data exists.

    Returns:
-        A configured tool function
+        A configured StructuredTool instance
    """
+    # Build connector documentation dynamically
+    connector_docs = _build_connector_docstring(available_connectors)

-    @tool
-    async def search_knowledge_base(
+    # Build context about available document types
+    doc_types_info = ""
+    if available_document_types:
+        doc_types_info = f"""
+## Document types with indexed content in this search space
+The following document types have content available for search:
+{", ".join(available_document_types)}
+Focus searches on these types for best results."""
+
+    # Build the dynamic description for the tool
+    # This is what the LLM sees when deciding whether/how to use the tool
+    dynamic_description = f"""Search the user's personal knowledge base for relevant information.
+Use this tool to find documents, notes, files, web pages, and other content that may help answer the user's question.
+IMPORTANT:
+- If the user requests a specific source type (e.g. "my notes", "Slack messages"), pass `connectors_to_search=[...]` using the enums below.
+- If `connectors_to_search` is omitted/empty, the system will search broadly.
+- Only connectors that are enabled/configured for this search space are available.{doc_types_info}
+## Available connector enums for `connectors_to_search`
+{connector_docs}
+NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`."""
+
+    # Capture for closure
+    _available_connectors = available_connectors
+
+    async def _search_knowledge_base_impl(
        query: str,
        top_k: int = 10,
        start_date: str | None = None,
        end_date: str | None = None,
        connectors_to_search: list[str] | None = None,
    ) -> str:
-        """
-        Search the user's personal knowledge base for relevant information.
-        Use this tool to find documents, notes, files, web pages, and other content
-        that may help answer the user's question.
-        IMPORTANT:
-        - If the user requests a specific source type (e.g. "my notes", "Slack messages"),
-          pass `connectors_to_search=[...]` using the enums below.
-        - If `connectors_to_search` is omitted/empty, the system will search broadly.
-        ## Available connector enums for `connectors_to_search`
-        - EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
-        - FILE: "User-uploaded documents (PDFs, Word, etc.)" (personal files)
-        - NOTE: "SurfSense Notes" (notes created inside SurfSense)
-        - SLACK_CONNECTOR: "Slack conversations and shared content" (personal workspace communications)
-        - TEAMS_CONNECTOR: "Microsoft Teams messages and conversations" (personal Teams communications)
-        - NOTION_CONNECTOR: "Notion workspace pages and databases" (personal knowledge management)
-        - YOUTUBE_VIDEO: "YouTube video transcripts and metadata" (personally saved videos)
-        - GITHUB_CONNECTOR: "GitHub repository content and issues" (personal repositories and interactions)
-        - ELASTICSEARCH_CONNECTOR: "Elasticsearch indexed documents and data" (personal Elasticsearch instances and custom data sources)
-        - LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project management)
-        - JIRA_CONNECTOR: "Jira project issues, tickets, and comments" (personal project tracking)
-        - CONFLUENCE_CONNECTOR: "Confluence pages and comments" (personal project documentation)
-        - CLICKUP_CONNECTOR: "ClickUp tasks and project data" (personal task management)
-        - GOOGLE_CALENDAR_CONNECTOR: "Google Calendar events, meetings, and schedules" (personal calendar and time management)
-        - GOOGLE_GMAIL_CONNECTOR: "Google Gmail emails and conversations" (personal emails and communications)
-        - GOOGLE_DRIVE_FILE: "Google Drive files and documents" (personal cloud storage and file management)
-        - DISCORD_CONNECTOR: "Discord server conversations and shared content" (personal community communications)
-        - AIRTABLE_CONNECTOR: "Airtable records, tables, and database content" (personal data management and organization)
-        - TAVILY_API: "Tavily search API results" (personalized search results)
-        - SEARXNG_API: "SearxNG search API results" (personalized search results)
-        - LINKUP_API: "Linkup search API results" (personalized search results)
-        - BAIDU_SEARCH_API: "Baidu search API results" (personalized search results)
-        - LUMA_CONNECTOR: "Luma events"
-        - WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense" (personally selected websites)
-        - BOOKSTACK_CONNECTOR: "BookStack pages" (personal documentation)
-        - CIRCLEBACK: "Circleback meeting notes, transcripts, and action items" (personal meeting records)
-        - OBSIDIAN_CONNECTOR: "Obsidian vault notes and markdown files" (personal notes and knowledge management)
-        NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`.
-        Args:
-            query: The search query - be specific and include key terms
-            top_k: Number of results to retrieve (default: 10)
-            start_date: Optional ISO date/datetime (e.g. "2025-12-12" or "2025-12-12T00:00:00+00:00")
-            end_date: Optional ISO date/datetime (e.g. "2025-12-19" or "2025-12-19T23:59:59+00:00")
-            connectors_to_search: Optional list of connector enums to search. If omitted, searches all.
-        Returns:
-            Formatted string with relevant documents and their content
-        """
+        """Implementation function for knowledge base search."""
        from app.agents.new_chat.utils import parse_date_or_datetime

        parsed_start: datetime | None = None
@@ -640,6 +806,16 @@ def create_search_knowledge_base_tool(
            top_k=top_k,
            start_date=parsed_start,
            end_date=parsed_end,
+            available_connectors=_available_connectors,
        )

-    return search_knowledge_base
+    # Create StructuredTool with dynamic description
+    # This properly sets the description that the LLM sees
+    tool = StructuredTool(
+        name="search_knowledge_base",
+        description=dynamic_description,
+        coroutine=_search_knowledge_base_impl,
+        args_schema=SearchKnowledgeBaseInput,
+    )
+    return tool
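
A sketch of how the factory and the resulting StructuredTool would be used (the session and connector service are assumed to exist already; all values are illustrative):

```python
# Hypothetical wiring; db_session / connector_service construction elided
kb_tool = create_search_knowledge_base_tool(
    search_space_id=1,
    db_session=session,
    connector_service=service,
    available_connectors=["NOTE", "SLACK_CONNECTOR", "CRAWLED_URL"],
    available_document_types=["NOTE", "CRAWLED_URL"],
)
print(kb_tool.description)  # lists only the three available connectors

# inside an async context:
result = await kb_tool.ainvoke(
    {"query": "Q3 roadmap decisions", "connectors_to_search": ["SLACK_CONNECTOR"]}
)
```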

View file

@@ -85,6 +85,7 @@ class ToolDefinition:
# Contributors: Add your new tools here!
BUILTIN_TOOLS: list[ToolDefinition] = [
    # Core tool - searches the user's knowledge base
+    # Now supports dynamic connector/document type discovery
    ToolDefinition(
        name="search_knowledge_base",
        description="Search the user's personal knowledge base for relevant information",
@@ -92,8 +93,12 @@ BUILTIN_TOOLS: list[ToolDefinition] = [
            search_space_id=deps["search_space_id"],
            db_session=deps["db_session"],
            connector_service=deps["connector_service"],
+            # Optional: dynamically discovered connectors/document types
+            available_connectors=deps.get("available_connectors"),
+            available_document_types=deps.get("available_document_types"),
        ),
        requires=["search_space_id", "db_session", "connector_service"],
+        # Note: available_connectors and available_document_types are optional
    ),
    # Podcast generation tool
    ToolDefinition(

View file

@@ -2871,3 +2871,350 @@ class ConnectorService:
        }
        return result_object, obsidian_docs
+    # =========================================================================
+    # Composio Connector Search Methods
+    # =========================================================================
+
+    async def search_composio_google_drive(
+        self,
+        user_query: str,
+        search_space_id: int,
+        top_k: int = 20,
+        start_date: datetime | None = None,
+        end_date: datetime | None = None,
+    ) -> tuple:
+        """
+        Search for Composio Google Drive files and return both the source information
+        and langchain documents.
+
+        Uses combined chunk-level and document-level hybrid search with RRF fusion.
+
+        Args:
+            user_query: The user's query
+            search_space_id: The search space ID to search in
+            top_k: Maximum number of results to return
+            start_date: Optional start date for filtering documents by updated_at
+            end_date: Optional end date for filtering documents by updated_at
+
+        Returns:
+            tuple: (sources_info, langchain_documents)
+        """
+        composio_drive_docs = await self._combined_rrf_search(
+            query_text=user_query,
+            search_space_id=search_space_id,
+            document_type="COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
+            top_k=top_k,
+            start_date=start_date,
+            end_date=end_date,
+        )
+
+        # Early return if no results
+        if not composio_drive_docs:
+            return {
+                "id": 54,
+                "name": "Google Drive (Composio)",
+                "type": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
+                "sources": [],
+            }, []
+
+        def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
+            return (
+                doc_info.get("title")
+                or metadata.get("title")
+                or metadata.get("file_name")
+                or "Untitled Document"
+            )
+
+        def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
+            return metadata.get("url") or metadata.get("web_view_link") or ""
+
+        def _description_fn(
+            chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
+        ) -> str:
+            description = self._chunk_preview(chunk.get("content", ""), limit=200)
+            info_parts = []
+            mime_type = metadata.get("mime_type")
+            modified_time = metadata.get("modified_time")
+            if mime_type:
+                info_parts.append(f"Type: {mime_type}")
+            if modified_time:
+                info_parts.append(f"Modified: {modified_time}")
+            if info_parts:
+                description = (description + " | " + " | ".join(info_parts)).strip(" |")
+            return description
+
+        def _extra_fields_fn(
+            _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
+        ) -> dict[str, Any]:
+            return {
+                "mime_type": metadata.get("mime_type", ""),
+                "file_id": metadata.get("file_id", ""),
+                "modified_time": metadata.get("modified_time", ""),
+            }
+
+        sources_list = self._build_chunk_sources_from_documents(
+            composio_drive_docs,
+            title_fn=_title_fn,
+            url_fn=_url_fn,
+            description_fn=_description_fn,
+            extra_fields_fn=_extra_fields_fn,
+        )
+
+        # Create result object
+        result_object = {
+            "id": 54,
+            "name": "Google Drive (Composio)",
+            "type": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
+            "sources": sources_list,
+        }
+
+        return result_object, composio_drive_docs
+    async def search_composio_gmail(
+        self,
+        user_query: str,
+        search_space_id: int,
+        top_k: int = 20,
+        start_date: datetime | None = None,
+        end_date: datetime | None = None,
+    ) -> tuple:
+        """
+        Search for Composio Gmail messages and return both the source information
+        and langchain documents.
+
+        Uses combined chunk-level and document-level hybrid search with RRF fusion.
+
+        Args:
+            user_query: The user's query
+            search_space_id: The search space ID to search in
+            top_k: Maximum number of results to return
+            start_date: Optional start date for filtering documents by updated_at
+            end_date: Optional end date for filtering documents by updated_at
+
+        Returns:
+            tuple: (sources_info, langchain_documents)
+        """
+        composio_gmail_docs = await self._combined_rrf_search(
+            query_text=user_query,
+            search_space_id=search_space_id,
+            document_type="COMPOSIO_GMAIL_CONNECTOR",
+            top_k=top_k,
+            start_date=start_date,
+            end_date=end_date,
+        )
+
+        # Early return if no results
+        if not composio_gmail_docs:
+            return {
+                "id": 55,
+                "name": "Gmail (Composio)",
+                "type": "COMPOSIO_GMAIL_CONNECTOR",
+                "sources": [],
+            }, []
+
+        def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
+            return (
+                doc_info.get("title")
+                or metadata.get("subject")
+                or metadata.get("title")
+                or "Untitled Email"
+            )
+
+        def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
+            return metadata.get("url") or ""
+
+        def _description_fn(
+            chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
+        ) -> str:
+            description = self._chunk_preview(chunk.get("content", ""), limit=200)
+            info_parts = []
+            sender = metadata.get("from") or metadata.get("sender")
+            date = metadata.get("date") or metadata.get("received_at")
+            if sender:
+                info_parts.append(f"From: {sender}")
+            if date:
+                info_parts.append(f"Date: {date}")
+            if info_parts:
+                description = (description + " | " + " | ".join(info_parts)).strip(" |")
+            return description
+
+        def _extra_fields_fn(
+            _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
+        ) -> dict[str, Any]:
+            return {
+                "message_id": metadata.get("message_id", ""),
+                "thread_id": metadata.get("thread_id", ""),
+                "from": metadata.get("from", ""),
+                "to": metadata.get("to", ""),
+                "date": metadata.get("date", ""),
+            }
+
+        sources_list = self._build_chunk_sources_from_documents(
+            composio_gmail_docs,
+            title_fn=_title_fn,
+            url_fn=_url_fn,
+            description_fn=_description_fn,
+            extra_fields_fn=_extra_fields_fn,
+        )
+
+        # Create result object
+        result_object = {
+            "id": 55,
+            "name": "Gmail (Composio)",
+            "type": "COMPOSIO_GMAIL_CONNECTOR",
+            "sources": sources_list,
+        }
+
+        return result_object, composio_gmail_docs
+    async def search_composio_google_calendar(
+        self,
+        user_query: str,
+        search_space_id: int,
+        top_k: int = 20,
+        start_date: datetime | None = None,
+        end_date: datetime | None = None,
+    ) -> tuple:
+        """
+        Search for Composio Google Calendar events and return both the source information
+        and langchain documents.
+
+        Uses combined chunk-level and document-level hybrid search with RRF fusion.
+
+        Args:
+            user_query: The user's query
+            search_space_id: The search space ID to search in
+            top_k: Maximum number of results to return
+            start_date: Optional start date for filtering documents by updated_at
+            end_date: Optional end date for filtering documents by updated_at
+
+        Returns:
+            tuple: (sources_info, langchain_documents)
+        """
+        composio_calendar_docs = await self._combined_rrf_search(
+            query_text=user_query,
+            search_space_id=search_space_id,
+            document_type="COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
+            top_k=top_k,
+            start_date=start_date,
+            end_date=end_date,
+        )
+
+        # Early return if no results
+        if not composio_calendar_docs:
+            return {
+                "id": 56,
+                "name": "Google Calendar (Composio)",
+                "type": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
+                "sources": [],
+            }, []
+
+        def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
+            return (
+                doc_info.get("title")
+                or metadata.get("summary")
+                or metadata.get("title")
+                or "Untitled Event"
+            )
+
+        def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
+            return metadata.get("url") or metadata.get("html_link") or ""
+
+        def _description_fn(
+            chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
+        ) -> str:
+            description = self._chunk_preview(chunk.get("content", ""), limit=200)
+            info_parts = []
+            start_time = metadata.get("start_time") or metadata.get("start")
+            end_time = metadata.get("end_time") or metadata.get("end")
+            if start_time:
+                info_parts.append(f"Start: {start_time}")
+            if end_time:
+                info_parts.append(f"End: {end_time}")
+            if info_parts:
+                description = (description + " | " + " | ".join(info_parts)).strip(" |")
+            return description
+
+        def _extra_fields_fn(
+            _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
+        ) -> dict[str, Any]:
+            return {
+                "event_id": metadata.get("event_id", ""),
+                "calendar_id": metadata.get("calendar_id", ""),
+                "start_time": metadata.get("start_time", ""),
+                "end_time": metadata.get("end_time", ""),
+                "location": metadata.get("location", ""),
+            }
+
+        sources_list = self._build_chunk_sources_from_documents(
+            composio_calendar_docs,
+            title_fn=_title_fn,
+            url_fn=_url_fn,
+            description_fn=_description_fn,
+            extra_fields_fn=_extra_fields_fn,
+        )
+
+        # Create result object
+        result_object = {
+            "id": 56,
+            "name": "Google Calendar (Composio)",
+            "type": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
+            "sources": sources_list,
+        }
+
+        return result_object, composio_calendar_docs
+    # =========================================================================
+    # Utility Methods for Connector Discovery
+    # =========================================================================
+
+    async def get_available_connectors(
+        self,
+        search_space_id: int,
+    ) -> list[SearchSourceConnectorType]:
+        """
+        Get all available (enabled) connector types for a search space.
+
+        Args:
+            search_space_id: The search space ID
+
+        Returns:
+            List of SearchSourceConnectorType enums for enabled connectors
+        """
+        query = (
+            select(SearchSourceConnector.connector_type)
+            .filter(
+                SearchSourceConnector.search_space_id == search_space_id,
+            )
+            .distinct()
+        )
+        result = await self.session.execute(query)
+        connector_types = result.scalars().all()
+        return list(connector_types)
+
+    async def get_available_document_types(
+        self,
+        search_space_id: int,
+    ) -> list[str]:
+        """
+        Get all document types that have at least one document in the search space.
+
+        Args:
+            search_space_id: The search space ID
+
+        Returns:
+            List of document type strings that have documents indexed
+        """
+        from sqlalchemy import distinct
+
+        from app.db import Document
+
+        query = select(distinct(Document.document_type)).filter(
+            Document.search_space_id == search_space_id,
+        )
+        result = await self.session.execute(query)
+        doc_types = result.scalars().all()
+        return [str(dt) for dt in doc_types]
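
Putting the discovery pieces together, the flow looks roughly like this (a sketch; the ConnectorService constructor arguments, session setup, and cross-module imports are assumptions, not shown in this diff):

```python
# Sketch of the discovery flow feeding the dynamic tool docstring
service = ConnectorService(session)  # constructor signature assumed

enabled = await service.get_available_connectors(search_space_id=7)
indexed = await service.get_available_document_types(search_space_id=7)

# _map_connectors_to_searchable_types lives in the deep agent factory module
searchable = _map_connectors_to_searchable_types(enabled)
# `searchable` and `indexed` are then passed to create_search_knowledge_base_tool(...)
```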

View file

@@ -54,21 +54,64 @@ def format_attachments_as_context(attachments: list[ChatAttachment]) -> str:
def format_mentioned_documents_as_context(documents: list[Document]) -> str:
-    """Format mentioned documents as context for the agent."""
+    """
+    Format mentioned documents as context for the agent.
+
+    Uses the same XML structure as knowledge_base.format_documents_for_context
+    to ensure citations work properly with chunk IDs.
+    """
    if not documents:
        return ""

    context_parts = ["<mentioned_documents>"]
    context_parts.append(
        "The user has explicitly mentioned the following documents from their knowledge base. "
-        "These documents are directly relevant to the query and should be prioritized as primary sources."
+        "These documents are directly relevant to the query and should be prioritized as primary sources. "
+        "Use [citation:CHUNK_ID] format for citations (e.g., [citation:123])."
    )
+    context_parts.append("")

-    for i, doc in enumerate(documents, 1):
-        context_parts.append(
-            f"<document index='{i}' id='{doc.id}' title='{doc.title}' type='{doc.document_type.value}'>"
-        )
-        context_parts.append(f"<![CDATA[{doc.content}]]>")
+    for doc in documents:
+        # Build metadata JSON
+        metadata = doc.document_metadata or {}
+        metadata_json = json.dumps(metadata, ensure_ascii=False)
+
+        # Get URL from metadata
+        url = (
+            metadata.get("url")
+            or metadata.get("source")
+            or metadata.get("page_url")
+            or ""
+        )
+
+        context_parts.append("<document>")
+        context_parts.append("<document_metadata>")
+        context_parts.append(f" <document_id>{doc.id}</document_id>")
+        context_parts.append(f" <document_type>{doc.document_type.value}</document_type>")
+        context_parts.append(f" <title><![CDATA[{doc.title}]]></title>")
+        context_parts.append(f" <url><![CDATA[{url}]]></url>")
+        context_parts.append(f" <metadata_json><![CDATA[{metadata_json}]]></metadata_json>")
+        context_parts.append("</document_metadata>")
+        context_parts.append("")
+        context_parts.append("<document_content>")
+
+        # Use chunks if available (preferred for proper citations)
+        if hasattr(doc, "chunks") and doc.chunks:
+            for chunk in doc.chunks:
+                context_parts.append(
+                    f" <chunk id='{chunk.id}'><![CDATA[{chunk.content}]]></chunk>"
+                )
+        else:
+            # Fallback to document content if chunks not loaded
+            # Use document ID as chunk ID prefix for consistency
+            context_parts.append(
+                f" <chunk id='{doc.id}'><![CDATA[{doc.content}]]></chunk>"
+            )
+
+        context_parts.append("</document_content>")
        context_parts.append("</document>")
+        context_parts.append("")

    context_parts.append("</mentioned_documents>")
    return "\n".join(context_parts)
@@ -81,8 +124,6 @@ def format_mentioned_surfsense_docs_as_context(
    if not documents:
        return ""

-    import json
-
    context_parts = ["<mentioned_surfsense_docs>"]
    context_parts.append(
        "The user has explicitly mentioned the following SurfSense documentation pages. "
@@ -262,11 +303,15 @@ async def stream_new_chat(
    # Build input with message history from frontend
    langchain_messages = []

-    # Fetch mentioned documents if any
+    # Fetch mentioned documents if any (with chunks for proper citations)
    mentioned_documents: list[Document] = []
    if mentioned_document_ids:
+        from sqlalchemy.orm import selectinload as doc_selectinload
+
        result = await session.execute(
-            select(Document).filter(
+            select(Document)
+            .options(doc_selectinload(Document.chunks))
+            .filter(
                Document.id.in_(mentioned_document_ids),
                Document.search_space_id == search_space_id,
            )

View file

@@ -1,6 +1,6 @@
[project]
name = "surf-new-backend"
-version = "0.0.11"
+version = "0.0.12"
description = "SurfSense Backend"
requires-python = ">=3.12"
dependencies = [

View file

@@ -6545,7 +6545,7 @@ wheels = [
[[package]]
name = "surf-new-backend"
-version = "0.0.11"
+version = "0.0.12"
source = { editable = "." }
dependencies = [
    { name = "alembic" },

View file

@@ -1,7 +1,7 @@
{
  "name": "surfsense_browser_extension",
  "displayName": "Surfsense Browser Extension",
-  "version": "0.0.11",
+  "version": "0.0.12",
  "description": "Extension to collect Browsing History for SurfSense.",
  "author": "https://github.com/MODSetter",
  "engines": {

View file

@@ -24,6 +24,16 @@
      "enabled": true,
      "status": "warning",
      "statusMessage": "Some requests may be blocked if not using Firecrawl."
-    }
+    },
+    "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": {
+      "enabled": false,
+      "status": "disabled",
+      "statusMessage": "Not available yet."
+    },
+    "GITHUB_CONNECTOR": {
+      "enabled": false,
+      "status": "warning",
+      "statusMessage": "Some issues with indexing repositories."
+    }
  },
  "globalSettings": {

View file

@@ -1,13 +1,14 @@
"use client";

import { useQuery, useQueryClient } from "@tanstack/react-query";
-import { useAtomValue } from "jotai";
+import { useAtomValue, useSetAtom } from "jotai";
import { Inbox, LogOut, SquareLibrary, Trash2 } from "lucide-react";
import { useParams, usePathname, useRouter } from "next/navigation";
import { useTranslations } from "next-intl";
import { useTheme } from "next-themes";
-import { useCallback, useMemo, useState } from "react";
+import { useCallback, useEffect, useMemo, useState } from "react";
import { toast } from "sonner";
+import { currentThreadAtom, resetCurrentThreadAtom } from "@/atoms/chat/current-thread.atom";
import { deleteSearchSpaceMutationAtom } from "@/atoms/search-spaces/search-space-mutation.atoms";
import { searchSpacesAtom } from "@/atoms/search-spaces/search-space-query.atoms";
import { currentUserAtom } from "@/atoms/user/user-query.atoms";
@@ -68,11 +69,16 @@ export function LayoutDataProvider({
  const { data: user } = useAtomValue(currentUserAtom);
  const { data: searchSpacesData, refetch: refetchSearchSpaces } = useAtomValue(searchSpacesAtom);
  const { mutateAsync: deleteSearchSpace } = useAtomValue(deleteSearchSpaceMutationAtom);
+  const currentThreadState = useAtomValue(currentThreadAtom);
+  const resetCurrentThread = useSetAtom(resetCurrentThreadAtom);

-  // Current IDs from URL
+  // State for handling new chat navigation when router is out of sync
+  const [pendingNewChat, setPendingNewChat] = useState(false);
+
+  // Current IDs from URL, with fallback to atom for replaceState updates
  const currentChatId = params?.chat_id
    ? Number(Array.isArray(params.chat_id) ? params.chat_id[0] : params.chat_id)
-    : null;
+    : currentThreadState.id;

  // Fetch current search space (for caching purposes)
  useQuery({
@@ -124,6 +130,17 @@ export function LayoutDataProvider({
  const [isDeletingSearchSpace, setIsDeletingSearchSpace] = useState(false);
  const [isLeavingSearchSpace, setIsLeavingSearchSpace] = useState(false);

+  // Effect to complete new chat navigation after router syncs
+  // This runs when handleNewChat detected an out-of-sync state and triggered a sync
+  useEffect(() => {
+    if (pendingNewChat && params?.chat_id) {
+      // Router is now synced (chat_id is in params), complete navigation to new-chat
+      resetCurrentThread();
+      router.push(`/dashboard/${searchSpaceId}/new-chat`);
+      setPendingNewChat(false);
+    }
+  }, [pendingNewChat, params?.chat_id, router, searchSpaceId, resetCurrentThread]);
+
  const searchSpaces: SearchSpace[] = useMemo(() => {
    if (!searchSpacesData || !Array.isArray(searchSpacesData)) return [];
    return searchSpacesData.map((space) => ({
@@ -175,12 +192,6 @@ export function LayoutDataProvider({
  // Navigation items
  const navItems: NavItem[] = useMemo(
    () => [
-      {
-        title: "Documents",
-        url: `/dashboard/${searchSpaceId}/documents`,
-        icon: SquareLibrary,
-        isActive: pathname?.includes("/documents"),
-      },
      {
        title: "Inbox",
        url: "#inbox", // Special URL to indicate this is handled differently
@@ -188,6 +199,12 @@ export function LayoutDataProvider({
        isActive: isInboxSidebarOpen,
        badge: unreadCount > 0 ? formatInboxCount(unreadCount) : undefined,
      },
+      {
+        title: "Documents",
+        url: `/dashboard/${searchSpaceId}/documents`,
+        icon: SquareLibrary,
+        isActive: pathname?.includes("/documents"),
+      },
    ],
    [searchSpaceId, pathname, isInboxSidebarOpen, unreadCount]
  );
@@ -292,8 +309,20 @@ export function LayoutDataProvider({
  );

  const handleNewChat = useCallback(() => {
-    router.push(`/dashboard/${searchSpaceId}/new-chat`);
-  }, [router, searchSpaceId]);
+    // Check if router is out of sync (thread created via replaceState but params don't have chat_id)
+    const isOutOfSync = currentThreadState.id !== null && !params?.chat_id;
+
+    if (isOutOfSync) {
+      // First sync Next.js router by navigating to the current chat's actual URL
+      // This updates the router's internal state to match the browser URL
+      router.replace(`/dashboard/${searchSpaceId}/new-chat/${currentThreadState.id}`);
+      // Set flag to trigger navigation to new-chat after params update
+      setPendingNewChat(true);
+    } else {
+      // Normal navigation - router is in sync
+      router.push(`/dashboard/${searchSpaceId}/new-chat`);
+    }
+  }, [router, searchSpaceId, currentThreadState.id, params?.chat_id]);

  const handleChatSelect = useCallback(
    (chat: ChatItem) => {

View file

@@ -1,6 +1,6 @@
{
  "name": "surfsense_web",
-  "version": "0.0.11",
+  "version": "0.0.12",
  "private": true,
  "description": "SurfSense Frontend",
  "scripts": {