merge: upstream/dev with migration renumbering

This commit is contained in:
CREDO23 2026-01-27 11:22:26 +02:00
commit a7145b2c63
176 changed files with 8791 additions and 3608 deletions

View file

@ -19,6 +19,7 @@ Available tools:
# Tool factory exports (for direct use)
from .display_image import create_display_image_tool
from .knowledge_base import (
CONNECTOR_DESCRIPTIONS,
create_search_knowledge_base_tool,
format_documents_for_context,
search_knowledge_base_async,
@ -40,6 +41,8 @@ from .user_memory import create_recall_memory_tool, create_save_memory_tool
__all__ = [
# Registry
"BUILTIN_TOOLS",
# Knowledge base utilities
"CONNECTOR_DESCRIPTIONS",
"ToolDefinition",
"build_tools",
# Tool factories
@ -51,7 +54,6 @@ __all__ = [
"create_scrape_webpage_tool",
"create_search_knowledge_base_tool",
"create_search_surfsense_docs_tool",
# Knowledge base utilities
"format_documents_for_context",
"get_all_tool_names",
"get_default_enabled_tools",

View file

@ -12,7 +12,8 @@ import json
from datetime import datetime
from typing import Any
from langchain_core.tools import tool
from langchain_core.tools import StructuredTool
from pydantic import BaseModel, Field
from sqlalchemy.ext.asyncio import AsyncSession
from app.services.connector_service import ConnectorService
@ -22,6 +23,7 @@ from app.services.connector_service import ConnectorService
# =============================================================================
# Canonical connector values used internally by ConnectorService
# Includes all document types and search source connectors
_ALL_CONNECTORS: list[str] = [
"EXTENSION",
"FILE",
@ -50,41 +52,117 @@ _ALL_CONNECTORS: list[str] = [
"CRAWLED_URL",
"CIRCLEBACK",
"OBSIDIAN_CONNECTOR",
# Composio connectors
"COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
"COMPOSIO_GMAIL_CONNECTOR",
"COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
]
# Human-readable descriptions for each connector type
# Used for generating dynamic docstrings and informing the LLM
# Keys match the canonical connector values in _ALL_CONNECTORS; values are the
# one-line descriptions surfaced to the model when listing searchable sources.
CONNECTOR_DESCRIPTIONS: dict[str, str] = {
    # Content ingested directly by SurfSense
    "EXTENSION": "Web content saved via SurfSense browser extension (personal browsing history)",
    "FILE": "User-uploaded documents (PDFs, Word, etc.) (personal files)",
    "NOTE": "SurfSense Notes (notes created inside SurfSense)",
    # Third-party workspace/communication connectors
    "SLACK_CONNECTOR": "Slack conversations and shared content (personal workspace communications)",
    "TEAMS_CONNECTOR": "Microsoft Teams messages and conversations (personal Teams communications)",
    "NOTION_CONNECTOR": "Notion workspace pages and databases (personal knowledge management)",
    "YOUTUBE_VIDEO": "YouTube video transcripts and metadata (personally saved videos)",
    "GITHUB_CONNECTOR": "GitHub repository content and issues (personal repositories and interactions)",
    "ELASTICSEARCH_CONNECTOR": "Elasticsearch indexed documents and data (personal Elasticsearch instances)",
    "LINEAR_CONNECTOR": "Linear project issues and discussions (personal project management)",
    "JIRA_CONNECTOR": "Jira project issues, tickets, and comments (personal project tracking)",
    "CONFLUENCE_CONNECTOR": "Confluence pages and comments (personal project documentation)",
    "CLICKUP_CONNECTOR": "ClickUp tasks and project data (personal task management)",
    "GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events, meetings, and schedules (personal calendar)",
    "GOOGLE_GMAIL_CONNECTOR": "Google Gmail emails and conversations (personal emails)",
    "GOOGLE_DRIVE_FILE": "Google Drive files and documents (personal cloud storage)",
    "DISCORD_CONNECTOR": "Discord server conversations and shared content (personal community)",
    "AIRTABLE_CONNECTOR": "Airtable records, tables, and database content (personal data)",
    # Live web-search APIs
    "TAVILY_API": "Tavily web search API results (real-time web search)",
    "SEARXNG_API": "SearxNG search API results (privacy-focused web search)",
    "LINKUP_API": "Linkup search API results (web search)",
    "BAIDU_SEARCH_API": "Baidu search API results (Chinese web search)",
    "LUMA_CONNECTOR": "Luma events and meetings",
    # WEBCRAWLER_CONNECTOR is the user-facing alias; CRAWLED_URL is the
    # canonical document type it maps to (see _normalize_connectors).
    "WEBCRAWLER_CONNECTOR": "Webpages indexed by SurfSense (personally selected websites)",
    "CRAWLED_URL": "Webpages indexed by SurfSense (personally selected websites)",
    "BOOKSTACK_CONNECTOR": "BookStack pages (personal documentation)",
    "CIRCLEBACK": "Circleback meeting notes, transcripts, and action items",
    "OBSIDIAN_CONNECTOR": "Obsidian vault notes and markdown files (personal notes)",
    # Composio connectors
    "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "Google Drive files via Composio (personal cloud storage)",
    "COMPOSIO_GMAIL_CONNECTOR": "Gmail emails via Composio (personal emails)",
    "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events via Composio (personal calendar)",
}
def _normalize_connectors(
    connectors_to_search: list[str] | None,
    available_connectors: list[str] | None = None,
) -> list[str]:
    """
    Normalize the connector list provided by the model.

    - Accepts user-facing enums like WEBCRAWLER_CONNECTOR and maps them to
      canonical ConnectorService types.
    - Drops unknown values.
    - If available_connectors is provided, only includes connectors from that list.
    - If connectors_to_search is None/empty, defaults to available_connectors or all.

    Args:
        connectors_to_search: List of connectors requested by the model.
        available_connectors: List of connectors actually available in the
            search space, or None to allow every known connector.

    Returns:
        List of normalized connector strings to search.
    """
    # Determine the set of valid connectors to consider
    valid_set = (
        set(available_connectors) if available_connectors else set(_ALL_CONNECTORS)
    )

    if not connectors_to_search:
        # Search all available connectors if none specified
        return (
            list(available_connectors)
            if available_connectors
            else list(_ALL_CONNECTORS)
        )

    normalized: list[str] = []
    for raw in connectors_to_search:
        # Defensive: tolerate None entries and surrounding whitespace,
        # and compare case-insensitively.
        c = (raw or "").strip().upper()
        if not c:
            continue
        # Map user-facing aliases to canonical names
        if c == "WEBCRAWLER_CONNECTOR":
            c = "CRAWLED_URL"
        normalized.append(c)

    # De-dupe while preserving order + filter to valid connectors
    seen: set[str] = set()
    out: list[str] = []
    for c in normalized:
        if c in seen:
            continue
        # Only include if it's a known connector AND available in this space
        if c not in _ALL_CONNECTORS:
            continue
        if c not in valid_set:
            continue
        seen.add(c)
        out.append(c)

    # Fallback to all available if nothing the model asked for matched
    return (
        out
        if out
        else (
            list(available_connectors)
            if available_connectors
            else list(_ALL_CONNECTORS)
        )
    )
# =============================================================================
@ -233,6 +311,7 @@ async def search_knowledge_base_async(
top_k: int = 10,
start_date: datetime | None = None,
end_date: datetime | None = None,
available_connectors: list[str] | None = None,
) -> str:
"""
Search the user's knowledge base for relevant documents.
@ -248,6 +327,8 @@ async def search_knowledge_base_async(
top_k: Number of results per connector
start_date: Optional start datetime (UTC) for filtering documents
end_date: Optional end datetime (UTC) for filtering documents
available_connectors: Optional list of connectors actually available in the search space.
If provided, only these connectors will be searched.
Returns:
Formatted string with search results
@ -262,7 +343,7 @@ async def search_knowledge_base_async(
end_date=end_date,
)
connectors = _normalize_connectors(connectors_to_search)
connectors = _normalize_connectors(connectors_to_search, available_connectors)
for connector in connectors:
try:
@ -316,6 +397,16 @@ async def search_knowledge_base_async(
)
all_documents.extend(chunks)
elif connector == "TEAMS_CONNECTOR":
_, chunks = await connector_service.search_teams(
user_query=query,
search_space_id=search_space_id,
top_k=top_k,
start_date=resolved_start_date,
end_date=resolved_end_date,
)
all_documents.extend(chunks)
elif connector == "NOTION_CONNECTOR":
_, chunks = await connector_service.search_notion(
user_query=query,
@ -519,6 +610,39 @@ async def search_knowledge_base_async(
)
all_documents.extend(chunks)
# =========================================================
# Composio Connectors
# =========================================================
elif connector == "COMPOSIO_GOOGLE_DRIVE_CONNECTOR":
_, chunks = await connector_service.search_composio_google_drive(
user_query=query,
search_space_id=search_space_id,
top_k=top_k,
start_date=resolved_start_date,
end_date=resolved_end_date,
)
all_documents.extend(chunks)
elif connector == "COMPOSIO_GMAIL_CONNECTOR":
_, chunks = await connector_service.search_composio_gmail(
user_query=query,
search_space_id=search_space_id,
top_k=top_k,
start_date=resolved_start_date,
end_date=resolved_end_date,
)
all_documents.extend(chunks)
elif connector == "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR":
_, chunks = await connector_service.search_composio_google_calendar(
user_query=query,
search_space_id=search_space_id,
top_k=top_k,
start_date=resolved_start_date,
end_date=resolved_end_date,
)
all_documents.extend(chunks)
except Exception as e:
print(f"Error searching connector {connector}: {e}")
continue
@ -543,11 +667,68 @@ async def search_knowledge_base_async(
return format_documents_for_context(deduplicated)
def _build_connector_docstring(available_connectors: list[str] | None) -> str:
    """
    Build the connector documentation section for the tool docstring.

    Args:
        available_connectors: List of available connector types, or None for all

    Returns:
        Formatted docstring section listing available connectors, one
        "- NAME: description" bullet per connector.
    """
    source = available_connectors or list(_ALL_CONNECTORS)
    entries: list[str] = []
    for name in source:
        description = CONNECTOR_DESCRIPTIONS.get(name, name)
        # CRAWLED_URL is the canonical internal document type; surface the
        # user-facing WEBCRAWLER_CONNECTOR alias in the docs instead.
        label = "WEBCRAWLER_CONNECTOR" if name == "CRAWLED_URL" else name
        entries.append(f"- {label}: {description}")
    return "\n".join(entries)
# =============================================================================
# Tool Input Schema
# =============================================================================
class SearchKnowledgeBaseInput(BaseModel):
    """Input schema for the search_knowledge_base tool."""

    # Free-text search query forwarded to the connector searches.
    query: str = Field(
        description="The search query - be specific and include key terms"
    )
    # Result cap; per search_knowledge_base_async this is the number of
    # results requested from each searched connector.
    top_k: int = Field(
        default=10,
        description="Number of results to retrieve (default: 10)",
    )
    # Dates arrive from the LLM as ISO strings; they are parsed to datetime
    # downstream (parse_date_or_datetime) before querying.
    start_date: str | None = Field(
        default=None,
        description="Optional ISO date/datetime (e.g. '2025-12-12' or '2025-12-12T00:00:00+00:00')",
    )
    end_date: str | None = Field(
        default=None,
        description="Optional ISO date/datetime (e.g. '2025-12-19' or '2025-12-19T23:59:59+00:00')",
    )
    # Optional connector enum filter; None/empty means search broadly across
    # all connectors available in the search space.
    connectors_to_search: list[str] | None = Field(
        default=None,
        description="Optional list of connector enums to search. If omitted, searches all available.",
    )
def create_search_knowledge_base_tool(
search_space_id: int,
db_session: AsyncSession,
connector_service: ConnectorService,
):
available_connectors: list[str] | None = None,
available_document_types: list[str] | None = None,
) -> StructuredTool:
"""
Factory function to create the search_knowledge_base tool with injected dependencies.
@ -555,72 +736,57 @@ def create_search_knowledge_base_tool(
search_space_id: The user's search space ID
db_session: Database session
connector_service: Initialized connector service
available_connectors: Optional list of connector types available in the search space.
Used to dynamically generate the tool docstring.
available_document_types: Optional list of document types that have data in the search space.
Used to inform the LLM about what data exists.
Returns:
A configured tool function
A configured StructuredTool instance
"""
# Build connector documentation dynamically
connector_docs = _build_connector_docstring(available_connectors)
@tool
async def search_knowledge_base(
# Build context about available document types
doc_types_info = ""
if available_document_types:
doc_types_info = f"""
## Document types with indexed content in this search space
The following document types have content available for search:
{", ".join(available_document_types)}
Focus searches on these types for best results."""
# Build the dynamic description for the tool
# This is what the LLM sees when deciding whether/how to use the tool
dynamic_description = f"""Search the user's personal knowledge base for relevant information.
Use this tool to find documents, notes, files, web pages, and other content that may help answer the user's question.
IMPORTANT:
- If the user requests a specific source type (e.g. "my notes", "Slack messages"), pass `connectors_to_search=[...]` using the enums below.
- If `connectors_to_search` is omitted/empty, the system will search broadly.
- Only connectors that are enabled/configured for this search space are available.{doc_types_info}
## Available connector enums for `connectors_to_search`
{connector_docs}
NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`."""
# Capture for closure
_available_connectors = available_connectors
async def _search_knowledge_base_impl(
query: str,
top_k: int = 10,
start_date: str | None = None,
end_date: str | None = None,
connectors_to_search: list[str] | None = None,
) -> str:
"""
Search the user's personal knowledge base for relevant information.
Use this tool to find documents, notes, files, web pages, and other content
that may help answer the user's question.
IMPORTANT:
- If the user requests a specific source type (e.g. "my notes", "Slack messages"),
pass `connectors_to_search=[...]` using the enums below.
- If `connectors_to_search` is omitted/empty, the system will search broadly.
## Available connector enums for `connectors_to_search`
- EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
- FILE: "User-uploaded documents (PDFs, Word, etc.)" (personal files)
- NOTE: "SurfSense Notes" (notes created inside SurfSense)
- SLACK_CONNECTOR: "Slack conversations and shared content" (personal workspace communications)
- TEAMS_CONNECTOR: "Microsoft Teams messages and conversations" (personal Teams communications)
- NOTION_CONNECTOR: "Notion workspace pages and databases" (personal knowledge management)
- YOUTUBE_VIDEO: "YouTube video transcripts and metadata" (personally saved videos)
- GITHUB_CONNECTOR: "GitHub repository content and issues" (personal repositories and interactions)
- ELASTICSEARCH_CONNECTOR: "Elasticsearch indexed documents and data" (personal Elasticsearch instances and custom data sources)
- LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project management)
- JIRA_CONNECTOR: "Jira project issues, tickets, and comments" (personal project tracking)
- CONFLUENCE_CONNECTOR: "Confluence pages and comments" (personal project documentation)
- CLICKUP_CONNECTOR: "ClickUp tasks and project data" (personal task management)
- GOOGLE_CALENDAR_CONNECTOR: "Google Calendar events, meetings, and schedules" (personal calendar and time management)
- GOOGLE_GMAIL_CONNECTOR: "Google Gmail emails and conversations" (personal emails and communications)
- GOOGLE_DRIVE_FILE: "Google Drive files and documents" (personal cloud storage and file management)
- DISCORD_CONNECTOR: "Discord server conversations and shared content" (personal community communications)
- AIRTABLE_CONNECTOR: "Airtable records, tables, and database content" (personal data management and organization)
- TAVILY_API: "Tavily search API results" (personalized search results)
- SEARXNG_API: "SearxNG search API results" (personalized search results)
- LINKUP_API: "Linkup search API results" (personalized search results)
- BAIDU_SEARCH_API: "Baidu search API results" (personalized search results)
- LUMA_CONNECTOR: "Luma events"
- WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense" (personally selected websites)
- BOOKSTACK_CONNECTOR: "BookStack pages" (personal documentation)
- CIRCLEBACK: "Circleback meeting notes, transcripts, and action items" (personal meeting records)
- OBSIDIAN_CONNECTOR: "Obsidian vault notes and markdown files" (personal notes and knowledge management)
NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`.
Args:
query: The search query - be specific and include key terms
top_k: Number of results to retrieve (default: 10)
start_date: Optional ISO date/datetime (e.g. "2025-12-12" or "2025-12-12T00:00:00+00:00")
end_date: Optional ISO date/datetime (e.g. "2025-12-19" or "2025-12-19T23:59:59+00:00")
connectors_to_search: Optional list of connector enums to search. If omitted, searches all.
Returns:
Formatted string with relevant documents and their content
"""
"""Implementation function for knowledge base search."""
from app.agents.new_chat.utils import parse_date_or_datetime
parsed_start: datetime | None = None
@ -640,6 +806,16 @@ def create_search_knowledge_base_tool(
top_k=top_k,
start_date=parsed_start,
end_date=parsed_end,
available_connectors=_available_connectors,
)
return search_knowledge_base
# Create StructuredTool with dynamic description
# This properly sets the description that the LLM sees
tool = StructuredTool(
name="search_knowledge_base",
description=dynamic_description,
coroutine=_search_knowledge_base_impl,
args_schema=SearchKnowledgeBaseInput,
)
return tool

View file

@ -85,6 +85,7 @@ class ToolDefinition:
# Contributors: Add your new tools here!
BUILTIN_TOOLS: list[ToolDefinition] = [
# Core tool - searches the user's knowledge base
# Now supports dynamic connector/document type discovery
ToolDefinition(
name="search_knowledge_base",
description="Search the user's personal knowledge base for relevant information",
@ -92,8 +93,12 @@ BUILTIN_TOOLS: list[ToolDefinition] = [
search_space_id=deps["search_space_id"],
db_session=deps["db_session"],
connector_service=deps["connector_service"],
# Optional: dynamically discovered connectors/document types
available_connectors=deps.get("available_connectors"),
available_document_types=deps.get("available_document_types"),
),
requires=["search_space_id", "db_session", "connector_service"],
# Note: available_connectors and available_document_types are optional
),
# Podcast generation tool
ToolDefinition(