diff --git a/README.md b/README.md index 7f50b924c..4dd368c04 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,7 @@ SurfSense is a highly customizable AI research agent, connected to external sour # Video -https://github.com/user-attachments/assets/42a29ea1-d4d8-4213-9c69-972b5b806d58 - +https://github.com/user-attachments/assets/cc0c84d3-1f2f-4f7a-b519-2ecce22310b1 ## Podcast Sample @@ -52,8 +51,10 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7 - Interact in Natural Language and get cited answers. ### 📄 **Cited Answers** - Get Cited answers just like Perplexity. +### 🧩 **Universal Compatibility** +- Connect virtually any inference provider via the OpenAI spec and LiteLLM. ### 🔔 **Privacy & Local LLM Support** -- Works Flawlessly with Ollama local LLMs. +- Works Flawlessly with local LLMs like vLLM and Ollama. ### 🏠 **Self Hostable** - Open source and easy to deploy locally. ### 👥 **Team Collaboration with RBAC** @@ -61,6 +62,7 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7 - Invite team members with customizable roles (Owner, Admin, Editor, Viewer) - Granular permissions for documents, chats, connectors, and settings - Share knowledge bases securely within your organization +- Team chats update in real-time and "Chat about the chat" in comment threads ### 🎙️ Podcasts - Blazingly fast podcast generation agent. (Creates a 3-minute podcast in under 20 seconds.) - Convert your chat conversations into engaging audio content @@ -237,6 +239,8 @@ Before self-hosting installation, make sure to complete the [prerequisite setup ### **BackEnd** +- **LiteLLM**: Universal LLM integration supporting 100+ models (OpenAI, Anthropic, Ollama, etc.) + - **FastAPI**: Modern, fast web framework for building APIs with Python - **PostgreSQL with pgvector**: Database with vector search capabilities for similarity searches @@ -253,8 +257,6 @@ Before self-hosting installation, make sure to complete the [prerequisite setup - **LangChain**: Framework for developing AI-powered applications. -- **LiteLLM**: Universal LLM integration supporting 100+ models (OpenAI, Anthropic, Ollama, etc.) - - **Rerankers**: Advanced result ranking for improved search relevance - **Hybrid Search**: Combines vector similarity and full-text search for optimal results using Reciprocal Rank Fusion (RRF) diff --git a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py index 5bc6ac2e2..53e1b14bd 100644 --- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py +++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py @@ -7,6 +7,7 @@ via NewLLMConfig. """ from collections.abc import Sequence +from typing import Any from deepagents import create_deep_agent from langchain_core.tools import BaseTool @@ -23,6 +24,90 @@ from app.agents.new_chat.system_prompt import ( from app.agents.new_chat.tools.registry import build_tools_async from app.services.connector_service import ConnectorService +# ============================================================================= +# Connector Type Mapping +# ============================================================================= + +# Maps SearchSourceConnectorType enum values to the searchable document/connector types +# used by the knowledge_base tool. Some connectors map to different document types. 
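+# For example, GOOGLE_DRIVE_CONNECTOR content is indexed under the GOOGLE_DRIVE_FILE
+# document type, and WEBCRAWLER_CONNECTOR content under CRAWLED_URL.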
+_CONNECTOR_TYPE_TO_SEARCHABLE: dict[str, str] = { + # Direct mappings (connector type == searchable type) + "TAVILY_API": "TAVILY_API", + "SEARXNG_API": "SEARXNG_API", + "LINKUP_API": "LINKUP_API", + "BAIDU_SEARCH_API": "BAIDU_SEARCH_API", + "SLACK_CONNECTOR": "SLACK_CONNECTOR", + "TEAMS_CONNECTOR": "TEAMS_CONNECTOR", + "NOTION_CONNECTOR": "NOTION_CONNECTOR", + "GITHUB_CONNECTOR": "GITHUB_CONNECTOR", + "LINEAR_CONNECTOR": "LINEAR_CONNECTOR", + "DISCORD_CONNECTOR": "DISCORD_CONNECTOR", + "JIRA_CONNECTOR": "JIRA_CONNECTOR", + "CONFLUENCE_CONNECTOR": "CONFLUENCE_CONNECTOR", + "CLICKUP_CONNECTOR": "CLICKUP_CONNECTOR", + "GOOGLE_CALENDAR_CONNECTOR": "GOOGLE_CALENDAR_CONNECTOR", + "GOOGLE_GMAIL_CONNECTOR": "GOOGLE_GMAIL_CONNECTOR", + "GOOGLE_DRIVE_CONNECTOR": "GOOGLE_DRIVE_FILE", # Connector type differs from document type + "AIRTABLE_CONNECTOR": "AIRTABLE_CONNECTOR", + "LUMA_CONNECTOR": "LUMA_CONNECTOR", + "ELASTICSEARCH_CONNECTOR": "ELASTICSEARCH_CONNECTOR", + "WEBCRAWLER_CONNECTOR": "CRAWLED_URL", # Maps to document type + "BOOKSTACK_CONNECTOR": "BOOKSTACK_CONNECTOR", + "CIRCLEBACK_CONNECTOR": "CIRCLEBACK", # Connector type differs from document type + "OBSIDIAN_CONNECTOR": "OBSIDIAN_CONNECTOR", + # Composio connectors + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "COMPOSIO_GMAIL_CONNECTOR": "COMPOSIO_GMAIL_CONNECTOR", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", +} + +# Document types that don't come from SearchSourceConnector but should always be searchable +_ALWAYS_AVAILABLE_DOC_TYPES: list[str] = [ + "EXTENSION", # Browser extension data + "FILE", # Uploaded files + "NOTE", # User notes + "YOUTUBE_VIDEO", # YouTube videos +] + + +def _map_connectors_to_searchable_types( + connector_types: list[Any], +) -> list[str]: + """ + Map SearchSourceConnectorType enums to searchable document/connector types. + + This function: + 1. Converts connector type enums to their searchable counterparts + 2. Includes always-available document types (EXTENSION, FILE, NOTE, YOUTUBE_VIDEO) + 3. 
Deduplicates while preserving order + + Args: + connector_types: List of SearchSourceConnectorType enum values + + Returns: + List of searchable connector/document type strings + """ + result_set: set[str] = set() + result_list: list[str] = [] + + # Add always-available document types first + for doc_type in _ALWAYS_AVAILABLE_DOC_TYPES: + if doc_type not in result_set: + result_set.add(doc_type) + result_list.append(doc_type) + + # Map each connector type to its searchable equivalent + for ct in connector_types: + # Handle both enum and string types + ct_str = ct.value if hasattr(ct, "value") else str(ct) + searchable = _CONNECTOR_TYPE_TO_SEARCHABLE.get(ct_str) + if searchable and searchable not in result_set: + result_set.add(searchable) + result_list.append(searchable) + + return result_list + + # ============================================================================= # Deep Agent Factory # ============================================================================= @@ -116,6 +201,30 @@ async def create_surfsense_deep_agent( additional_tools=[my_custom_tool] ) """ + # Discover available connectors and document types for this search space + # This enables dynamic tool docstrings that inform the LLM about what's actually available + available_connectors: list[str] | None = None + available_document_types: list[str] | None = None + + try: + # Get enabled search source connectors for this search space + connector_types = await connector_service.get_available_connectors( + search_space_id + ) + if connector_types: + # Convert enum values to strings and also include mapped document types + available_connectors = _map_connectors_to_searchable_types(connector_types) + + # Get document types that have at least one document indexed + available_document_types = await connector_service.get_available_document_types( + search_space_id + ) + except Exception as e: + # Log but don't fail - fall back to all connectors if discovery fails + import logging + + logging.warning(f"Failed to discover available connectors/document types: {e}") + # Build dependencies dict for the tools registry dependencies = { "search_space_id": search_space_id, @@ -123,6 +232,9 @@ async def create_surfsense_deep_agent( "connector_service": connector_service, "firecrawl_api_key": firecrawl_api_key, "user_id": user_id, # Required for memory tools + # Dynamic connector/document type discovery for knowledge base tool + "available_connectors": available_connectors, + "available_document_types": available_document_types, } # Build tools using the async registry (includes MCP tools) diff --git a/surfsense_backend/app/agents/new_chat/tools/__init__.py b/surfsense_backend/app/agents/new_chat/tools/__init__.py index acbdbcb3a..9e1a4f19c 100644 --- a/surfsense_backend/app/agents/new_chat/tools/__init__.py +++ b/surfsense_backend/app/agents/new_chat/tools/__init__.py @@ -19,6 +19,7 @@ Available tools: # Tool factory exports (for direct use) from .display_image import create_display_image_tool from .knowledge_base import ( + CONNECTOR_DESCRIPTIONS, create_search_knowledge_base_tool, format_documents_for_context, search_knowledge_base_async, @@ -40,6 +41,8 @@ from .user_memory import create_recall_memory_tool, create_save_memory_tool __all__ = [ # Registry "BUILTIN_TOOLS", + # Knowledge base utilities + "CONNECTOR_DESCRIPTIONS", "ToolDefinition", "build_tools", # Tool factories @@ -51,7 +54,6 @@ __all__ = [ "create_scrape_webpage_tool", "create_search_knowledge_base_tool", "create_search_surfsense_docs_tool", - # Knowledge base 
utilities "format_documents_for_context", "get_all_tool_names", "get_default_enabled_tools", diff --git a/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py b/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py index 552019dda..a11e4ac38 100644 --- a/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py +++ b/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py @@ -12,7 +12,8 @@ import json from datetime import datetime from typing import Any -from langchain_core.tools import tool +from langchain_core.tools import StructuredTool +from pydantic import BaseModel, Field from sqlalchemy.ext.asyncio import AsyncSession from app.services.connector_service import ConnectorService @@ -22,6 +23,7 @@ from app.services.connector_service import ConnectorService # ============================================================================= # Canonical connector values used internally by ConnectorService +# Includes all document types and search source connectors _ALL_CONNECTORS: list[str] = [ "EXTENSION", "FILE", @@ -50,41 +52,117 @@ _ALL_CONNECTORS: list[str] = [ "CRAWLED_URL", "CIRCLEBACK", "OBSIDIAN_CONNECTOR", + # Composio connectors + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "COMPOSIO_GMAIL_CONNECTOR", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", ] +# Human-readable descriptions for each connector type +# Used for generating dynamic docstrings and informing the LLM +CONNECTOR_DESCRIPTIONS: dict[str, str] = { + "EXTENSION": "Web content saved via SurfSense browser extension (personal browsing history)", + "FILE": "User-uploaded documents (PDFs, Word, etc.) (personal files)", + "NOTE": "SurfSense Notes (notes created inside SurfSense)", + "SLACK_CONNECTOR": "Slack conversations and shared content (personal workspace communications)", + "TEAMS_CONNECTOR": "Microsoft Teams messages and conversations (personal Teams communications)", + "NOTION_CONNECTOR": "Notion workspace pages and databases (personal knowledge management)", + "YOUTUBE_VIDEO": "YouTube video transcripts and metadata (personally saved videos)", + "GITHUB_CONNECTOR": "GitHub repository content and issues (personal repositories and interactions)", + "ELASTICSEARCH_CONNECTOR": "Elasticsearch indexed documents and data (personal Elasticsearch instances)", + "LINEAR_CONNECTOR": "Linear project issues and discussions (personal project management)", + "JIRA_CONNECTOR": "Jira project issues, tickets, and comments (personal project tracking)", + "CONFLUENCE_CONNECTOR": "Confluence pages and comments (personal project documentation)", + "CLICKUP_CONNECTOR": "ClickUp tasks and project data (personal task management)", + "GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events, meetings, and schedules (personal calendar)", + "GOOGLE_GMAIL_CONNECTOR": "Google Gmail emails and conversations (personal emails)", + "GOOGLE_DRIVE_FILE": "Google Drive files and documents (personal cloud storage)", + "DISCORD_CONNECTOR": "Discord server conversations and shared content (personal community)", + "AIRTABLE_CONNECTOR": "Airtable records, tables, and database content (personal data)", + "TAVILY_API": "Tavily web search API results (real-time web search)", + "SEARXNG_API": "SearxNG search API results (privacy-focused web search)", + "LINKUP_API": "Linkup search API results (web search)", + "BAIDU_SEARCH_API": "Baidu search API results (Chinese web search)", + "LUMA_CONNECTOR": "Luma events and meetings", + "WEBCRAWLER_CONNECTOR": "Webpages indexed by SurfSense (personally selected websites)", + "CRAWLED_URL": "Webpages indexed by 
SurfSense (personally selected websites)", + "BOOKSTACK_CONNECTOR": "BookStack pages (personal documentation)", + "CIRCLEBACK": "Circleback meeting notes, transcripts, and action items", + "OBSIDIAN_CONNECTOR": "Obsidian vault notes and markdown files (personal notes)", + # Composio connectors + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "Google Drive files via Composio (personal cloud storage)", + "COMPOSIO_GMAIL_CONNECTOR": "Gmail emails via Composio (personal emails)", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events via Composio (personal calendar)", +} -def _normalize_connectors(connectors_to_search: list[str] | None) -> list[str]: + +def _normalize_connectors( + connectors_to_search: list[str] | None, + available_connectors: list[str] | None = None, +) -> list[str]: """ Normalize connectors provided by the model. - Accepts user-facing enums like WEBCRAWLER_CONNECTOR and maps them to canonical ConnectorService types. - Drops unknown values. - - If None/empty, defaults to searching across all known connectors. + - If available_connectors is provided, only includes connectors from that list. + - If connectors_to_search is None/empty, defaults to available_connectors or all. + + Args: + connectors_to_search: List of connectors requested by the model + available_connectors: List of connectors actually available in the search space + + Returns: + List of normalized connector strings to search """ + # Determine the set of valid connectors to consider + valid_set = ( + set(available_connectors) if available_connectors else set(_ALL_CONNECTORS) + ) + if not connectors_to_search: - return list(_ALL_CONNECTORS) + # Search all available connectors if none specified + return ( + list(available_connectors) + if available_connectors + else list(_ALL_CONNECTORS) + ) normalized: list[str] = [] for raw in connectors_to_search: c = (raw or "").strip().upper() if not c: continue + # Map user-facing aliases to canonical names if c == "WEBCRAWLER_CONNECTOR": c = "CRAWLED_URL" normalized.append(c) - # de-dupe while preserving order + filter unknown + # de-dupe while preserving order + filter to valid connectors seen: set[str] = set() out: list[str] = [] for c in normalized: if c in seen: continue + # Only include if it's a known connector AND available if c not in _ALL_CONNECTORS: continue + if c not in valid_set: + continue seen.add(c) out.append(c) - return out if out else list(_ALL_CONNECTORS) + + # Fallback to all available if nothing matched + return ( + out + if out + else ( + list(available_connectors) + if available_connectors + else list(_ALL_CONNECTORS) + ) + ) # ============================================================================= @@ -233,6 +311,7 @@ async def search_knowledge_base_async( top_k: int = 10, start_date: datetime | None = None, end_date: datetime | None = None, + available_connectors: list[str] | None = None, ) -> str: """ Search the user's knowledge base for relevant documents. @@ -248,6 +327,8 @@ async def search_knowledge_base_async( top_k: Number of results per connector start_date: Optional start datetime (UTC) for filtering documents end_date: Optional end datetime (UTC) for filtering documents + available_connectors: Optional list of connectors actually available in the search space. + If provided, only these connectors will be searched. 
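+            Unknown or unavailable values are dropped during normalization.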
Returns: Formatted string with search results @@ -262,7 +343,7 @@ async def search_knowledge_base_async( end_date=end_date, ) - connectors = _normalize_connectors(connectors_to_search) + connectors = _normalize_connectors(connectors_to_search, available_connectors) for connector in connectors: try: @@ -316,6 +397,16 @@ async def search_knowledge_base_async( ) all_documents.extend(chunks) + elif connector == "TEAMS_CONNECTOR": + _, chunks = await connector_service.search_teams( + user_query=query, + search_space_id=search_space_id, + top_k=top_k, + start_date=resolved_start_date, + end_date=resolved_end_date, + ) + all_documents.extend(chunks) + elif connector == "NOTION_CONNECTOR": _, chunks = await connector_service.search_notion( user_query=query, @@ -519,6 +610,39 @@ async def search_knowledge_base_async( ) all_documents.extend(chunks) + # ========================================================= + # Composio Connectors + # ========================================================= + elif connector == "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": + _, chunks = await connector_service.search_composio_google_drive( + user_query=query, + search_space_id=search_space_id, + top_k=top_k, + start_date=resolved_start_date, + end_date=resolved_end_date, + ) + all_documents.extend(chunks) + + elif connector == "COMPOSIO_GMAIL_CONNECTOR": + _, chunks = await connector_service.search_composio_gmail( + user_query=query, + search_space_id=search_space_id, + top_k=top_k, + start_date=resolved_start_date, + end_date=resolved_end_date, + ) + all_documents.extend(chunks) + + elif connector == "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": + _, chunks = await connector_service.search_composio_google_calendar( + user_query=query, + search_space_id=search_space_id, + top_k=top_k, + start_date=resolved_start_date, + end_date=resolved_end_date, + ) + all_documents.extend(chunks) + except Exception as e: print(f"Error searching connector {connector}: {e}") continue @@ -543,11 +667,68 @@ async def search_knowledge_base_async( return format_documents_for_context(deduplicated) +def _build_connector_docstring(available_connectors: list[str] | None) -> str: + """ + Build the connector documentation section for the tool docstring. + + Args: + available_connectors: List of available connector types, or None for all + + Returns: + Formatted docstring section listing available connectors + """ + connectors = available_connectors if available_connectors else list(_ALL_CONNECTORS) + + lines = [] + for connector in connectors: + # Skip internal names, prefer user-facing aliases + if connector == "CRAWLED_URL": + # Show as WEBCRAWLER_CONNECTOR for user-facing docs + description = CONNECTOR_DESCRIPTIONS.get(connector, connector) + lines.append(f"- WEBCRAWLER_CONNECTOR: {description}") + else: + description = CONNECTOR_DESCRIPTIONS.get(connector, connector) + lines.append(f"- {connector}: {description}") + + return "\n".join(lines) + + +# ============================================================================= +# Tool Input Schema +# ============================================================================= + + +class SearchKnowledgeBaseInput(BaseModel): + """Input schema for the search_knowledge_base tool.""" + + query: str = Field( + description="The search query - be specific and include key terms" + ) + top_k: int = Field( + default=10, + description="Number of results to retrieve (default: 10)", + ) + start_date: str | None = Field( + default=None, + description="Optional ISO date/datetime (e.g. 
'2025-12-12' or '2025-12-12T00:00:00+00:00')", + ) + end_date: str | None = Field( + default=None, + description="Optional ISO date/datetime (e.g. '2025-12-19' or '2025-12-19T23:59:59+00:00')", + ) + connectors_to_search: list[str] | None = Field( + default=None, + description="Optional list of connector enums to search. If omitted, searches all available.", + ) + + def create_search_knowledge_base_tool( search_space_id: int, db_session: AsyncSession, connector_service: ConnectorService, -): + available_connectors: list[str] | None = None, + available_document_types: list[str] | None = None, +) -> StructuredTool: """ Factory function to create the search_knowledge_base tool with injected dependencies. @@ -555,72 +736,57 @@ def create_search_knowledge_base_tool( search_space_id: The user's search space ID db_session: Database session connector_service: Initialized connector service + available_connectors: Optional list of connector types available in the search space. + Used to dynamically generate the tool docstring. + available_document_types: Optional list of document types that have data in the search space. + Used to inform the LLM about what data exists. Returns: - A configured tool function + A configured StructuredTool instance """ + # Build connector documentation dynamically + connector_docs = _build_connector_docstring(available_connectors) - @tool - async def search_knowledge_base( + # Build context about available document types + doc_types_info = "" + if available_document_types: + doc_types_info = f""" + +## Document types with indexed content in this search space + +The following document types have content available for search: +{", ".join(available_document_types)} + +Focus searches on these types for best results.""" + + # Build the dynamic description for the tool + # This is what the LLM sees when deciding whether/how to use the tool + dynamic_description = f"""Search the user's personal knowledge base for relevant information. + +Use this tool to find documents, notes, files, web pages, and other content that may help answer the user's question. + +IMPORTANT: +- If the user requests a specific source type (e.g. "my notes", "Slack messages"), pass `connectors_to_search=[...]` using the enums below. +- If `connectors_to_search` is omitted/empty, the system will search broadly. +- Only connectors that are enabled/configured for this search space are available.{doc_types_info} + +## Available connector enums for `connectors_to_search` + +{connector_docs} + +NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`.""" + + # Capture for closure + _available_connectors = available_connectors + + async def _search_knowledge_base_impl( query: str, top_k: int = 10, start_date: str | None = None, end_date: str | None = None, connectors_to_search: list[str] | None = None, ) -> str: - """ - Search the user's personal knowledge base for relevant information. - - Use this tool to find documents, notes, files, web pages, and other content - that may help answer the user's question. - - IMPORTANT: - - If the user requests a specific source type (e.g. "my notes", "Slack messages"), - pass `connectors_to_search=[...]` using the enums below. - - If `connectors_to_search` is omitted/empty, the system will search broadly. 
- - ## Available connector enums for `connectors_to_search` - - - EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history) - - FILE: "User-uploaded documents (PDFs, Word, etc.)" (personal files) - - NOTE: "SurfSense Notes" (notes created inside SurfSense) - - SLACK_CONNECTOR: "Slack conversations and shared content" (personal workspace communications) - - TEAMS_CONNECTOR: "Microsoft Teams messages and conversations" (personal Teams communications) - - NOTION_CONNECTOR: "Notion workspace pages and databases" (personal knowledge management) - - YOUTUBE_VIDEO: "YouTube video transcripts and metadata" (personally saved videos) - - GITHUB_CONNECTOR: "GitHub repository content and issues" (personal repositories and interactions) - - ELASTICSEARCH_CONNECTOR: "Elasticsearch indexed documents and data" (personal Elasticsearch instances and custom data sources) - - LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project management) - - JIRA_CONNECTOR: "Jira project issues, tickets, and comments" (personal project tracking) - - CONFLUENCE_CONNECTOR: "Confluence pages and comments" (personal project documentation) - - CLICKUP_CONNECTOR: "ClickUp tasks and project data" (personal task management) - - GOOGLE_CALENDAR_CONNECTOR: "Google Calendar events, meetings, and schedules" (personal calendar and time management) - - GOOGLE_GMAIL_CONNECTOR: "Google Gmail emails and conversations" (personal emails and communications) - - GOOGLE_DRIVE_FILE: "Google Drive files and documents" (personal cloud storage and file management) - - DISCORD_CONNECTOR: "Discord server conversations and shared content" (personal community communications) - - AIRTABLE_CONNECTOR: "Airtable records, tables, and database content" (personal data management and organization) - - TAVILY_API: "Tavily search API results" (personalized search results) - - SEARXNG_API: "SearxNG search API results" (personalized search results) - - LINKUP_API: "Linkup search API results" (personalized search results) - - BAIDU_SEARCH_API: "Baidu search API results" (personalized search results) - - LUMA_CONNECTOR: "Luma events" - - WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense" (personally selected websites) - - BOOKSTACK_CONNECTOR: "BookStack pages" (personal documentation) - - CIRCLEBACK: "Circleback meeting notes, transcripts, and action items" (personal meeting records) - - OBSIDIAN_CONNECTOR: "Obsidian vault notes and markdown files" (personal notes and knowledge management) - - NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`. - - Args: - query: The search query - be specific and include key terms - top_k: Number of results to retrieve (default: 10) - start_date: Optional ISO date/datetime (e.g. "2025-12-12" or "2025-12-12T00:00:00+00:00") - end_date: Optional ISO date/datetime (e.g. "2025-12-19" or "2025-12-19T23:59:59+00:00") - connectors_to_search: Optional list of connector enums to search. If omitted, searches all. 
- - Returns: - Formatted string with relevant documents and their content - """ + """Implementation function for knowledge base search.""" from app.agents.new_chat.utils import parse_date_or_datetime parsed_start: datetime | None = None @@ -640,6 +806,16 @@ def create_search_knowledge_base_tool( top_k=top_k, start_date=parsed_start, end_date=parsed_end, + available_connectors=_available_connectors, ) - return search_knowledge_base + # Create StructuredTool with dynamic description + # This properly sets the description that the LLM sees + tool = StructuredTool( + name="search_knowledge_base", + description=dynamic_description, + coroutine=_search_knowledge_base_impl, + args_schema=SearchKnowledgeBaseInput, + ) + + return tool diff --git a/surfsense_backend/app/agents/new_chat/tools/registry.py b/surfsense_backend/app/agents/new_chat/tools/registry.py index e4ce7a6b7..968e51445 100644 --- a/surfsense_backend/app/agents/new_chat/tools/registry.py +++ b/surfsense_backend/app/agents/new_chat/tools/registry.py @@ -85,6 +85,7 @@ class ToolDefinition: # Contributors: Add your new tools here! BUILTIN_TOOLS: list[ToolDefinition] = [ # Core tool - searches the user's knowledge base + # Now supports dynamic connector/document type discovery ToolDefinition( name="search_knowledge_base", description="Search the user's personal knowledge base for relevant information", @@ -92,8 +93,12 @@ BUILTIN_TOOLS: list[ToolDefinition] = [ search_space_id=deps["search_space_id"], db_session=deps["db_session"], connector_service=deps["connector_service"], + # Optional: dynamically discovered connectors/document types + available_connectors=deps.get("available_connectors"), + available_document_types=deps.get("available_document_types"), ), requires=["search_space_id", "db_session", "connector_service"], + # Note: available_connectors and available_document_types are optional ), # Podcast generation tool ToolDefinition( diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py index dc43697e7..4c5599815 100644 --- a/surfsense_backend/app/services/connector_service.py +++ b/surfsense_backend/app/services/connector_service.py @@ -2871,3 +2871,350 @@ class ConnectorService: } return result_object, obsidian_docs + + # ========================================================================= + # Composio Connector Search Methods + # ========================================================================= + + async def search_composio_google_drive( + self, + user_query: str, + search_space_id: int, + top_k: int = 20, + start_date: datetime | None = None, + end_date: datetime | None = None, + ) -> tuple: + """ + Search for Composio Google Drive files and return both the source information + and langchain documents. + + Uses combined chunk-level and document-level hybrid search with RRF fusion. 
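+        Titles fall back to the file_name metadata field, and URLs to
+        web_view_link, when the primary fields are absent.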
+ + Args: + user_query: The user's query + search_space_id: The search space ID to search in + top_k: Maximum number of results to return + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at + + Returns: + tuple: (sources_info, langchain_documents) + """ + composio_drive_docs = await self._combined_rrf_search( + query_text=user_query, + search_space_id=search_space_id, + document_type="COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + top_k=top_k, + start_date=start_date, + end_date=end_date, + ) + + # Early return if no results + if not composio_drive_docs: + return { + "id": 54, + "name": "Google Drive (Composio)", + "type": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "sources": [], + }, [] + + def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return ( + doc_info.get("title") + or metadata.get("title") + or metadata.get("file_name") + or "Untitled Document" + ) + + def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return metadata.get("url") or metadata.get("web_view_link") or "" + + def _description_fn( + chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> str: + description = self._chunk_preview(chunk.get("content", ""), limit=200) + info_parts = [] + mime_type = metadata.get("mime_type") + modified_time = metadata.get("modified_time") + if mime_type: + info_parts.append(f"Type: {mime_type}") + if modified_time: + info_parts.append(f"Modified: {modified_time}") + if info_parts: + description = (description + " | " + " | ".join(info_parts)).strip(" |") + return description + + def _extra_fields_fn( + _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> dict[str, Any]: + return { + "mime_type": metadata.get("mime_type", ""), + "file_id": metadata.get("file_id", ""), + "modified_time": metadata.get("modified_time", ""), + } + + sources_list = self._build_chunk_sources_from_documents( + composio_drive_docs, + title_fn=_title_fn, + url_fn=_url_fn, + description_fn=_description_fn, + extra_fields_fn=_extra_fields_fn, + ) + + # Create result object + result_object = { + "id": 54, + "name": "Google Drive (Composio)", + "type": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "sources": sources_list, + } + + return result_object, composio_drive_docs + + async def search_composio_gmail( + self, + user_query: str, + search_space_id: int, + top_k: int = 20, + start_date: datetime | None = None, + end_date: datetime | None = None, + ) -> tuple: + """ + Search for Composio Gmail messages and return both the source information + and langchain documents. + + Uses combined chunk-level and document-level hybrid search with RRF fusion. 
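+        Email subjects serve as titles when available; sender and date
+        metadata are appended to each source description.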
+ + Args: + user_query: The user's query + search_space_id: The search space ID to search in + top_k: Maximum number of results to return + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at + + Returns: + tuple: (sources_info, langchain_documents) + """ + composio_gmail_docs = await self._combined_rrf_search( + query_text=user_query, + search_space_id=search_space_id, + document_type="COMPOSIO_GMAIL_CONNECTOR", + top_k=top_k, + start_date=start_date, + end_date=end_date, + ) + + # Early return if no results + if not composio_gmail_docs: + return { + "id": 55, + "name": "Gmail (Composio)", + "type": "COMPOSIO_GMAIL_CONNECTOR", + "sources": [], + }, [] + + def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return ( + doc_info.get("title") + or metadata.get("subject") + or metadata.get("title") + or "Untitled Email" + ) + + def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return metadata.get("url") or "" + + def _description_fn( + chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> str: + description = self._chunk_preview(chunk.get("content", ""), limit=200) + info_parts = [] + sender = metadata.get("from") or metadata.get("sender") + date = metadata.get("date") or metadata.get("received_at") + if sender: + info_parts.append(f"From: {sender}") + if date: + info_parts.append(f"Date: {date}") + if info_parts: + description = (description + " | " + " | ".join(info_parts)).strip(" |") + return description + + def _extra_fields_fn( + _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> dict[str, Any]: + return { + "message_id": metadata.get("message_id", ""), + "thread_id": metadata.get("thread_id", ""), + "from": metadata.get("from", ""), + "to": metadata.get("to", ""), + "date": metadata.get("date", ""), + } + + sources_list = self._build_chunk_sources_from_documents( + composio_gmail_docs, + title_fn=_title_fn, + url_fn=_url_fn, + description_fn=_description_fn, + extra_fields_fn=_extra_fields_fn, + ) + + # Create result object + result_object = { + "id": 55, + "name": "Gmail (Composio)", + "type": "COMPOSIO_GMAIL_CONNECTOR", + "sources": sources_list, + } + + return result_object, composio_gmail_docs + + async def search_composio_google_calendar( + self, + user_query: str, + search_space_id: int, + top_k: int = 20, + start_date: datetime | None = None, + end_date: datetime | None = None, + ) -> tuple: + """ + Search for Composio Google Calendar events and return both the source information + and langchain documents. + + Uses combined chunk-level and document-level hybrid search with RRF fusion. 
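+        Event summaries serve as titles when available; start and end times
+        are appended to each source description.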
+ + Args: + user_query: The user's query + search_space_id: The search space ID to search in + top_k: Maximum number of results to return + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at + + Returns: + tuple: (sources_info, langchain_documents) + """ + composio_calendar_docs = await self._combined_rrf_search( + query_text=user_query, + search_space_id=search_space_id, + document_type="COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", + top_k=top_k, + start_date=start_date, + end_date=end_date, + ) + + # Early return if no results + if not composio_calendar_docs: + return { + "id": 56, + "name": "Google Calendar (Composio)", + "type": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", + "sources": [], + }, [] + + def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return ( + doc_info.get("title") + or metadata.get("summary") + or metadata.get("title") + or "Untitled Event" + ) + + def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return metadata.get("url") or metadata.get("html_link") or "" + + def _description_fn( + chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> str: + description = self._chunk_preview(chunk.get("content", ""), limit=200) + info_parts = [] + start_time = metadata.get("start_time") or metadata.get("start") + end_time = metadata.get("end_time") or metadata.get("end") + if start_time: + info_parts.append(f"Start: {start_time}") + if end_time: + info_parts.append(f"End: {end_time}") + if info_parts: + description = (description + " | " + " | ".join(info_parts)).strip(" |") + return description + + def _extra_fields_fn( + _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> dict[str, Any]: + return { + "event_id": metadata.get("event_id", ""), + "calendar_id": metadata.get("calendar_id", ""), + "start_time": metadata.get("start_time", ""), + "end_time": metadata.get("end_time", ""), + "location": metadata.get("location", ""), + } + + sources_list = self._build_chunk_sources_from_documents( + composio_calendar_docs, + title_fn=_title_fn, + url_fn=_url_fn, + description_fn=_description_fn, + extra_fields_fn=_extra_fields_fn, + ) + + # Create result object + result_object = { + "id": 56, + "name": "Google Calendar (Composio)", + "type": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", + "sources": sources_list, + } + + return result_object, composio_calendar_docs + + # ========================================================================= + # Utility Methods for Connector Discovery + # ========================================================================= + + async def get_available_connectors( + self, + search_space_id: int, + ) -> list[SearchSourceConnectorType]: + """ + Get all available (enabled) connector types for a search space. + + Args: + search_space_id: The search space ID + + Returns: + List of SearchSourceConnectorType enums for enabled connectors + """ + query = ( + select(SearchSourceConnector.connector_type) + .filter( + SearchSourceConnector.search_space_id == search_space_id, + ) + .distinct() + ) + + result = await self.session.execute(query) + connector_types = result.scalars().all() + return list(connector_types) + + async def get_available_document_types( + self, + search_space_id: int, + ) -> list[str]: + """ + Get all document types that have at least one document in the search space. 
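+        Backed by a SELECT DISTINCT over Document.document_type, so only
+        types that actually have indexed documents are returned.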
+
+        Args:
+            search_space_id: The search space ID
+
+        Returns:
+            List of document type strings that have documents indexed
+        """
+        from sqlalchemy import distinct
+
+        from app.db import Document
+
+        query = select(distinct(Document.document_type)).filter(
+            Document.search_space_id == search_space_id,
+        )
+
+        result = await self.session.execute(query)
+        doc_types = result.scalars().all()
+        return [str(dt) for dt in doc_types]
diff --git a/surfsense_backend/app/tasks/chat/stream_new_chat.py b/surfsense_backend/app/tasks/chat/stream_new_chat.py
index a49c244eb..8dfff4895 100644
--- a/surfsense_backend/app/tasks/chat/stream_new_chat.py
+++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py
@@ -54,21 +54,64 @@ def format_attachments_as_context(attachments: list[ChatAttachment]) -> str:
 
 def format_mentioned_documents_as_context(documents: list[Document]) -> str:
-    """Format mentioned documents as context for the agent."""
+    """
+    Format mentioned documents as context for the agent.
+
+    Uses the same XML structure as knowledge_base.format_documents_for_context
+    to ensure citations work properly with chunk IDs.
+    """
     if not documents:
         return ""
 
     context_parts = ["<mentioned_documents>"]
     context_parts.append(
         "The user has explicitly mentioned the following documents from their knowledge base. "
-        "These documents are directly relevant to the query and should be prioritized as primary sources."
+        "These documents are directly relevant to the query and should be prioritized as primary sources. "
+        "Use [citation:CHUNK_ID] format for citations (e.g., [citation:123])."
     )
-    for i, doc in enumerate(documents, 1):
-        context_parts.append(
-            f""
+    context_parts.append("<documents>")
+
+    for doc in documents:
+        # Build metadata JSON
+        metadata = doc.document_metadata or {}
+        metadata_json = json.dumps(metadata, ensure_ascii=False)
+
+        # Get URL from metadata
+        url = (
+            metadata.get("url")
+            or metadata.get("source")
+            or metadata.get("page_url")
+            or ""
         )
-        context_parts.append(f"")
+
+        context_parts.append("<document>")
+        context_parts.append("<metadata>")
+        context_parts.append(f"  <document_id>{doc.id}</document_id>")
+        context_parts.append(f"  <document_type>{doc.document_type.value}</document_type>")
+        context_parts.append(f"  <title><![CDATA[{doc.title}]]></title>")
+        context_parts.append(f"  <url><![CDATA[{url}]]></url>")
+        context_parts.append(f"  <metadata_json><![CDATA[{metadata_json}]]></metadata_json>")
+        context_parts.append("</metadata>")
+        context_parts.append("<content>")
+        context_parts.append("<chunks>")
+
+        # Use chunks if available (preferred for proper citations)
+        if hasattr(doc, "chunks") and doc.chunks:
+            for chunk in doc.chunks:
+                context_parts.append(
+                    f'<chunk id="{chunk.id}"><![CDATA[{chunk.content}]]></chunk>'
+                )
+        else:
+            # Fallback to document content if chunks not loaded
+            # Use document ID as chunk ID prefix for consistency
+            context_parts.append(
+                f'<chunk id="doc_{doc.id}"><![CDATA[{doc.content}]]></chunk>'
+            )
+
+        context_parts.append("</chunks>")
+        context_parts.append("</content>")
+        context_parts.append("</document>")
+
+    context_parts.append("</documents>")
     context_parts.append("</mentioned_documents>")
 
     return "\n".join(context_parts)
 
@@ -81,8 +124,6 @@ def format_mentioned_surfsense_docs_as_context(
     if not documents:
         return ""
 
-    import json
-
     context_parts = ["<mentioned_surfsense_docs>"]
     context_parts.append(
         "The user has explicitly mentioned the following SurfSense documentation pages. 
" @@ -262,11 +303,15 @@ async def stream_new_chat( # Build input with message history from frontend langchain_messages = [] - # Fetch mentioned documents if any + # Fetch mentioned documents if any (with chunks for proper citations) mentioned_documents: list[Document] = [] if mentioned_document_ids: + from sqlalchemy.orm import selectinload as doc_selectinload + result = await session.execute( - select(Document).filter( + select(Document) + .options(doc_selectinload(Document.chunks)) + .filter( Document.id.in_(mentioned_document_ids), Document.search_space_id == search_space_id, ) diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml index ffe9e5232..57dbdc7b5 100644 --- a/surfsense_backend/pyproject.toml +++ b/surfsense_backend/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "surf-new-backend" -version = "0.0.11" +version = "0.0.12" description = "SurfSense Backend" requires-python = ">=3.12" dependencies = [ diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock index 18f04288e..16b77a7b2 100644 --- a/surfsense_backend/uv.lock +++ b/surfsense_backend/uv.lock @@ -6545,7 +6545,7 @@ wheels = [ [[package]] name = "surf-new-backend" -version = "0.0.11" +version = "0.0.12" source = { editable = "." } dependencies = [ { name = "alembic" }, diff --git a/surfsense_browser_extension/package.json b/surfsense_browser_extension/package.json index b225bc206..bf926d09f 100644 --- a/surfsense_browser_extension/package.json +++ b/surfsense_browser_extension/package.json @@ -1,7 +1,7 @@ { "name": "surfsense_browser_extension", "displayName": "Surfsense Browser Extension", - "version": "0.0.11", + "version": "0.0.12", "description": "Extension to collect Browsing History for SurfSense.", "author": "https://github.com/MODSetter", "engines": { diff --git a/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json b/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json index b729c3f8b..2c1010b1c 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json +++ b/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json @@ -24,6 +24,16 @@ "enabled": true, "status": "warning", "statusMessage": "Some requests may be blocked if not using Firecrawl." + }, + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": { + "enabled": false, + "status": "disabled", + "statusMessage": "Not available yet." + }, + "GITHUB_CONNECTOR": { + "enabled": false, + "status": "warning", + "statusMessage": "Some issues with indexing repositories." 
} }, "globalSettings": { diff --git a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx index 86d7082ee..702014050 100644 --- a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx +++ b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx @@ -1,13 +1,14 @@ "use client"; import { useQuery, useQueryClient } from "@tanstack/react-query"; -import { useAtomValue } from "jotai"; +import { useAtomValue, useSetAtom } from "jotai"; import { Inbox, LogOut, SquareLibrary, Trash2 } from "lucide-react"; import { useParams, usePathname, useRouter } from "next/navigation"; import { useTranslations } from "next-intl"; import { useTheme } from "next-themes"; -import { useCallback, useMemo, useState } from "react"; +import { useCallback, useEffect, useMemo, useState } from "react"; import { toast } from "sonner"; +import { currentThreadAtom, resetCurrentThreadAtom } from "@/atoms/chat/current-thread.atom"; import { deleteSearchSpaceMutationAtom } from "@/atoms/search-spaces/search-space-mutation.atoms"; import { searchSpacesAtom } from "@/atoms/search-spaces/search-space-query.atoms"; import { currentUserAtom } from "@/atoms/user/user-query.atoms"; @@ -68,11 +69,16 @@ export function LayoutDataProvider({ const { data: user } = useAtomValue(currentUserAtom); const { data: searchSpacesData, refetch: refetchSearchSpaces } = useAtomValue(searchSpacesAtom); const { mutateAsync: deleteSearchSpace } = useAtomValue(deleteSearchSpaceMutationAtom); + const currentThreadState = useAtomValue(currentThreadAtom); + const resetCurrentThread = useSetAtom(resetCurrentThreadAtom); - // Current IDs from URL + // State for handling new chat navigation when router is out of sync + const [pendingNewChat, setPendingNewChat] = useState(false); + + // Current IDs from URL, with fallback to atom for replaceState updates const currentChatId = params?.chat_id ? Number(Array.isArray(params.chat_id) ? params.chat_id[0] : params.chat_id) - : null; + : currentThreadState.id; // Fetch current search space (for caching purposes) useQuery({ @@ -124,6 +130,17 @@ export function LayoutDataProvider({ const [isDeletingSearchSpace, setIsDeletingSearchSpace] = useState(false); const [isLeavingSearchSpace, setIsLeavingSearchSpace] = useState(false); + // Effect to complete new chat navigation after router syncs + // This runs when handleNewChat detected an out-of-sync state and triggered a sync + useEffect(() => { + if (pendingNewChat && params?.chat_id) { + // Router is now synced (chat_id is in params), complete navigation to new-chat + resetCurrentThread(); + router.push(`/dashboard/${searchSpaceId}/new-chat`); + setPendingNewChat(false); + } + }, [pendingNewChat, params?.chat_id, router, searchSpaceId, resetCurrentThread]); + const searchSpaces: SearchSpace[] = useMemo(() => { if (!searchSpacesData || !Array.isArray(searchSpacesData)) return []; return searchSpacesData.map((space) => ({ @@ -175,12 +192,6 @@ export function LayoutDataProvider({ // Navigation items const navItems: NavItem[] = useMemo( () => [ - { - title: "Documents", - url: `/dashboard/${searchSpaceId}/documents`, - icon: SquareLibrary, - isActive: pathname?.includes("/documents"), - }, { title: "Inbox", url: "#inbox", // Special URL to indicate this is handled differently @@ -188,6 +199,12 @@ export function LayoutDataProvider({ isActive: isInboxSidebarOpen, badge: unreadCount > 0 ? 
formatInboxCount(unreadCount) : undefined, }, + { + title: "Documents", + url: `/dashboard/${searchSpaceId}/documents`, + icon: SquareLibrary, + isActive: pathname?.includes("/documents"), + }, ], [searchSpaceId, pathname, isInboxSidebarOpen, unreadCount] ); @@ -292,8 +309,20 @@ export function LayoutDataProvider({ ); const handleNewChat = useCallback(() => { - router.push(`/dashboard/${searchSpaceId}/new-chat`); - }, [router, searchSpaceId]); + // Check if router is out of sync (thread created via replaceState but params don't have chat_id) + const isOutOfSync = currentThreadState.id !== null && !params?.chat_id; + + if (isOutOfSync) { + // First sync Next.js router by navigating to the current chat's actual URL + // This updates the router's internal state to match the browser URL + router.replace(`/dashboard/${searchSpaceId}/new-chat/${currentThreadState.id}`); + // Set flag to trigger navigation to new-chat after params update + setPendingNewChat(true); + } else { + // Normal navigation - router is in sync + router.push(`/dashboard/${searchSpaceId}/new-chat`); + } + }, [router, searchSpaceId, currentThreadState.id, params?.chat_id]); const handleChatSelect = useCallback( (chat: ChatItem) => { diff --git a/surfsense_web/package.json b/surfsense_web/package.json index 7ec05c95d..235f4b9db 100644 --- a/surfsense_web/package.json +++ b/surfsense_web/package.json @@ -1,6 +1,6 @@ { "name": "surfsense_web", - "version": "0.0.11", + "version": "0.0.12", "private": true, "description": "SurfSense Frontend", "scripts": {