diff --git a/README.md b/README.md index 7f50b924c..4dd368c04 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,7 @@ SurfSense is a highly customizable AI research agent, connected to external sour # Video -https://github.com/user-attachments/assets/42a29ea1-d4d8-4213-9c69-972b5b806d58 - +https://github.com/user-attachments/assets/cc0c84d3-1f2f-4f7a-b519-2ecce22310b1 ## Podcast Sample @@ -52,8 +51,10 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7 - Interact in Natural Language and get cited answers. ### 📄 **Cited Answers** - Get Cited answers just like Perplexity. +### 🧩 **Universal Compatibility** +- Connect virtually any inference provider via the OpenAI spec and LiteLLM. ### 🔔 **Privacy & Local LLM Support** -- Works Flawlessly with Ollama local LLMs. +- Works Flawlessly with local LLMs like vLLM and Ollama. ### 🏠 **Self Hostable** - Open source and easy to deploy locally. ### 👥 **Team Collaboration with RBAC** @@ -61,6 +62,7 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7 - Invite team members with customizable roles (Owner, Admin, Editor, Viewer) - Granular permissions for documents, chats, connectors, and settings - Share knowledge bases securely within your organization +- Team chats update in real-time and "Chat about the chat" in comment threads ### 🎙️ Podcasts - Blazingly fast podcast generation agent. (Creates a 3-minute podcast in under 20 seconds.) - Convert your chat conversations into engaging audio content @@ -237,6 +239,8 @@ Before self-hosting installation, make sure to complete the [prerequisite setup ### **BackEnd** +- **LiteLLM**: Universal LLM integration supporting 100+ models (OpenAI, Anthropic, Ollama, etc.) + - **FastAPI**: Modern, fast web framework for building APIs with Python - **PostgreSQL with pgvector**: Database with vector search capabilities for similarity searches @@ -253,8 +257,6 @@ Before self-hosting installation, make sure to complete the [prerequisite setup - **LangChain**: Framework for developing AI-powered applications. -- **LiteLLM**: Universal LLM integration supporting 100+ models (OpenAI, Anthropic, Ollama, etc.) - - **Rerankers**: Advanced result ranking for improved search relevance - **Hybrid Search**: Combines vector similarity and full-text search for optimal results using Reciprocal Rank Fusion (RRF) diff --git a/surfsense_backend/alembic/versions/79_add_composio_connector_enums.py b/surfsense_backend/alembic/versions/79_add_composio_connector_enums.py new file mode 100644 index 000000000..0869fdcc3 --- /dev/null +++ b/surfsense_backend/alembic/versions/79_add_composio_connector_enums.py @@ -0,0 +1,95 @@ +"""Add Composio connector types to SearchSourceConnectorType and DocumentType enums + +Revision ID: 79 +Revises: 78 + +This migration adds the Composio connector enum values to both: +- searchsourceconnectortype (for connector type tracking) +- documenttype (for document type tracking) + +Composio is a managed OAuth integration service that allows connecting +to various third-party services (Google Drive, Gmail, Calendar, etc.) +without requiring separate OAuth app verification. + +This migration adds three specific connector types: +- COMPOSIO_GOOGLE_DRIVE_CONNECTOR +- COMPOSIO_GMAIL_CONNECTOR +- COMPOSIO_GOOGLE_CALENDAR_CONNECTOR +""" + +from collections.abc import Sequence + +from alembic import op + +# revision identifiers, used by Alembic. 
+revision: str = "79" +down_revision: str | None = "78" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + +# Define the ENUM type names and the new values +CONNECTOR_ENUM = "searchsourceconnectortype" +CONNECTOR_NEW_VALUES = [ + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "COMPOSIO_GMAIL_CONNECTOR", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", +] +DOCUMENT_ENUM = "documenttype" +DOCUMENT_NEW_VALUES = [ + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "COMPOSIO_GMAIL_CONNECTOR", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", +] + + +def upgrade() -> None: + """Upgrade schema - add Composio connector types to connector and document enums safely.""" + # Add each Composio connector type to searchsourceconnectortype only if not exists + for value in CONNECTOR_NEW_VALUES: + op.execute( + f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum e + JOIN pg_type t ON e.enumtypid = t.oid + WHERE t.typname = '{CONNECTOR_ENUM}' AND e.enumlabel = '{value}' + ) THEN + ALTER TYPE {CONNECTOR_ENUM} ADD VALUE '{value}'; + END IF; + END$$; + """ + ) + + # Add each Composio connector type to documenttype only if not exists + for value in DOCUMENT_NEW_VALUES: + op.execute( + f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum e + JOIN pg_type t ON e.enumtypid = t.oid + WHERE t.typname = '{DOCUMENT_ENUM}' AND e.enumlabel = '{value}' + ) THEN + ALTER TYPE {DOCUMENT_ENUM} ADD VALUE '{value}'; + END IF; + END$$; + """ + ) + + +def downgrade() -> None: + """Downgrade schema - remove Composio connector types from connector and document enums. + + Note: PostgreSQL does not support removing enum values directly. + To properly downgrade, you would need to: + 1. Delete any rows using the Composio connector type values + 2. Create new enums without the Composio connector types + 3. Alter the columns to use the new enums + 4. Drop the old enums + + This is left as a no-op since removing enum values is complex + and typically not needed in practice. + """ + pass diff --git a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py index 5bc6ac2e2..53e1b14bd 100644 --- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py +++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py @@ -7,6 +7,7 @@ via NewLLMConfig. """ from collections.abc import Sequence +from typing import Any from deepagents import create_deep_agent from langchain_core.tools import BaseTool @@ -23,6 +24,90 @@ from app.agents.new_chat.system_prompt import ( from app.agents.new_chat.tools.registry import build_tools_async from app.services.connector_service import ConnectorService +# ============================================================================= +# Connector Type Mapping +# ============================================================================= + +# Maps SearchSourceConnectorType enum values to the searchable document/connector types +# used by the knowledge_base tool. Some connectors map to different document types. 
+_CONNECTOR_TYPE_TO_SEARCHABLE: dict[str, str] = { + # Direct mappings (connector type == searchable type) + "TAVILY_API": "TAVILY_API", + "SEARXNG_API": "SEARXNG_API", + "LINKUP_API": "LINKUP_API", + "BAIDU_SEARCH_API": "BAIDU_SEARCH_API", + "SLACK_CONNECTOR": "SLACK_CONNECTOR", + "TEAMS_CONNECTOR": "TEAMS_CONNECTOR", + "NOTION_CONNECTOR": "NOTION_CONNECTOR", + "GITHUB_CONNECTOR": "GITHUB_CONNECTOR", + "LINEAR_CONNECTOR": "LINEAR_CONNECTOR", + "DISCORD_CONNECTOR": "DISCORD_CONNECTOR", + "JIRA_CONNECTOR": "JIRA_CONNECTOR", + "CONFLUENCE_CONNECTOR": "CONFLUENCE_CONNECTOR", + "CLICKUP_CONNECTOR": "CLICKUP_CONNECTOR", + "GOOGLE_CALENDAR_CONNECTOR": "GOOGLE_CALENDAR_CONNECTOR", + "GOOGLE_GMAIL_CONNECTOR": "GOOGLE_GMAIL_CONNECTOR", + "GOOGLE_DRIVE_CONNECTOR": "GOOGLE_DRIVE_FILE", # Connector type differs from document type + "AIRTABLE_CONNECTOR": "AIRTABLE_CONNECTOR", + "LUMA_CONNECTOR": "LUMA_CONNECTOR", + "ELASTICSEARCH_CONNECTOR": "ELASTICSEARCH_CONNECTOR", + "WEBCRAWLER_CONNECTOR": "CRAWLED_URL", # Maps to document type + "BOOKSTACK_CONNECTOR": "BOOKSTACK_CONNECTOR", + "CIRCLEBACK_CONNECTOR": "CIRCLEBACK", # Connector type differs from document type + "OBSIDIAN_CONNECTOR": "OBSIDIAN_CONNECTOR", + # Composio connectors + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "COMPOSIO_GMAIL_CONNECTOR": "COMPOSIO_GMAIL_CONNECTOR", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", +} + +# Document types that don't come from SearchSourceConnector but should always be searchable +_ALWAYS_AVAILABLE_DOC_TYPES: list[str] = [ + "EXTENSION", # Browser extension data + "FILE", # Uploaded files + "NOTE", # User notes + "YOUTUBE_VIDEO", # YouTube videos +] + + +def _map_connectors_to_searchable_types( + connector_types: list[Any], +) -> list[str]: + """ + Map SearchSourceConnectorType enums to searchable document/connector types. + + This function: + 1. Converts connector type enums to their searchable counterparts + 2. Includes always-available document types (EXTENSION, FILE, NOTE, YOUTUBE_VIDEO) + 3. 
Deduplicates while preserving order + + Args: + connector_types: List of SearchSourceConnectorType enum values + + Returns: + List of searchable connector/document type strings + """ + result_set: set[str] = set() + result_list: list[str] = [] + + # Add always-available document types first + for doc_type in _ALWAYS_AVAILABLE_DOC_TYPES: + if doc_type not in result_set: + result_set.add(doc_type) + result_list.append(doc_type) + + # Map each connector type to its searchable equivalent + for ct in connector_types: + # Handle both enum and string types + ct_str = ct.value if hasattr(ct, "value") else str(ct) + searchable = _CONNECTOR_TYPE_TO_SEARCHABLE.get(ct_str) + if searchable and searchable not in result_set: + result_set.add(searchable) + result_list.append(searchable) + + return result_list + + # ============================================================================= # Deep Agent Factory # ============================================================================= @@ -116,6 +201,30 @@ async def create_surfsense_deep_agent( additional_tools=[my_custom_tool] ) """ + # Discover available connectors and document types for this search space + # This enables dynamic tool docstrings that inform the LLM about what's actually available + available_connectors: list[str] | None = None + available_document_types: list[str] | None = None + + try: + # Get enabled search source connectors for this search space + connector_types = await connector_service.get_available_connectors( + search_space_id + ) + if connector_types: + # Convert enum values to strings and also include mapped document types + available_connectors = _map_connectors_to_searchable_types(connector_types) + + # Get document types that have at least one document indexed + available_document_types = await connector_service.get_available_document_types( + search_space_id + ) + except Exception as e: + # Log but don't fail - fall back to all connectors if discovery fails + import logging + + logging.warning(f"Failed to discover available connectors/document types: {e}") + # Build dependencies dict for the tools registry dependencies = { "search_space_id": search_space_id, @@ -123,6 +232,9 @@ async def create_surfsense_deep_agent( "connector_service": connector_service, "firecrawl_api_key": firecrawl_api_key, "user_id": user_id, # Required for memory tools + # Dynamic connector/document type discovery for knowledge base tool + "available_connectors": available_connectors, + "available_document_types": available_document_types, } # Build tools using the async registry (includes MCP tools) diff --git a/surfsense_backend/app/agents/new_chat/tools/__init__.py b/surfsense_backend/app/agents/new_chat/tools/__init__.py index acbdbcb3a..9e1a4f19c 100644 --- a/surfsense_backend/app/agents/new_chat/tools/__init__.py +++ b/surfsense_backend/app/agents/new_chat/tools/__init__.py @@ -19,6 +19,7 @@ Available tools: # Tool factory exports (for direct use) from .display_image import create_display_image_tool from .knowledge_base import ( + CONNECTOR_DESCRIPTIONS, create_search_knowledge_base_tool, format_documents_for_context, search_knowledge_base_async, @@ -40,6 +41,8 @@ from .user_memory import create_recall_memory_tool, create_save_memory_tool __all__ = [ # Registry "BUILTIN_TOOLS", + # Knowledge base utilities + "CONNECTOR_DESCRIPTIONS", "ToolDefinition", "build_tools", # Tool factories @@ -51,7 +54,6 @@ __all__ = [ "create_scrape_webpage_tool", "create_search_knowledge_base_tool", "create_search_surfsense_docs_tool", - # Knowledge base 
utilities "format_documents_for_context", "get_all_tool_names", "get_default_enabled_tools", diff --git a/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py b/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py index 552019dda..a11e4ac38 100644 --- a/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py +++ b/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py @@ -12,7 +12,8 @@ import json from datetime import datetime from typing import Any -from langchain_core.tools import tool +from langchain_core.tools import StructuredTool +from pydantic import BaseModel, Field from sqlalchemy.ext.asyncio import AsyncSession from app.services.connector_service import ConnectorService @@ -22,6 +23,7 @@ from app.services.connector_service import ConnectorService # ============================================================================= # Canonical connector values used internally by ConnectorService +# Includes all document types and search source connectors _ALL_CONNECTORS: list[str] = [ "EXTENSION", "FILE", @@ -50,41 +52,117 @@ _ALL_CONNECTORS: list[str] = [ "CRAWLED_URL", "CIRCLEBACK", "OBSIDIAN_CONNECTOR", + # Composio connectors + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "COMPOSIO_GMAIL_CONNECTOR", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", ] +# Human-readable descriptions for each connector type +# Used for generating dynamic docstrings and informing the LLM +CONNECTOR_DESCRIPTIONS: dict[str, str] = { + "EXTENSION": "Web content saved via SurfSense browser extension (personal browsing history)", + "FILE": "User-uploaded documents (PDFs, Word, etc.) (personal files)", + "NOTE": "SurfSense Notes (notes created inside SurfSense)", + "SLACK_CONNECTOR": "Slack conversations and shared content (personal workspace communications)", + "TEAMS_CONNECTOR": "Microsoft Teams messages and conversations (personal Teams communications)", + "NOTION_CONNECTOR": "Notion workspace pages and databases (personal knowledge management)", + "YOUTUBE_VIDEO": "YouTube video transcripts and metadata (personally saved videos)", + "GITHUB_CONNECTOR": "GitHub repository content and issues (personal repositories and interactions)", + "ELASTICSEARCH_CONNECTOR": "Elasticsearch indexed documents and data (personal Elasticsearch instances)", + "LINEAR_CONNECTOR": "Linear project issues and discussions (personal project management)", + "JIRA_CONNECTOR": "Jira project issues, tickets, and comments (personal project tracking)", + "CONFLUENCE_CONNECTOR": "Confluence pages and comments (personal project documentation)", + "CLICKUP_CONNECTOR": "ClickUp tasks and project data (personal task management)", + "GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events, meetings, and schedules (personal calendar)", + "GOOGLE_GMAIL_CONNECTOR": "Google Gmail emails and conversations (personal emails)", + "GOOGLE_DRIVE_FILE": "Google Drive files and documents (personal cloud storage)", + "DISCORD_CONNECTOR": "Discord server conversations and shared content (personal community)", + "AIRTABLE_CONNECTOR": "Airtable records, tables, and database content (personal data)", + "TAVILY_API": "Tavily web search API results (real-time web search)", + "SEARXNG_API": "SearxNG search API results (privacy-focused web search)", + "LINKUP_API": "Linkup search API results (web search)", + "BAIDU_SEARCH_API": "Baidu search API results (Chinese web search)", + "LUMA_CONNECTOR": "Luma events and meetings", + "WEBCRAWLER_CONNECTOR": "Webpages indexed by SurfSense (personally selected websites)", + "CRAWLED_URL": "Webpages indexed by 
SurfSense (personally selected websites)", + "BOOKSTACK_CONNECTOR": "BookStack pages (personal documentation)", + "CIRCLEBACK": "Circleback meeting notes, transcripts, and action items", + "OBSIDIAN_CONNECTOR": "Obsidian vault notes and markdown files (personal notes)", + # Composio connectors + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "Google Drive files via Composio (personal cloud storage)", + "COMPOSIO_GMAIL_CONNECTOR": "Gmail emails via Composio (personal emails)", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events via Composio (personal calendar)", +} -def _normalize_connectors(connectors_to_search: list[str] | None) -> list[str]: + +def _normalize_connectors( + connectors_to_search: list[str] | None, + available_connectors: list[str] | None = None, +) -> list[str]: """ Normalize connectors provided by the model. - Accepts user-facing enums like WEBCRAWLER_CONNECTOR and maps them to canonical ConnectorService types. - Drops unknown values. - - If None/empty, defaults to searching across all known connectors. + - If available_connectors is provided, only includes connectors from that list. + - If connectors_to_search is None/empty, defaults to available_connectors or all. + + Args: + connectors_to_search: List of connectors requested by the model + available_connectors: List of connectors actually available in the search space + + Returns: + List of normalized connector strings to search """ + # Determine the set of valid connectors to consider + valid_set = ( + set(available_connectors) if available_connectors else set(_ALL_CONNECTORS) + ) + if not connectors_to_search: - return list(_ALL_CONNECTORS) + # Search all available connectors if none specified + return ( + list(available_connectors) + if available_connectors + else list(_ALL_CONNECTORS) + ) normalized: list[str] = [] for raw in connectors_to_search: c = (raw or "").strip().upper() if not c: continue + # Map user-facing aliases to canonical names if c == "WEBCRAWLER_CONNECTOR": c = "CRAWLED_URL" normalized.append(c) - # de-dupe while preserving order + filter unknown + # de-dupe while preserving order + filter to valid connectors seen: set[str] = set() out: list[str] = [] for c in normalized: if c in seen: continue + # Only include if it's a known connector AND available if c not in _ALL_CONNECTORS: continue + if c not in valid_set: + continue seen.add(c) out.append(c) - return out if out else list(_ALL_CONNECTORS) + + # Fallback to all available if nothing matched + return ( + out + if out + else ( + list(available_connectors) + if available_connectors + else list(_ALL_CONNECTORS) + ) + ) # ============================================================================= @@ -233,6 +311,7 @@ async def search_knowledge_base_async( top_k: int = 10, start_date: datetime | None = None, end_date: datetime | None = None, + available_connectors: list[str] | None = None, ) -> str: """ Search the user's knowledge base for relevant documents. @@ -248,6 +327,8 @@ async def search_knowledge_base_async( top_k: Number of results per connector start_date: Optional start datetime (UTC) for filtering documents end_date: Optional end datetime (UTC) for filtering documents + available_connectors: Optional list of connectors actually available in the search space. + If provided, only these connectors will be searched. 
Returns: Formatted string with search results @@ -262,7 +343,7 @@ async def search_knowledge_base_async( end_date=end_date, ) - connectors = _normalize_connectors(connectors_to_search) + connectors = _normalize_connectors(connectors_to_search, available_connectors) for connector in connectors: try: @@ -316,6 +397,16 @@ async def search_knowledge_base_async( ) all_documents.extend(chunks) + elif connector == "TEAMS_CONNECTOR": + _, chunks = await connector_service.search_teams( + user_query=query, + search_space_id=search_space_id, + top_k=top_k, + start_date=resolved_start_date, + end_date=resolved_end_date, + ) + all_documents.extend(chunks) + elif connector == "NOTION_CONNECTOR": _, chunks = await connector_service.search_notion( user_query=query, @@ -519,6 +610,39 @@ async def search_knowledge_base_async( ) all_documents.extend(chunks) + # ========================================================= + # Composio Connectors + # ========================================================= + elif connector == "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": + _, chunks = await connector_service.search_composio_google_drive( + user_query=query, + search_space_id=search_space_id, + top_k=top_k, + start_date=resolved_start_date, + end_date=resolved_end_date, + ) + all_documents.extend(chunks) + + elif connector == "COMPOSIO_GMAIL_CONNECTOR": + _, chunks = await connector_service.search_composio_gmail( + user_query=query, + search_space_id=search_space_id, + top_k=top_k, + start_date=resolved_start_date, + end_date=resolved_end_date, + ) + all_documents.extend(chunks) + + elif connector == "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": + _, chunks = await connector_service.search_composio_google_calendar( + user_query=query, + search_space_id=search_space_id, + top_k=top_k, + start_date=resolved_start_date, + end_date=resolved_end_date, + ) + all_documents.extend(chunks) + except Exception as e: print(f"Error searching connector {connector}: {e}") continue @@ -543,11 +667,68 @@ async def search_knowledge_base_async( return format_documents_for_context(deduplicated) +def _build_connector_docstring(available_connectors: list[str] | None) -> str: + """ + Build the connector documentation section for the tool docstring. + + Args: + available_connectors: List of available connector types, or None for all + + Returns: + Formatted docstring section listing available connectors + """ + connectors = available_connectors if available_connectors else list(_ALL_CONNECTORS) + + lines = [] + for connector in connectors: + # Skip internal names, prefer user-facing aliases + if connector == "CRAWLED_URL": + # Show as WEBCRAWLER_CONNECTOR for user-facing docs + description = CONNECTOR_DESCRIPTIONS.get(connector, connector) + lines.append(f"- WEBCRAWLER_CONNECTOR: {description}") + else: + description = CONNECTOR_DESCRIPTIONS.get(connector, connector) + lines.append(f"- {connector}: {description}") + + return "\n".join(lines) + + +# ============================================================================= +# Tool Input Schema +# ============================================================================= + + +class SearchKnowledgeBaseInput(BaseModel): + """Input schema for the search_knowledge_base tool.""" + + query: str = Field( + description="The search query - be specific and include key terms" + ) + top_k: int = Field( + default=10, + description="Number of results to retrieve (default: 10)", + ) + start_date: str | None = Field( + default=None, + description="Optional ISO date/datetime (e.g. 
'2025-12-12' or '2025-12-12T00:00:00+00:00')", + ) + end_date: str | None = Field( + default=None, + description="Optional ISO date/datetime (e.g. '2025-12-19' or '2025-12-19T23:59:59+00:00')", + ) + connectors_to_search: list[str] | None = Field( + default=None, + description="Optional list of connector enums to search. If omitted, searches all available.", + ) + + def create_search_knowledge_base_tool( search_space_id: int, db_session: AsyncSession, connector_service: ConnectorService, -): + available_connectors: list[str] | None = None, + available_document_types: list[str] | None = None, +) -> StructuredTool: """ Factory function to create the search_knowledge_base tool with injected dependencies. @@ -555,72 +736,57 @@ def create_search_knowledge_base_tool( search_space_id: The user's search space ID db_session: Database session connector_service: Initialized connector service + available_connectors: Optional list of connector types available in the search space. + Used to dynamically generate the tool docstring. + available_document_types: Optional list of document types that have data in the search space. + Used to inform the LLM about what data exists. Returns: - A configured tool function + A configured StructuredTool instance """ + # Build connector documentation dynamically + connector_docs = _build_connector_docstring(available_connectors) - @tool - async def search_knowledge_base( + # Build context about available document types + doc_types_info = "" + if available_document_types: + doc_types_info = f""" + +## Document types with indexed content in this search space + +The following document types have content available for search: +{", ".join(available_document_types)} + +Focus searches on these types for best results.""" + + # Build the dynamic description for the tool + # This is what the LLM sees when deciding whether/how to use the tool + dynamic_description = f"""Search the user's personal knowledge base for relevant information. + +Use this tool to find documents, notes, files, web pages, and other content that may help answer the user's question. + +IMPORTANT: +- If the user requests a specific source type (e.g. "my notes", "Slack messages"), pass `connectors_to_search=[...]` using the enums below. +- If `connectors_to_search` is omitted/empty, the system will search broadly. +- Only connectors that are enabled/configured for this search space are available.{doc_types_info} + +## Available connector enums for `connectors_to_search` + +{connector_docs} + +NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`.""" + + # Capture for closure + _available_connectors = available_connectors + + async def _search_knowledge_base_impl( query: str, top_k: int = 10, start_date: str | None = None, end_date: str | None = None, connectors_to_search: list[str] | None = None, ) -> str: - """ - Search the user's personal knowledge base for relevant information. - - Use this tool to find documents, notes, files, web pages, and other content - that may help answer the user's question. - - IMPORTANT: - - If the user requests a specific source type (e.g. "my notes", "Slack messages"), - pass `connectors_to_search=[...]` using the enums below. - - If `connectors_to_search` is omitted/empty, the system will search broadly. 
- - ## Available connector enums for `connectors_to_search` - - - EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history) - - FILE: "User-uploaded documents (PDFs, Word, etc.)" (personal files) - - NOTE: "SurfSense Notes" (notes created inside SurfSense) - - SLACK_CONNECTOR: "Slack conversations and shared content" (personal workspace communications) - - TEAMS_CONNECTOR: "Microsoft Teams messages and conversations" (personal Teams communications) - - NOTION_CONNECTOR: "Notion workspace pages and databases" (personal knowledge management) - - YOUTUBE_VIDEO: "YouTube video transcripts and metadata" (personally saved videos) - - GITHUB_CONNECTOR: "GitHub repository content and issues" (personal repositories and interactions) - - ELASTICSEARCH_CONNECTOR: "Elasticsearch indexed documents and data" (personal Elasticsearch instances and custom data sources) - - LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project management) - - JIRA_CONNECTOR: "Jira project issues, tickets, and comments" (personal project tracking) - - CONFLUENCE_CONNECTOR: "Confluence pages and comments" (personal project documentation) - - CLICKUP_CONNECTOR: "ClickUp tasks and project data" (personal task management) - - GOOGLE_CALENDAR_CONNECTOR: "Google Calendar events, meetings, and schedules" (personal calendar and time management) - - GOOGLE_GMAIL_CONNECTOR: "Google Gmail emails and conversations" (personal emails and communications) - - GOOGLE_DRIVE_FILE: "Google Drive files and documents" (personal cloud storage and file management) - - DISCORD_CONNECTOR: "Discord server conversations and shared content" (personal community communications) - - AIRTABLE_CONNECTOR: "Airtable records, tables, and database content" (personal data management and organization) - - TAVILY_API: "Tavily search API results" (personalized search results) - - SEARXNG_API: "SearxNG search API results" (personalized search results) - - LINKUP_API: "Linkup search API results" (personalized search results) - - BAIDU_SEARCH_API: "Baidu search API results" (personalized search results) - - LUMA_CONNECTOR: "Luma events" - - WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense" (personally selected websites) - - BOOKSTACK_CONNECTOR: "BookStack pages" (personal documentation) - - CIRCLEBACK: "Circleback meeting notes, transcripts, and action items" (personal meeting records) - - OBSIDIAN_CONNECTOR: "Obsidian vault notes and markdown files" (personal notes and knowledge management) - - NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`. - - Args: - query: The search query - be specific and include key terms - top_k: Number of results to retrieve (default: 10) - start_date: Optional ISO date/datetime (e.g. "2025-12-12" or "2025-12-12T00:00:00+00:00") - end_date: Optional ISO date/datetime (e.g. "2025-12-19" or "2025-12-19T23:59:59+00:00") - connectors_to_search: Optional list of connector enums to search. If omitted, searches all. 
- - Returns: - Formatted string with relevant documents and their content - """ + """Implementation function for knowledge base search.""" from app.agents.new_chat.utils import parse_date_or_datetime parsed_start: datetime | None = None @@ -640,6 +806,16 @@ def create_search_knowledge_base_tool( top_k=top_k, start_date=parsed_start, end_date=parsed_end, + available_connectors=_available_connectors, ) - return search_knowledge_base + # Create StructuredTool with dynamic description + # This properly sets the description that the LLM sees + tool = StructuredTool( + name="search_knowledge_base", + description=dynamic_description, + coroutine=_search_knowledge_base_impl, + args_schema=SearchKnowledgeBaseInput, + ) + + return tool diff --git a/surfsense_backend/app/agents/new_chat/tools/registry.py b/surfsense_backend/app/agents/new_chat/tools/registry.py index e4ce7a6b7..968e51445 100644 --- a/surfsense_backend/app/agents/new_chat/tools/registry.py +++ b/surfsense_backend/app/agents/new_chat/tools/registry.py @@ -85,6 +85,7 @@ class ToolDefinition: # Contributors: Add your new tools here! BUILTIN_TOOLS: list[ToolDefinition] = [ # Core tool - searches the user's knowledge base + # Now supports dynamic connector/document type discovery ToolDefinition( name="search_knowledge_base", description="Search the user's personal knowledge base for relevant information", @@ -92,8 +93,12 @@ BUILTIN_TOOLS: list[ToolDefinition] = [ search_space_id=deps["search_space_id"], db_session=deps["db_session"], connector_service=deps["connector_service"], + # Optional: dynamically discovered connectors/document types + available_connectors=deps.get("available_connectors"), + available_document_types=deps.get("available_document_types"), ), requires=["search_space_id", "db_session", "connector_service"], + # Note: available_connectors and available_document_types are optional ), # Podcast generation tool ToolDefinition( diff --git a/surfsense_backend/app/connectors/composio_connector.py b/surfsense_backend/app/connectors/composio_connector.py index fdf57d8ea..301296378 100644 --- a/surfsense_backend/app/connectors/composio_connector.py +++ b/surfsense_backend/app/connectors/composio_connector.py @@ -1,7 +1,7 @@ """ -Composio Connector Module. +Composio Connector Base Module. -Provides a unified interface for interacting with various services via Composio, +Provides a base class for interacting with various services via Composio, primarily used during indexing operations. """ @@ -19,10 +19,10 @@ logger = logging.getLogger(__name__) class ComposioConnector: """ - Generic Composio connector for data retrieval. + Base Composio connector for data retrieval. Wraps the ComposioService to provide toolkit-specific data access - for indexing operations. + for indexing operations. Subclasses implement toolkit-specific methods. """ def __init__( @@ -89,302 +89,12 @@ class ComposioConnector: toolkit_id = await self.get_toolkit_id() return toolkit_id in INDEXABLE_TOOLKITS - # ===== Google Drive Methods ===== + @property + def session(self) -> AsyncSession: + """Get the database session.""" + return self._session - async def list_drive_files( - self, - folder_id: str | None = None, - page_token: str | None = None, - page_size: int = 100, - ) -> tuple[list[dict[str, Any]], str | None, str | None]: - """ - List files from Google Drive via Composio. - - Args: - folder_id: Optional folder ID to list contents of. - page_token: Pagination token. - page_size: Number of files per page. 
- - Returns: - Tuple of (files list, next_page_token, error message). - """ - connected_account_id = await self.get_connected_account_id() - if not connected_account_id: - return [], None, "No connected account ID found" - - entity_id = await self.get_entity_id() - service = await self._get_service() - return await service.get_drive_files( - connected_account_id=connected_account_id, - entity_id=entity_id, - folder_id=folder_id, - page_token=page_token, - page_size=page_size, - ) - - async def get_drive_file_content( - self, file_id: str - ) -> tuple[bytes | None, str | None]: - """ - Download file content from Google Drive via Composio. - - Args: - file_id: Google Drive file ID. - - Returns: - Tuple of (file content bytes, error message). - """ - connected_account_id = await self.get_connected_account_id() - if not connected_account_id: - return None, "No connected account ID found" - - entity_id = await self.get_entity_id() - service = await self._get_service() - return await service.get_drive_file_content( - connected_account_id=connected_account_id, - entity_id=entity_id, - file_id=file_id, - ) - - # ===== Gmail Methods ===== - - async def list_gmail_messages( - self, - query: str = "", - max_results: int = 100, - ) -> tuple[list[dict[str, Any]], str | None]: - """ - List Gmail messages via Composio. - - Args: - query: Gmail search query. - max_results: Maximum number of messages. - - Returns: - Tuple of (messages list, error message). - """ - connected_account_id = await self.get_connected_account_id() - if not connected_account_id: - return [], "No connected account ID found" - - entity_id = await self.get_entity_id() - service = await self._get_service() - return await service.get_gmail_messages( - connected_account_id=connected_account_id, - entity_id=entity_id, - query=query, - max_results=max_results, - ) - - async def get_gmail_message_detail( - self, message_id: str - ) -> tuple[dict[str, Any] | None, str | None]: - """ - Get full details of a Gmail message via Composio. - - Args: - message_id: Gmail message ID. - - Returns: - Tuple of (message details, error message). - """ - connected_account_id = await self.get_connected_account_id() - if not connected_account_id: - return None, "No connected account ID found" - - entity_id = await self.get_entity_id() - service = await self._get_service() - return await service.get_gmail_message_detail( - connected_account_id=connected_account_id, - entity_id=entity_id, - message_id=message_id, - ) - - # ===== Google Calendar Methods ===== - - async def list_calendar_events( - self, - time_min: str | None = None, - time_max: str | None = None, - max_results: int = 250, - ) -> tuple[list[dict[str, Any]], str | None]: - """ - List Google Calendar events via Composio. - - Args: - time_min: Start time (RFC3339 format). - time_max: End time (RFC3339 format). - max_results: Maximum number of events. - - Returns: - Tuple of (events list, error message). - """ - connected_account_id = await self.get_connected_account_id() - if not connected_account_id: - return [], "No connected account ID found" - - entity_id = await self.get_entity_id() - service = await self._get_service() - return await service.get_calendar_events( - connected_account_id=connected_account_id, - entity_id=entity_id, - time_min=time_min, - time_max=time_max, - max_results=max_results, - ) - - # ===== Utility Methods ===== - - def format_gmail_message_to_markdown(self, message: dict[str, Any]) -> str: - """ - Format a Gmail message to markdown. 
- - Args: - message: Message object from Composio's GMAIL_FETCH_EMAILS response. - Composio structure: messageId, messageText, messageTimestamp, - payload.headers, labelIds, attachmentList - - Returns: - Formatted markdown string. - """ - try: - # Composio uses 'messageId' (camelCase) - message_id = message.get("messageId", "") or message.get("id", "") - label_ids = message.get("labelIds", []) - - # Extract headers from payload - payload = message.get("payload", {}) - headers = payload.get("headers", []) - - # Parse headers into a dict - header_dict = {} - for header in headers: - name = header.get("name", "").lower() - value = header.get("value", "") - header_dict[name] = value - - # Extract key information - subject = header_dict.get("subject", "No Subject") - from_email = header_dict.get("from", "Unknown Sender") - to_email = header_dict.get("to", "Unknown Recipient") - # Composio provides messageTimestamp directly - date_str = message.get("messageTimestamp", "") or header_dict.get( - "date", "Unknown Date" - ) - - # Build markdown content - markdown_content = f"# {subject}\n\n" - markdown_content += f"**From:** {from_email}\n" - markdown_content += f"**To:** {to_email}\n" - markdown_content += f"**Date:** {date_str}\n" - - if label_ids: - markdown_content += f"**Labels:** {', '.join(label_ids)}\n" - - markdown_content += "\n---\n\n" - - # Composio provides full message text in 'messageText' - message_text = message.get("messageText", "") - if message_text: - markdown_content += f"## Content\n\n{message_text}\n\n" - else: - # Fallback to snippet if no messageText - snippet = message.get("snippet", "") - if snippet: - markdown_content += f"## Preview\n\n{snippet}\n\n" - - # Add attachment info if present - attachments = message.get("attachmentList", []) - if attachments: - markdown_content += "## Attachments\n\n" - for att in attachments: - att_name = att.get("filename", att.get("name", "Unknown")) - markdown_content += f"- {att_name}\n" - markdown_content += "\n" - - # Add message metadata - markdown_content += "## Message Details\n\n" - markdown_content += f"- **Message ID:** {message_id}\n" - - return markdown_content - - except Exception as e: - return f"Error formatting message to markdown: {e!s}" - - def format_calendar_event_to_markdown(self, event: dict[str, Any]) -> str: - """ - Format a Google Calendar event to markdown. - - Args: - event: Event object from Google Calendar API. - - Returns: - Formatted markdown string. 
- """ - from datetime import datetime - - try: - # Extract basic event information - summary = event.get("summary", "No Title") - description = event.get("description", "") - location = event.get("location", "") - - # Extract start and end times - start = event.get("start", {}) - end = event.get("end", {}) - - start_time = start.get("dateTime") or start.get("date", "") - end_time = end.get("dateTime") or end.get("date", "") - - # Format times for display - def format_time(time_str: str) -> str: - if not time_str: - return "Unknown" - try: - if "T" in time_str: - dt = datetime.fromisoformat(time_str.replace("Z", "+00:00")) - return dt.strftime("%Y-%m-%d %H:%M") - return time_str - except Exception: - return time_str - - start_formatted = format_time(start_time) - end_formatted = format_time(end_time) - - # Extract attendees - attendees = event.get("attendees", []) - attendee_list = [] - for attendee in attendees: - email = attendee.get("email", "") - display_name = attendee.get("displayName", email) - response_status = attendee.get("responseStatus", "") - attendee_list.append(f"- {display_name} ({response_status})") - - # Build markdown content - markdown_content = f"# {summary}\n\n" - markdown_content += f"**Start:** {start_formatted}\n" - markdown_content += f"**End:** {end_formatted}\n" - - if location: - markdown_content += f"**Location:** {location}\n" - - markdown_content += "\n" - - if description: - markdown_content += f"## Description\n\n{description}\n\n" - - if attendee_list: - markdown_content += "## Attendees\n\n" - markdown_content += "\n".join(attendee_list) - markdown_content += "\n\n" - - # Add event metadata - markdown_content += "## Event Details\n\n" - markdown_content += f"- **Event ID:** {event.get('id', 'Unknown')}\n" - markdown_content += f"- **Created:** {event.get('created', 'Unknown')}\n" - markdown_content += f"- **Updated:** {event.get('updated', 'Unknown')}\n" - - return markdown_content - - except Exception as e: - return f"Error formatting event to markdown: {e!s}" + @property + def connector_id(self) -> int: + """Get the connector ID.""" + return self._connector_id diff --git a/surfsense_backend/app/connectors/composio_gmail_connector.py b/surfsense_backend/app/connectors/composio_gmail_connector.py new file mode 100644 index 000000000..953e2e8fc --- /dev/null +++ b/surfsense_backend/app/connectors/composio_gmail_connector.py @@ -0,0 +1,613 @@ +""" +Composio Gmail Connector Module. + +Provides Gmail specific methods for data retrieval and indexing via Composio. 
+""" + +import logging +from datetime import UTC, datetime +from typing import Any + +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select +from sqlalchemy.orm import selectinload + +from app.config import config +from app.connectors.composio_connector import ComposioConnector +from app.db import Document, DocumentType +from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE +from app.services.llm_service import get_user_long_context_llm +from app.services.task_logging_service import TaskLoggingService +from app.tasks.connector_indexers.base import calculate_date_range +from app.utils.document_converters import ( + create_document_chunks, + generate_content_hash, + generate_document_summary, + generate_unique_identifier_hash, +) + +logger = logging.getLogger(__name__) + + +def get_current_timestamp() -> datetime: + """Get the current timestamp with timezone for updated_at field.""" + return datetime.now(UTC) + + +async def check_document_by_unique_identifier( + session: AsyncSession, unique_identifier_hash: str +) -> Document | None: + """Check if a document with the given unique identifier hash already exists.""" + existing_doc_result = await session.execute( + select(Document) + .options(selectinload(Document.chunks)) + .where(Document.unique_identifier_hash == unique_identifier_hash) + ) + return existing_doc_result.scalars().first() + + +async def update_connector_last_indexed( + session: AsyncSession, + connector, + update_last_indexed: bool = True, +) -> None: + """Update the last_indexed_at timestamp for a connector.""" + if update_last_indexed: + connector.last_indexed_at = datetime.now(UTC) + logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") + + +class ComposioGmailConnector(ComposioConnector): + """ + Gmail specific Composio connector. + + Provides methods for listing messages, getting message details, and formatting + Gmail messages from Gmail via Composio. + """ + + async def list_gmail_messages( + self, + query: str = "", + max_results: int = 50, + page_token: str | None = None, + ) -> tuple[list[dict[str, Any]], str | None, int | None, str | None]: + """ + List Gmail messages via Composio with pagination support. + + Args: + query: Gmail search query. + max_results: Maximum number of messages per page (default: 50). + page_token: Optional pagination token for next page. + + Returns: + Tuple of (messages list, next_page_token, result_size_estimate, error message). + """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return [], None, None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_gmail_messages( + connected_account_id=connected_account_id, + entity_id=entity_id, + query=query, + max_results=max_results, + page_token=page_token, + ) + + async def get_gmail_message_detail( + self, message_id: str + ) -> tuple[dict[str, Any] | None, str | None]: + """ + Get full details of a Gmail message via Composio. + + Args: + message_id: Gmail message ID. + + Returns: + Tuple of (message details, error message). 
+ """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_gmail_message_detail( + connected_account_id=connected_account_id, + entity_id=entity_id, + message_id=message_id, + ) + + def format_gmail_message_to_markdown(self, message: dict[str, Any]) -> str: + """ + Format a Gmail message to markdown. + + Args: + message: Message object from Composio's GMAIL_FETCH_EMAILS response. + Composio structure: messageId, messageText, messageTimestamp, + payload.headers, labelIds, attachmentList + + Returns: + Formatted markdown string. + """ + try: + # Composio uses 'messageId' (camelCase) + message_id = message.get("messageId", "") or message.get("id", "") + label_ids = message.get("labelIds", []) + + # Extract headers from payload + payload = message.get("payload", {}) + headers = payload.get("headers", []) + + # Parse headers into a dict + header_dict = {} + for header in headers: + name = header.get("name", "").lower() + value = header.get("value", "") + header_dict[name] = value + + # Extract key information + subject = header_dict.get("subject", "No Subject") + from_email = header_dict.get("from", "Unknown Sender") + to_email = header_dict.get("to", "Unknown Recipient") + # Composio provides messageTimestamp directly + date_str = message.get("messageTimestamp", "") or header_dict.get( + "date", "Unknown Date" + ) + + # Build markdown content + markdown_content = f"# {subject}\n\n" + markdown_content += f"**From:** {from_email}\n" + markdown_content += f"**To:** {to_email}\n" + markdown_content += f"**Date:** {date_str}\n" + + if label_ids: + markdown_content += f"**Labels:** {', '.join(label_ids)}\n" + + markdown_content += "\n---\n\n" + + # Composio provides full message text in 'messageText' + message_text = message.get("messageText", "") + if message_text: + markdown_content += f"## Content\n\n{message_text}\n\n" + else: + # Fallback to snippet if no messageText + snippet = message.get("snippet", "") + if snippet: + markdown_content += f"## Preview\n\n{snippet}\n\n" + + # Add attachment info if present + attachments = message.get("attachmentList", []) + if attachments: + markdown_content += "## Attachments\n\n" + for att in attachments: + att_name = att.get("filename", att.get("name", "Unknown")) + markdown_content += f"- {att_name}\n" + markdown_content += "\n" + + # Add message metadata + markdown_content += "## Message Details\n\n" + markdown_content += f"- **Message ID:** {message_id}\n" + + return markdown_content + + except Exception as e: + return f"Error formatting message to markdown: {e!s}" + + +# ============ Indexer Functions ============ + + +async def _process_gmail_message_batch( + session: AsyncSession, + messages: list[dict[str, Any]], + composio_connector: ComposioGmailConnector, + connector_id: int, + search_space_id: int, + user_id: str, + total_documents_indexed: int = 0, +) -> tuple[int, int]: + """ + Process a batch of Gmail messages and index them. + + Args: + total_documents_indexed: Running total of documents indexed so far (for batch commits). 
+ + Returns: + Tuple of (documents_indexed, documents_skipped) + """ + documents_indexed = 0 + documents_skipped = 0 + + for message in messages: + try: + # Composio uses 'messageId' (camelCase), not 'id' + message_id = message.get("messageId", "") or message.get("id", "") + if not message_id: + documents_skipped += 1 + continue + + # Composio's GMAIL_FETCH_EMAILS already returns full message content + # No need for a separate detail API call + + # Extract message info from Composio response + # Composio structure: messageId, messageText, messageTimestamp, payload.headers, labelIds + payload = message.get("payload", {}) + headers = payload.get("headers", []) + + subject = "No Subject" + sender = "Unknown Sender" + date_str = message.get("messageTimestamp", "Unknown Date") + + for header in headers: + name = header.get("name", "").lower() + value = header.get("value", "") + if name == "subject": + subject = value + elif name == "from": + sender = value + elif name == "date": + date_str = value + + # Format to markdown using the full message data + markdown_content = composio_connector.format_gmail_message_to_markdown( + message + ) + + # Check for empty content (defensive parsing per Composio best practices) + if not markdown_content.strip(): + logger.warning(f"Skipping Gmail message with no content: {subject}") + documents_skipped += 1 + continue + + # Generate unique identifier + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]) + unique_identifier_hash = generate_unique_identifier_hash( + document_type, f"gmail_{message_id}", search_space_id + ) + + content_hash = generate_content_hash(markdown_content, search_space_id) + + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + # Get label IDs from Composio response + label_ids = message.get("labelIds", []) + # Extract thread_id if available (for consistency with non-Composio implementation) + thread_id = message.get("threadId", "") or message.get("thread_id", "") + + if existing_document: + if existing_document.content_hash == content_hash: + documents_skipped += 1 + continue + + # Update existing + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata = { + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "document_type": "Gmail Message (Composio)", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = ( + f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" + ) + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(markdown_content) + + existing_document.title = f"Gmail: {subject}" + existing_document.content = summary_content + existing_document.content_hash = content_hash + existing_document.embedding = summary_embedding + existing_document.document_metadata = { + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "date": date_str, + "labels": label_ids, + "connector_id": connector_id, + "source": "composio", + } + existing_document.chunks = chunks + existing_document.updated_at = get_current_timestamp() + + documents_indexed += 1 + + # Batch commit every 10 documents + current_total = total_documents_indexed + documents_indexed + if current_total % 10 == 0: + logger.info( + f"Committing batch: {current_total} Gmail messages 
processed so far" + ) + await session.commit() + continue + + # Create new document + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata = { + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "document_type": "Gmail Message (Composio)", + } + summary_content, summary_embedding = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = ( + f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" + ) + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(markdown_content) + + document = Document( + search_space_id=search_space_id, + title=f"Gmail: {subject}", + document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]), + document_metadata={ + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "date": date_str, + "labels": label_ids, + "connector_id": connector_id, + "toolkit_id": "gmail", + "source": "composio", + }, + content=summary_content, + content_hash=content_hash, + unique_identifier_hash=unique_identifier_hash, + embedding=summary_embedding, + chunks=chunks, + updated_at=get_current_timestamp(), + ) + session.add(document) + documents_indexed += 1 + + # Batch commit every 10 documents + current_total = total_documents_indexed + documents_indexed + if current_total % 10 == 0: + logger.info( + f"Committing batch: {current_total} Gmail messages processed so far" + ) + await session.commit() + + except Exception as e: + logger.error(f"Error processing Gmail message: {e!s}", exc_info=True) + documents_skipped += 1 + # Rollback on error to avoid partial state (per Composio best practices) + try: + await session.rollback() + except Exception as rollback_error: + logger.error( + f"Error during rollback: {rollback_error!s}", exc_info=True + ) + continue + + return documents_indexed, documents_skipped + + +async def index_composio_gmail( + session: AsyncSession, + connector, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str | None, + end_date: str | None, + task_logger: TaskLoggingService, + log_entry, + update_last_indexed: bool = True, + max_items: int = 1000, +) -> tuple[int, str]: + """Index Gmail messages via Composio with pagination and incremental processing.""" + try: + composio_connector = ComposioGmailConnector(session, connector_id) + + # Normalize date values - handle "undefined" strings from frontend + if start_date == "undefined" or start_date == "": + start_date = None + if end_date == "undefined" or end_date == "": + end_date = None + + # Use provided dates directly if both are provided, otherwise calculate from last_indexed_at + # This ensures user-selected dates are respected (matching non-Composio Gmail connector behavior) + if start_date is not None and end_date is not None: + # User provided both dates - use them directly + start_date_str = start_date + end_date_str = end_date + else: + # Calculate date range with defaults (uses last_indexed_at or 365 days back) + # This ensures indexing works even when user doesn't specify dates + start_date_str, end_date_str = calculate_date_range( + connector, start_date, end_date, default_days_back=365 + ) + + # Build query with date range + query_parts = [] + if start_date_str: + query_parts.append(f"after:{start_date_str.replace('-', '/')}") + if end_date_str: + query_parts.append(f"before:{end_date_str.replace('-', 
'/')}") + query = " ".join(query_parts) if query_parts else "" + + logger.info( + f"Gmail query for connector {connector_id}: '{query}' " + f"(start_date={start_date_str}, end_date={end_date_str})" + ) + + # Use smaller batch size to avoid 413 payload too large errors + batch_size = 50 + page_token = None + total_documents_indexed = 0 + total_documents_skipped = 0 + total_messages_fetched = 0 + result_size_estimate = None # Will be set from first API response + + while total_messages_fetched < max_items: + # Calculate how many messages to fetch in this batch + remaining = max_items - total_messages_fetched + current_batch_size = min(batch_size, remaining) + + # Use result_size_estimate if available, otherwise fall back to max_items + estimated_total = ( + result_size_estimate if result_size_estimate is not None else max_items + ) + # Cap estimated_total at max_items to avoid showing misleading progress + estimated_total = min(estimated_total, max_items) + + await task_logger.log_task_progress( + log_entry, + f"Fetching Gmail messages batch via Composio for connector {connector_id} " + f"({total_messages_fetched}/{estimated_total} fetched, {total_documents_indexed} indexed)", + { + "stage": "fetching_messages", + "batch_size": current_batch_size, + "total_fetched": total_messages_fetched, + "total_indexed": total_documents_indexed, + "estimated_total": estimated_total, + }, + ) + + # Fetch batch of messages + ( + messages, + next_token, + result_size_estimate_batch, + error, + ) = await composio_connector.list_gmail_messages( + query=query, + max_results=current_batch_size, + page_token=page_token, + ) + + if error: + await task_logger.log_task_failure( + log_entry, f"Failed to fetch Gmail messages: {error}", {} + ) + return 0, f"Failed to fetch Gmail messages: {error}" + + if not messages: + # No more messages available + break + + # Update result_size_estimate from first response (Gmail provides this estimate) + if result_size_estimate is None and result_size_estimate_batch is not None: + result_size_estimate = result_size_estimate_batch + logger.info( + f"Gmail API estimated {result_size_estimate} total messages for query: '{query}'" + ) + + total_messages_fetched += len(messages) + # Recalculate estimated_total after potentially updating result_size_estimate + estimated_total = ( + result_size_estimate if result_size_estimate is not None else max_items + ) + estimated_total = min(estimated_total, max_items) + + logger.info( + f"Fetched batch of {len(messages)} Gmail messages " + f"(total: {total_messages_fetched}/{estimated_total})" + ) + + # Process batch incrementally + batch_indexed, batch_skipped = await _process_gmail_message_batch( + session=session, + messages=messages, + composio_connector=composio_connector, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + total_documents_indexed=total_documents_indexed, + ) + + total_documents_indexed += batch_indexed + total_documents_skipped += batch_skipped + + logger.info( + f"Processed batch: {batch_indexed} indexed, {batch_skipped} skipped " + f"(total: {total_documents_indexed} indexed, {total_documents_skipped} skipped)" + ) + + # Batch commits happen in _process_gmail_message_batch every 10 documents + # This ensures progress is saved incrementally, preventing data loss on crashes + + # Check if we should continue + if not next_token: + # No more pages available + break + + if len(messages) < current_batch_size: + # Last page had fewer items than requested, we're done + break + + # Continue with 
next page + page_token = next_token + + if total_messages_fetched == 0: + success_msg = "No Gmail messages found in the specified date range" + await task_logger.log_task_success( + log_entry, success_msg, {"messages_count": 0} + ) + # CRITICAL: Update timestamp even when no messages found so Electric SQL syncs and UI shows indexed status + await update_connector_last_indexed(session, connector, update_last_indexed) + await session.commit() + return 0, None # Return None (not error) when no items found + + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) + + # Final commit to ensure all documents are persisted (safety net) + # This matches the pattern used in non-Composio Gmail indexer + logger.info( + f"Final commit: Total {total_documents_indexed} Gmail messages processed" + ) + await session.commit() + logger.info( + "Successfully committed all Composio Gmail document changes to database" + ) + + await task_logger.log_task_success( + log_entry, + f"Successfully completed Gmail indexing via Composio for connector {connector_id}", + { + "documents_indexed": total_documents_indexed, + "documents_skipped": total_documents_skipped, + "messages_fetched": total_messages_fetched, + }, + ) + + return total_documents_indexed, None + + except Exception as e: + logger.error(f"Failed to index Gmail via Composio: {e!s}", exc_info=True) + return 0, f"Failed to index Gmail via Composio: {e!s}" diff --git a/surfsense_backend/app/connectors/composio_google_calendar_connector.py b/surfsense_backend/app/connectors/composio_google_calendar_connector.py new file mode 100644 index 000000000..ec5b22b7f --- /dev/null +++ b/surfsense_backend/app/connectors/composio_google_calendar_connector.py @@ -0,0 +1,502 @@ +""" +Composio Google Calendar Connector Module. + +Provides Google Calendar specific methods for data retrieval and indexing via Composio. 
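+Exposes the ComposioGoogleCalendarConnector class for listing and formatting calendar
+events, and the index_composio_google_calendar indexer that persists them as documents.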
+""" + +import logging +from datetime import UTC, datetime +from typing import Any + +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select +from sqlalchemy.orm import selectinload + +from app.config import config +from app.connectors.composio_connector import ComposioConnector +from app.db import Document, DocumentType +from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE +from app.services.llm_service import get_user_long_context_llm +from app.services.task_logging_service import TaskLoggingService +from app.tasks.connector_indexers.base import ( + calculate_date_range, + check_duplicate_document_by_hash, +) +from app.utils.document_converters import ( + create_document_chunks, + generate_content_hash, + generate_document_summary, + generate_unique_identifier_hash, +) + +logger = logging.getLogger(__name__) + + +def get_current_timestamp() -> datetime: + """Get the current timestamp with timezone for updated_at field.""" + return datetime.now(UTC) + + +async def check_document_by_unique_identifier( + session: AsyncSession, unique_identifier_hash: str +) -> Document | None: + """Check if a document with the given unique identifier hash already exists.""" + existing_doc_result = await session.execute( + select(Document) + .options(selectinload(Document.chunks)) + .where(Document.unique_identifier_hash == unique_identifier_hash) + ) + return existing_doc_result.scalars().first() + + +async def update_connector_last_indexed( + session: AsyncSession, + connector, + update_last_indexed: bool = True, +) -> None: + """Update the last_indexed_at timestamp for a connector.""" + if update_last_indexed: + connector.last_indexed_at = datetime.now(UTC) + logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") + + +class ComposioGoogleCalendarConnector(ComposioConnector): + """ + Google Calendar specific Composio connector. + + Provides methods for listing calendar events and formatting them from + Google Calendar via Composio. + """ + + async def list_calendar_events( + self, + time_min: str | None = None, + time_max: str | None = None, + max_results: int = 250, + ) -> tuple[list[dict[str, Any]], str | None]: + """ + List Google Calendar events via Composio. + + Args: + time_min: Start time (RFC3339 format). + time_max: End time (RFC3339 format). + max_results: Maximum number of events. + + Returns: + Tuple of (events list, error message). + """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return [], "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_calendar_events( + connected_account_id=connected_account_id, + entity_id=entity_id, + time_min=time_min, + time_max=time_max, + max_results=max_results, + ) + + def format_calendar_event_to_markdown(self, event: dict[str, Any]) -> str: + """ + Format a Google Calendar event to markdown. + + Args: + event: Event object from Google Calendar API. + + Returns: + Formatted markdown string. 
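+            The markdown includes the event title, start/end times plus, when present,
+            location, description and attendees, and basic event metadata
+            (event ID, created, updated).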
+ """ + try: + # Extract basic event information + summary = event.get("summary", "No Title") + description = event.get("description", "") + location = event.get("location", "") + + # Extract start and end times + start = event.get("start", {}) + end = event.get("end", {}) + + start_time = start.get("dateTime") or start.get("date", "") + end_time = end.get("dateTime") or end.get("date", "") + + # Format times for display + def format_time(time_str: str) -> str: + if not time_str: + return "Unknown" + try: + if "T" in time_str: + dt = datetime.fromisoformat(time_str.replace("Z", "+00:00")) + return dt.strftime("%Y-%m-%d %H:%M") + return time_str + except Exception: + return time_str + + start_formatted = format_time(start_time) + end_formatted = format_time(end_time) + + # Extract attendees + attendees = event.get("attendees", []) + attendee_list = [] + for attendee in attendees: + email = attendee.get("email", "") + display_name = attendee.get("displayName", email) + response_status = attendee.get("responseStatus", "") + attendee_list.append(f"- {display_name} ({response_status})") + + # Build markdown content + markdown_content = f"# {summary}\n\n" + markdown_content += f"**Start:** {start_formatted}\n" + markdown_content += f"**End:** {end_formatted}\n" + + if location: + markdown_content += f"**Location:** {location}\n" + + markdown_content += "\n" + + if description: + markdown_content += f"## Description\n\n{description}\n\n" + + if attendee_list: + markdown_content += "## Attendees\n\n" + markdown_content += "\n".join(attendee_list) + markdown_content += "\n\n" + + # Add event metadata + markdown_content += "## Event Details\n\n" + markdown_content += f"- **Event ID:** {event.get('id', 'Unknown')}\n" + markdown_content += f"- **Created:** {event.get('created', 'Unknown')}\n" + markdown_content += f"- **Updated:** {event.get('updated', 'Unknown')}\n" + + return markdown_content + + except Exception as e: + return f"Error formatting event to markdown: {e!s}" + + +# ============ Indexer Functions ============ + + +async def index_composio_google_calendar( + session: AsyncSession, + connector, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str | None, + end_date: str | None, + task_logger: TaskLoggingService, + log_entry, + update_last_indexed: bool = True, + max_items: int = 2500, +) -> tuple[int, str]: + """Index Google Calendar events via Composio.""" + try: + composio_connector = ComposioGoogleCalendarConnector(session, connector_id) + + await task_logger.log_task_progress( + log_entry, + f"Fetching Google Calendar events via Composio for connector {connector_id}", + {"stage": "fetching_events"}, + ) + + # Normalize date values - handle "undefined" strings from frontend + if start_date == "undefined" or start_date == "": + start_date = None + if end_date == "undefined" or end_date == "": + end_date = None + + # Use provided dates directly if both are provided, otherwise calculate from last_indexed_at + # This ensures user-selected dates are respected (matching non-Composio Calendar connector behavior) + if start_date is not None and end_date is not None: + # User provided both dates - use them directly + start_date_str = start_date + end_date_str = end_date + else: + # Calculate date range with defaults (uses last_indexed_at or 365 days back) + # This ensures indexing works even when user doesn't specify dates + start_date_str, end_date_str = calculate_date_range( + connector, start_date, end_date, default_days_back=365 + ) + + # Build time range for API 
call + time_min = f"{start_date_str}T00:00:00Z" + time_max = f"{end_date_str}T23:59:59Z" + + logger.info( + f"Google Calendar query for connector {connector_id}: " + f"(start_date={start_date_str}, end_date={end_date_str})" + ) + + events, error = await composio_connector.list_calendar_events( + time_min=time_min, + time_max=time_max, + max_results=max_items, + ) + + if error: + await task_logger.log_task_failure( + log_entry, f"Failed to fetch Calendar events: {error}", {} + ) + return 0, f"Failed to fetch Calendar events: {error}" + + if not events: + success_msg = "No Google Calendar events found in the specified date range" + await task_logger.log_task_success( + log_entry, success_msg, {"events_count": 0} + ) + # CRITICAL: Update timestamp even when no events found so Electric SQL syncs and UI shows indexed status + await update_connector_last_indexed(session, connector, update_last_indexed) + await session.commit() + return ( + 0, + None, + ) # Return None (not error) when no items found - this is success with 0 items + + logger.info(f"Found {len(events)} Google Calendar events to index via Composio") + + documents_indexed = 0 + documents_skipped = 0 + duplicate_content_count = ( + 0 # Track events skipped due to duplicate content_hash + ) + + for event in events: + try: + # Handle both standard Google API and potential Composio variations + event_id = event.get("id", "") or event.get("eventId", "") + summary = ( + event.get("summary", "") or event.get("title", "") or "No Title" + ) + + if not event_id: + documents_skipped += 1 + continue + + # Format to markdown + markdown_content = composio_connector.format_calendar_event_to_markdown( + event + ) + + # Generate unique identifier + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"]) + unique_identifier_hash = generate_unique_identifier_hash( + document_type, f"calendar_{event_id}", search_space_id + ) + + content_hash = generate_content_hash(markdown_content, search_space_id) + + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + # Extract event times + start = event.get("start", {}) + end = event.get("end", {}) + start_time = start.get("dateTime") or start.get("date", "") + end_time = end.get("dateTime") or end.get("date", "") + location = event.get("location", "") + + if existing_document: + if existing_document.content_hash == content_hash: + documents_skipped += 1 + continue + + # Update existing + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata = { + "event_id": event_id, + "summary": summary, + "start_time": start_time, + "document_type": "Google Calendar Event (Composio)", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}" + if location: + summary_content += f"\nLocation: {location}" + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(markdown_content) + + existing_document.title = f"Calendar: {summary}" + existing_document.content = summary_content + existing_document.content_hash = content_hash + existing_document.embedding = summary_embedding + existing_document.document_metadata = { + "event_id": event_id, + "summary": summary, + "start_time": start_time, + "end_time": end_time, + "location": location, + "connector_id": 
connector_id, + "source": "composio", + } + existing_document.chunks = chunks + existing_document.updated_at = get_current_timestamp() + + documents_indexed += 1 + + # Batch commit every 10 documents + if documents_indexed % 10 == 0: + logger.info( + f"Committing batch: {documents_indexed} Google Calendar events processed so far" + ) + await session.commit() + continue + + # Document doesn't exist by unique_identifier_hash + # Check if a document with the same content_hash exists (from standard connector) + with session.no_autoflush: + duplicate_by_content = await check_duplicate_document_by_hash( + session, content_hash + ) + + if duplicate_by_content: + # A document with the same content already exists (likely from standard connector) + logger.info( + f"Event {summary} already indexed by another connector " + f"(existing document ID: {duplicate_by_content.id}, " + f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content." + ) + duplicate_content_count += 1 + documents_skipped += 1 + continue + + # Create new document + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata = { + "event_id": event_id, + "summary": summary, + "start_time": start_time, + "document_type": "Google Calendar Event (Composio)", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = ( + f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}" + ) + if location: + summary_content += f"\nLocation: {location}" + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(markdown_content) + + document = Document( + search_space_id=search_space_id, + title=f"Calendar: {summary}", + document_type=DocumentType( + TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"] + ), + document_metadata={ + "event_id": event_id, + "summary": summary, + "start_time": start_time, + "end_time": end_time, + "location": location, + "connector_id": connector_id, + "toolkit_id": "googlecalendar", + "source": "composio", + }, + content=summary_content, + content_hash=content_hash, + unique_identifier_hash=unique_identifier_hash, + embedding=summary_embedding, + chunks=chunks, + updated_at=get_current_timestamp(), + ) + session.add(document) + documents_indexed += 1 + + # Batch commit every 10 documents + if documents_indexed % 10 == 0: + logger.info( + f"Committing batch: {documents_indexed} Google Calendar events processed so far" + ) + await session.commit() + + except Exception as e: + logger.error(f"Error processing Calendar event: {e!s}", exc_info=True) + documents_skipped += 1 + continue + + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) + + # Final commit to ensure all documents are persisted (safety net) + # This matches the pattern used in non-Composio Gmail indexer + logger.info( + f"Final commit: Total {documents_indexed} Google Calendar events processed" + ) + try: + await session.commit() + logger.info( + "Successfully committed all Composio Google Calendar document changes to database" + ) + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) 
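+            # The content_hash unique constraint can still be hit here if the same event
+            # was indexed by another connector between the duplicate check above and this
+            # commit, so roll back and continue instead of failing the whole task.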
+ if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same event was indexed by multiple connectors. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if duplicates were found + warning_message = None + if duplicate_content_count > 0: + warning_message = f"{duplicate_content_count} skipped (duplicate)" + + await task_logger.log_task_success( + log_entry, + f"Successfully completed Google Calendar indexing via Composio for connector {connector_id}", + { + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + "duplicate_content_count": duplicate_content_count, + }, + ) + + logger.info( + f"Composio Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped " + f"({duplicate_content_count} due to duplicate content from other connectors)" + ) + return documents_indexed, warning_message + + except Exception as e: + logger.error( + f"Failed to index Google Calendar via Composio: {e!s}", exc_info=True + ) + return 0, f"Failed to index Google Calendar via Composio: {e!s}" diff --git a/surfsense_backend/app/connectors/composio_google_drive_connector.py b/surfsense_backend/app/connectors/composio_google_drive_connector.py new file mode 100644 index 000000000..e3b988676 --- /dev/null +++ b/surfsense_backend/app/connectors/composio_google_drive_connector.py @@ -0,0 +1,1167 @@ +""" +Composio Google Drive Connector Module. + +Provides Google Drive specific methods for data retrieval and indexing via Composio. 
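+Exposes the ComposioGoogleDriveConnector class, the full-scan and delta-sync indexers,
+and the file-content processing helpers they rely on.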
+""" + +import logging +import os +import tempfile +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm.attributes import flag_modified + +from app.config import config +from app.connectors.composio_connector import ComposioConnector +from app.db import Document, DocumentType, Log +from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE +from app.services.llm_service import get_user_long_context_llm +from app.services.task_logging_service import TaskLoggingService +from app.utils.document_converters import ( + create_document_chunks, + generate_content_hash, + generate_document_summary, + generate_unique_identifier_hash, +) + +logger = logging.getLogger(__name__) + + +# Binary file extensions that need file processor +BINARY_FILE_EXTENSIONS = { + ".pdf", + ".doc", + ".docx", + ".xls", + ".xlsx", + ".ppt", + ".pptx", + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".tiff", + ".webp", + ".zip", + ".tar", + ".gz", + ".rar", + ".7z", + ".mp3", + ".mp4", + ".wav", + ".avi", + ".mov", + ".exe", + ".dll", + ".so", + ".bin", +} + +# Text file extensions that can be decoded as UTF-8 +TEXT_FILE_EXTENSIONS = { + ".txt", + ".md", + ".markdown", + ".json", + ".xml", + ".html", + ".htm", + ".css", + ".js", + ".ts", + ".py", + ".java", + ".c", + ".cpp", + ".h", + ".yaml", + ".yml", + ".toml", + ".ini", + ".cfg", + ".conf", + ".sh", + ".bash", + ".zsh", + ".fish", + ".sql", + ".csv", + ".tsv", + ".rst", + ".tex", + ".log", +} + + +def get_current_timestamp() -> datetime: + """Get the current timestamp with timezone for updated_at field.""" + return datetime.now(UTC) + + +def _is_binary_file(file_name: str, mime_type: str) -> bool: + """Check if a file is binary based on extension or mime type.""" + extension = Path(file_name).suffix.lower() + + # Check extension first + if extension in BINARY_FILE_EXTENSIONS: + return True + if extension in TEXT_FILE_EXTENSIONS: + return False + + # Check mime type + if mime_type: + if mime_type.startswith(("image/", "audio/", "video/", "application/pdf")): + return True + if mime_type.startswith(("text/", "application/json", "application/xml")): + return False + # Office documents + if ( + "spreadsheet" in mime_type + or "document" in mime_type + or "presentation" in mime_type + ): + return True + + # Default to text for unknown types + return False + + +class ComposioGoogleDriveConnector(ComposioConnector): + """ + Google Drive specific Composio connector. + + Provides methods for listing files, downloading content, and tracking changes + from Google Drive via Composio. + """ + + async def list_drive_files( + self, + folder_id: str | None = None, + page_token: str | None = None, + page_size: int = 100, + ) -> tuple[list[dict[str, Any]], str | None, str | None]: + """ + List files from Google Drive via Composio. + + Args: + folder_id: Optional folder ID to list contents of. + page_token: Pagination token. + page_size: Number of files per page. + + Returns: + Tuple of (files list, next_page_token, error message). 
+ """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return [], None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_drive_files( + connected_account_id=connected_account_id, + entity_id=entity_id, + folder_id=folder_id, + page_token=page_token, + page_size=page_size, + ) + + async def get_drive_file_content( + self, file_id: str + ) -> tuple[bytes | None, str | None]: + """ + Download file content from Google Drive via Composio. + + Args: + file_id: Google Drive file ID. + + Returns: + Tuple of (file content bytes, error message). + """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_drive_file_content( + connected_account_id=connected_account_id, + entity_id=entity_id, + file_id=file_id, + ) + + async def get_drive_start_page_token(self) -> tuple[str | None, str | None]: + """ + Get the starting page token for Google Drive change tracking. + + Returns: + Tuple of (start_page_token, error message). + """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_drive_start_page_token( + connected_account_id=connected_account_id, + entity_id=entity_id, + ) + + async def list_drive_changes( + self, + page_token: str | None = None, + page_size: int = 100, + include_removed: bool = True, + ) -> tuple[list[dict[str, Any]], str | None, str | None]: + """ + List changes in Google Drive since the given page token. + + Args: + page_token: Page token from previous sync (optional). + page_size: Number of changes per page. + include_removed: Whether to include removed items. + + Returns: + Tuple of (changes list, new_start_page_token, error message). + """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return [], None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.list_drive_changes( + connected_account_id=connected_account_id, + entity_id=entity_id, + page_token=page_token, + page_size=page_size, + include_removed=include_removed, + ) + + +# ============ File Processing Utilities ============ + + +async def _process_file_content( + content: bytes | str, + file_name: str, + file_id: str, + mime_type: str, + search_space_id: int, + user_id: str, + session: AsyncSession, + task_logger: TaskLoggingService, + log_entry: Log, + processing_errors: list[str], +) -> str: + """ + Process file content and return markdown text. + + For binary files (PDFs, images, etc.), uses Surfsense's ETL service. + For text files, decodes as UTF-8. 
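+    Falls back to latin-1, cp1252 and iso-8859-1 when UTF-8 decoding fails, and returns
+    a metadata-only placeholder when no encoding works.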
+ + Args: + content: File content as bytes or string + file_name: Name of the file + file_id: Google Drive file ID + mime_type: MIME type of the file + search_space_id: Search space ID + user_id: User ID + session: Database session + task_logger: Task logging service + log_entry: Log entry for tracking + processing_errors: List to append errors to + + Returns: + Markdown content string + """ + # Ensure content is bytes + if isinstance(content, str): + content = content.encode("utf-8") + + # Check if this is a binary file + if _is_binary_file(file_name, mime_type): + # Use ETL service for binary files (PDF, Office docs, etc.) + temp_file_path = None + try: + # Get file extension + extension = Path(file_name).suffix or ".bin" + + # Write to temp file + with tempfile.NamedTemporaryFile( + delete=False, suffix=extension + ) as tmp_file: + tmp_file.write(content) + temp_file_path = tmp_file.name + + # Use the configured ETL service to extract text + extracted_text = await _extract_text_with_etl( + temp_file_path, file_name, task_logger, log_entry + ) + + if extracted_text: + return extracted_text + else: + # Fallback if extraction fails + logger.warning(f"Could not extract text from binary file {file_name}") + return f"# {file_name}\n\n[Binary file - text extraction failed]\n\n**File ID:** {file_id}\n**Type:** {mime_type}\n" + + except Exception as e: + error_msg = f"Error processing binary file {file_name}: {e!s}" + logger.error(error_msg) + processing_errors.append(error_msg) + return f"# {file_name}\n\n[Binary file - processing error]\n\n**File ID:** {file_id}\n**Type:** {mime_type}\n" + finally: + # Cleanup temp file + if temp_file_path and os.path.exists(temp_file_path): + try: + os.unlink(temp_file_path) + except Exception as e: + logger.debug(f"Could not delete temp file {temp_file_path}: {e}") + else: + # Text file - try to decode as UTF-8 + try: + return content.decode("utf-8") + except UnicodeDecodeError: + # Try other encodings + for encoding in ["latin-1", "cp1252", "iso-8859-1"]: + try: + return content.decode(encoding) + except UnicodeDecodeError: + continue + + # If all encodings fail, treat as binary + error_msg = f"Could not decode text file {file_name} with any encoding" + logger.warning(error_msg) + processing_errors.append(error_msg) + return f"# {file_name}\n\n[File content could not be decoded]\n\n**File ID:** {file_id}\n**Type:** {mime_type}\n" + + +async def _extract_text_with_etl( + file_path: str, + file_name: str, + task_logger: TaskLoggingService, + log_entry: Log, +) -> str | None: + """ + Extract text from a file using the configured ETL service. 
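+    Dispatches on config.ETL_SERVICE: UNSTRUCTURED, LLAMACLOUD, or DOCLING.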
+ + Args: + file_path: Path to the file + file_name: Name of the file + task_logger: Task logging service + log_entry: Log entry for tracking + + Returns: + Extracted text as markdown, or None if extraction fails + """ + import warnings + from logging import ERROR, getLogger + + etl_service = config.ETL_SERVICE + + try: + if etl_service == "UNSTRUCTURED": + from langchain_unstructured import UnstructuredLoader + + from app.utils.document_converters import convert_document_to_markdown + + loader = UnstructuredLoader( + file_path, + mode="elements", + post_processors=[], + languages=["eng"], + include_orig_elements=False, + include_metadata=False, + strategy="auto", + ) + + docs = await loader.aload() + if docs: + return await convert_document_to_markdown(docs) + return None + + elif etl_service == "LLAMACLOUD": + from app.tasks.document_processors.file_processors import ( + parse_with_llamacloud_retry, + ) + + # Estimate pages (rough estimate based on file size) + file_size = os.path.getsize(file_path) + estimated_pages = max(1, file_size // (80 * 1024)) + + result = await parse_with_llamacloud_retry( + file_path=file_path, + estimated_pages=estimated_pages, + task_logger=task_logger, + log_entry=log_entry, + ) + + markdown_documents = await result.aget_markdown_documents( + split_by_page=False + ) + if markdown_documents: + return markdown_documents[0].text + return None + + elif etl_service == "DOCLING": + from app.services.docling_service import create_docling_service + + docling_service = create_docling_service() + + # Suppress pdfminer warnings + pdfminer_logger = getLogger("pdfminer") + original_level = pdfminer_logger.level + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", category=UserWarning, module="pdfminer" + ) + warnings.filterwarnings( + "ignore", message=".*Cannot set gray non-stroke color.*" + ) + warnings.filterwarnings("ignore", message=".*invalid float value.*") + + pdfminer_logger.setLevel(ERROR) + + try: + result = await docling_service.process_document( + file_path, file_name + ) + finally: + pdfminer_logger.setLevel(original_level) + + return result.get("content") + else: + logger.warning(f"Unknown ETL service: {etl_service}") + return None + + except Exception as e: + logger.error(f"ETL extraction failed for {file_name}: {e!s}") + return None + + +# ============ Indexer Functions ============ + + +async def check_document_by_unique_identifier( + session: AsyncSession, unique_identifier_hash: str +) -> Document | None: + """Check if a document with the given unique identifier hash already exists.""" + from sqlalchemy.future import select + from sqlalchemy.orm import selectinload + + existing_doc_result = await session.execute( + select(Document) + .options(selectinload(Document.chunks)) + .where(Document.unique_identifier_hash == unique_identifier_hash) + ) + return existing_doc_result.scalars().first() + + +async def update_connector_last_indexed( + session: AsyncSession, + connector, + update_last_indexed: bool = True, +) -> None: + """Update the last_indexed_at timestamp for a connector.""" + if update_last_indexed: + connector.last_indexed_at = datetime.now( + UTC + ) # Use UTC for timezone consistency + logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") + + +async def index_composio_google_drive( + session: AsyncSession, + connector, + connector_id: int, + search_space_id: int, + user_id: str, + task_logger: TaskLoggingService, + log_entry, + update_last_indexed: bool = True, + max_items: int = 1000, +) -> tuple[int, 
str]: + """Index Google Drive files via Composio with delta sync support. + + Delta Sync Flow: + 1. First sync: Full scan + get initial page token + 2. Subsequent syncs: Use LIST_CHANGES to process only changed files + + Supports folder/file selection via connector config: + - selected_folders: List of {id, name} for folders to index + - selected_files: List of {id, name} for individual files to index + - indexing_options: {max_files_per_folder, incremental_sync, include_subfolders} + """ + try: + composio_connector = ComposioGoogleDriveConnector(session, connector_id) + connector_config = await composio_connector.get_config() + + # Get folder/file selection configuration + selected_folders = connector_config.get("selected_folders", []) + selected_files = connector_config.get("selected_files", []) + indexing_options = connector_config.get("indexing_options", {}) + + # Check for stored page token for delta sync + stored_page_token = connector_config.get("drive_page_token") + use_delta_sync = stored_page_token and connector.last_indexed_at + + max_files_per_folder = indexing_options.get("max_files_per_folder", 100) + include_subfolders = indexing_options.get("include_subfolders", True) + + # Route to delta sync or full scan + if use_delta_sync: + logger.info( + f"Using delta sync for Composio Google Drive connector {connector_id}" + ) + await task_logger.log_task_progress( + log_entry, + f"Starting delta sync for Google Drive via Composio (connector {connector_id})", + {"stage": "delta_sync", "token": stored_page_token[:20] + "..."}, + ) + + ( + documents_indexed, + documents_skipped, + processing_errors, + ) = await _index_composio_drive_delta_sync( + session=session, + composio_connector=composio_connector, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + page_token=stored_page_token, + max_items=max_items, + task_logger=task_logger, + log_entry=log_entry, + ) + else: + logger.info( + f"Using full scan for Composio Google Drive connector {connector_id} (first sync or no token)" + ) + await task_logger.log_task_progress( + log_entry, + f"Fetching Google Drive files via Composio for connector {connector_id}", + { + "stage": "full_scan", + "selected_folders": len(selected_folders), + "selected_files": len(selected_files), + }, + ) + + ( + documents_indexed, + documents_skipped, + processing_errors, + ) = await _index_composio_drive_full_scan( + session=session, + composio_connector=composio_connector, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + selected_folders=selected_folders, + selected_files=selected_files, + max_files_per_folder=max_files_per_folder, + include_subfolders=include_subfolders, + max_items=max_items, + task_logger=task_logger, + log_entry=log_entry, + ) + + # Get new page token for next sync (always update after successful sync) + new_token, token_error = await composio_connector.get_drive_start_page_token() + if new_token and not token_error: + # Refresh connector to avoid stale state + await session.refresh(connector) + + if not connector.config: + connector.config = {} + connector.config["drive_page_token"] = new_token + flag_modified(connector, "config") + logger.info(f"Updated drive_page_token for connector {connector_id}") + elif token_error: + logger.warning(f"Failed to get new page token: {token_error}") + + # CRITICAL: Always update timestamp so Electric SQL syncs and UI shows indexed status + await update_connector_last_indexed(session, connector, update_last_indexed) + + # Final commit + 
logger.info( + f"Final commit: Total {documents_indexed} Google Drive files processed" + ) + await session.commit() + logger.info( + "Successfully committed all Composio Google Drive document changes to database" + ) + + # Handle processing errors + error_message = None + if processing_errors: + if len(processing_errors) == 1: + error_message = processing_errors[0] + else: + error_message = f"Failed to process {len(processing_errors)} file(s). First error: {processing_errors[0]}" + await task_logger.log_task_failure( + log_entry, + f"Completed Google Drive indexing with {len(processing_errors)} error(s) for connector {connector_id}", + { + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + "sync_type": "delta" if use_delta_sync else "full", + "errors": processing_errors, + }, + ) + else: + await task_logger.log_task_success( + log_entry, + f"Successfully completed Google Drive indexing via Composio for connector {connector_id}", + { + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + "sync_type": "delta" if use_delta_sync else "full", + }, + ) + + return documents_indexed, error_message + + except Exception as e: + logger.error(f"Failed to index Google Drive via Composio: {e!s}", exc_info=True) + return 0, f"Failed to index Google Drive via Composio: {e!s}" + + +async def _index_composio_drive_delta_sync( + session: AsyncSession, + composio_connector: ComposioGoogleDriveConnector, + connector_id: int, + search_space_id: int, + user_id: str, + page_token: str, + max_items: int, + task_logger: TaskLoggingService, + log_entry, +) -> tuple[int, int, list[str]]: + """Index Google Drive files using delta sync (only changed files). + + Uses GOOGLEDRIVE_LIST_CHANGES to fetch only files that changed since last sync. + Handles: new files, modified files, and deleted files. 
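+
+    Returns:
+        Tuple of (documents_indexed, documents_skipped, processing_errors).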
+ """ + documents_indexed = 0 + documents_skipped = 0 + processing_errors = [] + + # Fetch all changes with pagination + all_changes = [] + current_token = page_token + + while len(all_changes) < max_items: + changes, next_token, error = await composio_connector.list_drive_changes( + page_token=current_token, + page_size=100, + include_removed=True, + ) + + if error: + logger.error(f"Error fetching Drive changes: {error}") + processing_errors.append(f"Failed to fetch changes: {error}") + break + + all_changes.extend(changes) + + if not next_token or next_token == current_token: + break + current_token = next_token + + if not all_changes: + logger.info("No changes detected since last sync") + return 0, 0, [] + + logger.info(f"Processing {len(all_changes)} changes from delta sync") + + for change in all_changes[:max_items]: + try: + # Handle removed files + is_removed = change.get("removed", False) + file_info = change.get("file", {}) + file_id = change.get("fileId") or file_info.get("id", "") + + if not file_id: + documents_skipped += 1 + continue + + # Check if file was trashed or removed + if is_removed or file_info.get("trashed", False): + # Remove document from database + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]) + unique_identifier_hash = generate_unique_identifier_hash( + document_type, f"drive_{file_id}", search_space_id + ) + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + if existing_document: + await session.delete(existing_document) + documents_indexed += 1 + logger.info(f"Deleted document for removed/trashed file: {file_id}") + continue + + # Process changed file + file_name = file_info.get("name", "") or "Untitled" + mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "") + + # Skip folders + if mime_type == "application/vnd.google-apps.folder": + continue + + # Process the file + indexed, skipped, errors = await _process_single_drive_file( + session=session, + composio_connector=composio_connector, + file_id=file_id, + file_name=file_name, + mime_type=mime_type, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + task_logger=task_logger, + log_entry=log_entry, + ) + + documents_indexed += indexed + documents_skipped += skipped + processing_errors.extend(errors) + + # Batch commit every 10 documents + if documents_indexed > 0 and documents_indexed % 10 == 0: + await session.commit() + logger.info(f"Committed batch: {documents_indexed} changes processed") + + except Exception as e: + error_msg = f"Error processing change for file {file_id}: {e!s}" + logger.error(error_msg, exc_info=True) + processing_errors.append(error_msg) + documents_skipped += 1 + + logger.info( + f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped" + ) + return documents_indexed, documents_skipped, processing_errors + + +async def _index_composio_drive_full_scan( + session: AsyncSession, + composio_connector: ComposioGoogleDriveConnector, + connector_id: int, + search_space_id: int, + user_id: str, + selected_folders: list[dict], + selected_files: list[dict], + max_files_per_folder: int, + include_subfolders: bool, + max_items: int, + task_logger: TaskLoggingService, + log_entry, +) -> tuple[int, int, list[str]]: + """Index Google Drive files using full scan (first sync or when no delta token).""" + documents_indexed = 0 + documents_skipped = 0 + processing_errors = [] + + all_files = [] + + # If specific folders/files are selected, fetch from 
those + if selected_folders or selected_files: + # Fetch files from selected folders + for folder in selected_folders: + folder_id = folder.get("id") + folder_name = folder.get("name", "Unknown") + + if not folder_id: + continue + + # Handle special case for "root" folder + actual_folder_id = None if folder_id == "root" else folder_id + + logger.info(f"Fetching files from folder: {folder_name} ({folder_id})") + + # Fetch files from this folder + folder_files = [] + page_token = None + + while len(folder_files) < max_files_per_folder: + ( + files, + next_token, + error, + ) = await composio_connector.list_drive_files( + folder_id=actual_folder_id, + page_token=page_token, + page_size=min(100, max_files_per_folder - len(folder_files)), + ) + + if error: + logger.warning( + f"Failed to fetch files from folder {folder_name}: {error}" + ) + break + + # Process files + for file_info in files: + mime_type = file_info.get("mimeType", "") or file_info.get( + "mime_type", "" + ) + + # If it's a folder and include_subfolders is enabled, recursively fetch + if mime_type == "application/vnd.google-apps.folder": + if include_subfolders: + # Add subfolder files recursively + subfolder_files = await _fetch_folder_files_recursively( + composio_connector, + file_info.get("id"), + max_files=max_files_per_folder, + current_count=len(folder_files), + ) + folder_files.extend(subfolder_files) + else: + folder_files.append(file_info) + + if not next_token: + break + page_token = next_token + + all_files.extend(folder_files[:max_files_per_folder]) + logger.info(f"Found {len(folder_files)} files in folder {folder_name}") + + # Add specifically selected files + for selected_file in selected_files: + file_id = selected_file.get("id") + file_name = selected_file.get("name", "Unknown") + + if not file_id: + continue + + # Add file info (we'll fetch content later during indexing) + all_files.append( + { + "id": file_id, + "name": file_name, + "mimeType": "", # Will be determined later + } + ) + else: + # No selection specified - fetch all files (original behavior) + page_token = None + + while len(all_files) < max_items: + files, next_token, error = await composio_connector.list_drive_files( + page_token=page_token, + page_size=min(100, max_items - len(all_files)), + ) + + if error: + return 0, 0, [f"Failed to fetch Drive files: {error}"] + + all_files.extend(files) + + if not next_token: + break + page_token = next_token + + if not all_files: + logger.info("No Google Drive files found") + return 0, 0, [] + + logger.info( + f"Found {len(all_files)} Google Drive files to index via Composio (full scan)" + ) + + for file_info in all_files: + try: + # Handle both standard Google API and potential Composio variations + file_id = file_info.get("id", "") or file_info.get("fileId", "") + file_name = ( + file_info.get("name", "") or file_info.get("fileName", "") or "Untitled" + ) + mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "") + + if not file_id: + documents_skipped += 1 + continue + + # Skip folders + if mime_type == "application/vnd.google-apps.folder": + continue + + # Process the file + indexed, skipped, errors = await _process_single_drive_file( + session=session, + composio_connector=composio_connector, + file_id=file_id, + file_name=file_name, + mime_type=mime_type, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + task_logger=task_logger, + log_entry=log_entry, + ) + + documents_indexed += indexed + documents_skipped += skipped + 
processing_errors.extend(errors) + + # Batch commit every 10 documents + if documents_indexed > 0 and documents_indexed % 10 == 0: + logger.info( + f"Committing batch: {documents_indexed} Google Drive files processed so far" + ) + await session.commit() + + except Exception as e: + error_msg = f"Error processing Drive file {file_name or 'unknown'}: {e!s}" + logger.error(error_msg, exc_info=True) + processing_errors.append(error_msg) + documents_skipped += 1 + + logger.info( + f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped" + ) + return documents_indexed, documents_skipped, processing_errors + + +async def _process_single_drive_file( + session: AsyncSession, + composio_connector: ComposioGoogleDriveConnector, + file_id: str, + file_name: str, + mime_type: str, + connector_id: int, + search_space_id: int, + user_id: str, + task_logger: TaskLoggingService, + log_entry, +) -> tuple[int, int, list[str]]: + """Process a single Google Drive file for indexing. + + Returns: + Tuple of (documents_indexed, documents_skipped, processing_errors) + """ + processing_errors = [] + + # Generate unique identifier hash + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]) + unique_identifier_hash = generate_unique_identifier_hash( + document_type, f"drive_{file_id}", search_space_id + ) + + # Check if document exists + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + # Get file content + content, content_error = await composio_connector.get_drive_file_content(file_id) + + if content_error or not content: + logger.warning(f"Could not get content for file {file_name}: {content_error}") + # Use metadata as content fallback + markdown_content = f"# {file_name}\n\n" + markdown_content += f"**File ID:** {file_id}\n" + markdown_content += f"**Type:** {mime_type}\n" + elif isinstance(content, dict): + # Safety check: if content is still a dict, log error and use fallback + error_msg = f"Unexpected dict content format for file {file_name}: {list(content.keys())}" + logger.error(error_msg) + processing_errors.append(error_msg) + markdown_content = f"# {file_name}\n\n" + markdown_content += f"**File ID:** {file_id}\n" + markdown_content += f"**Type:** {mime_type}\n" + else: + # Process content based on file type + markdown_content = await _process_file_content( + content=content, + file_name=file_name, + file_id=file_id, + mime_type=mime_type, + search_space_id=search_space_id, + user_id=user_id, + session=session, + task_logger=task_logger, + log_entry=log_entry, + processing_errors=processing_errors, + ) + + content_hash = generate_content_hash(markdown_content, search_space_id) + + if existing_document: + if existing_document.content_hash == content_hash: + return 0, 1, processing_errors # Skipped + + # Update existing document + user_llm = await get_user_long_context_llm(session, user_id, search_space_id) + + if user_llm: + document_metadata = { + "file_id": file_id, + "file_name": file_name, + "mime_type": mime_type, + "document_type": "Google Drive File (Composio)", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = f"Google Drive File: {file_name}\n\nType: {mime_type}" + summary_embedding = config.embedding_model_instance.embed(summary_content) + + chunks = await create_document_chunks(markdown_content) + + existing_document.title = f"Drive: {file_name}" + existing_document.content = 
summary_content + existing_document.content_hash = content_hash + existing_document.embedding = summary_embedding + existing_document.document_metadata = { + "file_id": file_id, + "file_name": file_name, + "FILE_NAME": file_name, # For compatibility + "mime_type": mime_type, + "connector_id": connector_id, + "source": "composio", + } + existing_document.chunks = chunks + existing_document.updated_at = get_current_timestamp() + + return 1, 0, processing_errors # Indexed + + # Create new document + user_llm = await get_user_long_context_llm(session, user_id, search_space_id) + + if user_llm: + document_metadata = { + "file_id": file_id, + "file_name": file_name, + "mime_type": mime_type, + "document_type": "Google Drive File (Composio)", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = f"Google Drive File: {file_name}\n\nType: {mime_type}" + summary_embedding = config.embedding_model_instance.embed(summary_content) + + chunks = await create_document_chunks(markdown_content) + + document = Document( + search_space_id=search_space_id, + title=f"Drive: {file_name}", + document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]), + document_metadata={ + "file_id": file_id, + "file_name": file_name, + "FILE_NAME": file_name, # For compatibility + "mime_type": mime_type, + "connector_id": connector_id, + "toolkit_id": "googledrive", + "source": "composio", + }, + content=summary_content, + content_hash=content_hash, + unique_identifier_hash=unique_identifier_hash, + embedding=summary_embedding, + chunks=chunks, + updated_at=get_current_timestamp(), + ) + session.add(document) + + return 1, 0, processing_errors # Indexed + + +async def _fetch_folder_files_recursively( + composio_connector: ComposioGoogleDriveConnector, + folder_id: str, + max_files: int = 100, + current_count: int = 0, + depth: int = 0, + max_depth: int = 10, +) -> list[dict[str, Any]]: + """ + Recursively fetch files from a Google Drive folder via Composio. 
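+    Recursion depth is capped at max_depth to avoid unbounded traversal of nested folders.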
+ + Args: + composio_connector: The Composio connector instance + folder_id: Google Drive folder ID + max_files: Maximum number of files to fetch + current_count: Current number of files already fetched + depth: Current recursion depth + max_depth: Maximum recursion depth to prevent infinite loops + + Returns: + List of file info dictionaries + """ + if depth >= max_depth: + logger.warning(f"Max recursion depth reached for folder {folder_id}") + return [] + + if current_count >= max_files: + return [] + + all_files = [] + page_token = None + + try: + while len(all_files) + current_count < max_files: + files, next_token, error = await composio_connector.list_drive_files( + folder_id=folder_id, + page_token=page_token, + page_size=min(100, max_files - len(all_files) - current_count), + ) + + if error: + logger.warning( + f"Error fetching files from subfolder {folder_id}: {error}" + ) + break + + for file_info in files: + mime_type = file_info.get("mimeType", "") or file_info.get( + "mime_type", "" + ) + + if mime_type == "application/vnd.google-apps.folder": + # Recursively fetch from subfolders + subfolder_files = await _fetch_folder_files_recursively( + composio_connector, + file_info.get("id"), + max_files=max_files, + current_count=current_count + len(all_files), + depth=depth + 1, + max_depth=max_depth, + ) + all_files.extend(subfolder_files) + else: + all_files.append(file_info) + + if len(all_files) + current_count >= max_files: + break + + if not next_token: + break + page_token = next_token + + return all_files[: max_files - current_count] + + except Exception as e: + logger.error(f"Error in recursive folder fetch: {e!s}") + return all_files diff --git a/surfsense_backend/app/connectors/google_calendar_connector.py b/surfsense_backend/app/connectors/google_calendar_connector.py index 6d389ddd5..d8160cf25 100644 --- a/surfsense_backend/app/connectors/google_calendar_connector.py +++ b/surfsense_backend/app/connectors/google_calendar_connector.py @@ -142,6 +142,15 @@ class GoogleCalendarConnector: flag_modified(connector, "config") await self._session.commit() except Exception as e: + error_str = str(e) + # Check if this is an invalid_grant error (token expired/revoked) + if ( + "invalid_grant" in error_str.lower() + or "token has been expired or revoked" in error_str.lower() + ): + raise Exception( + "Google Calendar authentication failed. Please re-authenticate." 
+ ) from e raise Exception( f"Failed to refresh Google OAuth credentials: {e!s}" ) from e @@ -165,6 +174,14 @@ class GoogleCalendarConnector: self.service = build("calendar", "v3", credentials=credentials) return self.service except Exception as e: + error_str = str(e) + # If the error already contains a user-friendly re-authentication message, preserve it + if ( + "re-authenticate" in error_str.lower() + or "expired or been revoked" in error_str.lower() + or "authentication failed" in error_str.lower() + ): + raise Exception(error_str) from e raise Exception(f"Failed to create Google Calendar service: {e!s}") from e async def get_calendars(self) -> tuple[list[dict[str, Any]], str | None]: @@ -271,6 +288,14 @@ class GoogleCalendarConnector: return events, None except Exception as e: + error_str = str(e) + # If the error already contains a user-friendly re-authentication message, preserve it + if ( + "re-authenticate" in error_str.lower() + or "expired or been revoked" in error_str.lower() + or "authentication failed" in error_str.lower() + ): + return [], error_str return [], f"Error fetching events: {e!s}" def format_event_to_markdown(self, event: dict[str, Any]) -> str: diff --git a/surfsense_backend/app/connectors/google_gmail_connector.py b/surfsense_backend/app/connectors/google_gmail_connector.py index 10008ad73..7c7262bff 100644 --- a/surfsense_backend/app/connectors/google_gmail_connector.py +++ b/surfsense_backend/app/connectors/google_gmail_connector.py @@ -141,6 +141,15 @@ class GoogleGmailConnector: flag_modified(connector, "config") await self._session.commit() except Exception as e: + error_str = str(e) + # Check if this is an invalid_grant error (token expired/revoked) + if ( + "invalid_grant" in error_str.lower() + or "token has been expired or revoked" in error_str.lower() + ): + raise Exception( + "Gmail authentication failed. Please re-authenticate." 
+ ) from e raise Exception( f"Failed to refresh Google OAuth credentials: {e!s}" ) from e @@ -164,6 +173,14 @@ class GoogleGmailConnector: self.service = build("gmail", "v1", credentials=credentials) return self.service except Exception as e: + error_str = str(e) + # If the error already contains a user-friendly re-authentication message, preserve it + if ( + "re-authenticate" in error_str.lower() + or "expired or been revoked" in error_str.lower() + or "authentication failed" in error_str.lower() + ): + raise Exception(error_str) from e raise Exception(f"Failed to create Gmail service: {e!s}") from e async def get_user_profile(self) -> tuple[dict[str, Any], str | None]: @@ -225,6 +242,14 @@ class GoogleGmailConnector: return messages, None except Exception as e: + error_str = str(e) + # If the error already contains a user-friendly re-authentication message, preserve it + if ( + "re-authenticate" in error_str.lower() + or "expired or been revoked" in error_str.lower() + or "authentication failed" in error_str.lower() + ): + return [], error_str return [], f"Error fetching messages list: {e!s}" async def get_message_details( @@ -271,6 +296,13 @@ class GoogleGmailConnector: try: from datetime import datetime, timedelta + # Normalize date values - handle "undefined" strings from frontend + # This prevents "time data 'undefined' does not match format" errors + if start_date == "undefined" or start_date == "": + start_date = None + if end_date == "undefined" or end_date == "": + end_date = None + # Build date query query_parts = [] diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 771f956b3..8b6f3c718 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -55,7 +55,9 @@ class DocumentType(str, Enum): CIRCLEBACK = "CIRCLEBACK" OBSIDIAN_CONNECTOR = "OBSIDIAN_CONNECTOR" NOTE = "NOTE" - COMPOSIO_CONNECTOR = "COMPOSIO_CONNECTOR" # Generic Composio integration + COMPOSIO_GOOGLE_DRIVE_CONNECTOR = "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" + COMPOSIO_GMAIL_CONNECTOR = "COMPOSIO_GMAIL_CONNECTOR" + COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR" class SearchSourceConnectorType(str, Enum): @@ -86,9 +88,9 @@ class SearchSourceConnectorType(str, Enum): "OBSIDIAN_CONNECTOR" # Self-hosted only - Local Obsidian vault indexing ) MCP_CONNECTOR = "MCP_CONNECTOR" # Model Context Protocol - User-defined API tools - COMPOSIO_CONNECTOR = ( - "COMPOSIO_CONNECTOR" # Generic Composio integration (Google, Slack, etc.) 
- ) + COMPOSIO_GOOGLE_DRIVE_CONNECTOR = "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" + COMPOSIO_GMAIL_CONNECTOR = "COMPOSIO_GMAIL_CONNECTOR" + COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR" class LiteLLMProvider(str, Enum): diff --git a/surfsense_backend/app/routes/composio_routes.py b/surfsense_backend/app/routes/composio_routes.py index eecbaf598..a28361132 100644 --- a/surfsense_backend/app/routes/composio_routes.py +++ b/surfsense_backend/app/routes/composio_routes.py @@ -8,16 +8,18 @@ Endpoints: - GET /composio/toolkits - List available Composio toolkits - GET /auth/composio/connector/add - Initiate OAuth for a specific toolkit - GET /auth/composio/connector/callback - Handle OAuth callback +- GET /connectors/{connector_id}/composio-drive/folders - List folders/files for Composio Google Drive """ import logging from uuid import UUID -from fastapi import APIRouter, Depends, HTTPException, Query +from fastapi import APIRouter, Depends, HTTPException, Query, Request from fastapi.responses import RedirectResponse from pydantic import ValidationError from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select from app.config import config from app.db import ( @@ -29,19 +31,31 @@ from app.db import ( from app.services.composio_service import ( COMPOSIO_TOOLKIT_NAMES, INDEXABLE_TOOLKITS, + TOOLKIT_TO_CONNECTOR_TYPE, ComposioService, ) from app.users import current_active_user from app.utils.connector_naming import ( - check_duplicate_connector, - generate_unique_connector_name, + count_connectors_of_type, + get_base_name_for_type, ) from app.utils.oauth_security import OAuthStateManager +# Note: We no longer use check_duplicate_connector for Composio connectors because +# Composio generates a new connected_account_id each time, even for the same Google account. +# Instead, we check for existing connectors by type/space/user and update them. 
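+# When a connector of the same type already exists for the user/space, the callback
+# deletes the previous Composio connected account and stores the new connected_account_id
+# on that connector instead of creating a duplicate.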
+ logger = logging.getLogger(__name__) router = APIRouter() +# Map toolkit_id to frontend connector ID +TOOLKIT_TO_FRONTEND_CONNECTOR_ID = { + "googledrive": "composio-googledrive", + "gmail": "composio-gmail", + "googlecalendar": "composio-googlecalendar", +} + # Initialize security utilities _state_manager = None @@ -166,11 +180,8 @@ async def initiate_composio_auth( @router.get("/auth/composio/connector/callback") async def composio_callback( + request: Request, state: str | None = None, - composio_connected_account_id: str | None = Query( - None, alias="connectedAccountId" - ), # Composio sends camelCase - connected_account_id: str | None = None, # Fallback snake_case error: str | None = None, session: AsyncSession = Depends(get_async_session), ): @@ -236,16 +247,17 @@ async def composio_callback( ) # Initialize Composio service - ComposioService() + service = ComposioService() - # Use camelCase param if provided (Composio's format), fallback to snake_case - final_connected_account_id = ( - composio_connected_account_id or connected_account_id - ) + # Extract connected_account_id from query params (accepts both camelCase and snake_case) + query_params = request.query_params + final_connected_account_id = query_params.get( + "connectedAccountId" + ) or query_params.get("connected_account_id") - # DEBUG: Log all query parameters received + # DEBUG: Log query parameter received logger.info( - f"DEBUG: Callback received - connectedAccountId: {composio_connected_account_id}, connected_account_id: {connected_account_id}, using: {final_connected_account_id}" + f"DEBUG: Callback received - connectedAccountId: {query_params.get('connectedAccountId')}, connected_account_id: {query_params.get('connected_account_id')}, using: {final_connected_account_id}" ) # If we still don't have a connected_account_id, warn but continue @@ -268,38 +280,89 @@ async def composio_callback( "is_indexable": toolkit_id in INDEXABLE_TOOLKITS, } - # Check for duplicate connector - # For Composio, we use toolkit_id + connected_account_id as unique identifier - identifier = final_connected_account_id or f"{toolkit_id}_{user_id}" + # Get the specific connector type for this toolkit + connector_type_str = TOOLKIT_TO_CONNECTOR_TYPE.get(toolkit_id) + if not connector_type_str: + raise HTTPException( + status_code=400, + detail=f"Unknown toolkit: {toolkit_id}. 
Available: {list(TOOLKIT_TO_CONNECTOR_TYPE.keys())}", + ) + connector_type = SearchSourceConnectorType(connector_type_str) - is_duplicate = await check_duplicate_connector( - session, - SearchSourceConnectorType.COMPOSIO_CONNECTOR, - space_id, - user_id, - identifier, + # Check for existing connector of the same type for this user/space + # When reconnecting, Composio gives a new connected_account_id, so we need to + # check by connector_type, user_id, and search_space_id instead of connected_account_id + existing_connector_result = await session.execute( + select(SearchSourceConnector).where( + SearchSourceConnector.connector_type == connector_type, + SearchSourceConnector.search_space_id == space_id, + SearchSourceConnector.user_id == user_id, + ) ) - if is_duplicate: - logger.warning( - f"Duplicate Composio connector detected for user {user_id} with toolkit {toolkit_id}" + existing_connector = existing_connector_result.scalars().first() + + if existing_connector: + # Delete the old Composio connected account before updating + old_connected_account_id = existing_connector.config.get( + "composio_connected_account_id" + ) + if ( + old_connected_account_id + and old_connected_account_id != final_connected_account_id + ): + try: + deleted = await service.delete_connected_account( + old_connected_account_id + ) + if deleted: + logger.info( + f"Deleted old Composio connected account {old_connected_account_id} " + f"before updating connector {existing_connector.id}" + ) + else: + logger.warning( + f"Failed to delete old Composio connected account {old_connected_account_id}" + ) + except Exception as delete_error: + # Log but don't fail - the old account may already be deleted + logger.warning( + f"Error deleting old Composio connected account {old_connected_account_id}: {delete_error!s}" + ) + + # Update existing connector with new connected_account_id + logger.info( + f"Updating existing Composio connector {existing_connector.id} with new connected_account_id {final_connected_account_id}" + ) + existing_connector.config = connector_config + await session.commit() + await session.refresh(existing_connector) + + # Get the frontend connector ID based on toolkit_id + frontend_connector_id = TOOLKIT_TO_FRONTEND_CONNECTOR_ID.get( + toolkit_id, "composio-connector" ) return RedirectResponse( - url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&error=duplicate_account&connector=composio-connector" + url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&success=true&connector={frontend_connector_id}&connectorId={existing_connector.id}" ) try: - # Generate a unique, user-friendly connector name - connector_name = await generate_unique_connector_name( - session, - SearchSourceConnectorType.COMPOSIO_CONNECTOR, - space_id, - user_id, - f"{toolkit_name} (Composio)", + # Count existing connectors of this type to determine the number + count = await count_connectors_of_type( + session, connector_type, space_id, user_id ) + # Generate base name (e.g., "Gmail", "Google Drive") + base_name = get_base_name_for_type(connector_type) + + # Format: "Gmail (Composio) 1", "Gmail (Composio) 2", etc. 
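+            # Worked example (illustrative): with two Composio Gmail connectors
+            # already in this space, count == 2 and base_name == "Gmail", so the
+            # connector created below is named "Gmail (Composio) 3".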
+ if count == 0: + connector_name = f"{base_name} (Composio) 1" + else: + connector_name = f"{base_name} (Composio) {count + 1}" + db_connector = SearchSourceConnector( name=connector_name, - connector_type=SearchSourceConnectorType.COMPOSIO_CONNECTOR, + connector_type=connector_type, config=connector_config, search_space_id=space_id, user_id=user_id, @@ -314,8 +377,12 @@ async def composio_callback( f"Successfully created Composio connector {db_connector.id} for user {user_id}, toolkit {toolkit_id}" ) + # Get the frontend connector ID based on toolkit_id + frontend_connector_id = TOOLKIT_TO_FRONTEND_CONNECTOR_ID.get( + toolkit_id, "composio-connector" + ) return RedirectResponse( - url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&success=true&connector=composio-connector&connectorId={db_connector.id}" + url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&success=true&connector={frontend_connector_id}&connectorId={db_connector.id}" ) except IntegrityError as e: @@ -339,3 +406,136 @@ async def composio_callback( raise HTTPException( status_code=500, detail=f"Failed to complete Composio OAuth: {e!s}" ) from e + + +@router.get("/connectors/{connector_id}/composio-drive/folders") +async def list_composio_drive_folders( + connector_id: int, + parent_id: str | None = None, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """ + List folders AND files in user's Google Drive via Composio with hierarchical support. + + This is called at index time from the manage connector page to display + the complete file system (folders and files). Only folders are selectable. + + Args: + connector_id: ID of the Composio Google Drive connector + parent_id: Optional parent folder ID to list contents (None for root) + + Returns: + JSON with list of items: { + "items": [ + {"id": str, "name": str, "mimeType": str, "isFolder": bool, ...}, + ... + ] + } + """ + if not ComposioService.is_enabled(): + raise HTTPException( + status_code=503, + detail="Composio integration is not enabled.", + ) + + try: + # Get connector and verify ownership + result = await session.execute( + select(SearchSourceConnector).filter( + SearchSourceConnector.id == connector_id, + SearchSourceConnector.user_id == user.id, + SearchSourceConnector.connector_type + == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, + ) + ) + connector = result.scalars().first() + + if not connector: + raise HTTPException( + status_code=404, + detail="Composio Google Drive connector not found or access denied", + ) + + # Get Composio connected account ID from config + composio_connected_account_id = connector.config.get( + "composio_connected_account_id" + ) + if not composio_connected_account_id: + raise HTTPException( + status_code=400, + detail="Composio connected account not found. 
Please reconnect the connector.", + ) + + # Initialize Composio service and fetch files + service = ComposioService() + entity_id = f"surfsense_{user.id}" + + # Fetch files/folders from Composio Google Drive + files, _next_token, error = await service.get_drive_files( + connected_account_id=composio_connected_account_id, + entity_id=entity_id, + folder_id=parent_id, + page_size=100, + ) + + if error: + logger.error(f"Failed to list Composio Drive files: {error}") + raise HTTPException( + status_code=500, detail=f"Failed to list folder contents: {error}" + ) + + # Transform files to match the expected format with isFolder field + items = [] + for file_info in files: + file_id = file_info.get("id", "") or file_info.get("fileId", "") + file_name = ( + file_info.get("name", "") or file_info.get("fileName", "") or "Untitled" + ) + mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "") + + if not file_id: + continue + + is_folder = mime_type == "application/vnd.google-apps.folder" + + items.append( + { + "id": file_id, + "name": file_name, + "mimeType": mime_type, + "isFolder": is_folder, + "parents": file_info.get("parents", []), + "size": file_info.get("size"), + "iconLink": file_info.get("iconLink"), + } + ) + + # Sort: folders first, then files, both alphabetically + folders = sorted( + [item for item in items if item["isFolder"]], + key=lambda x: x["name"].lower(), + ) + files_list = sorted( + [item for item in items if not item["isFolder"]], + key=lambda x: x["name"].lower(), + ) + items = folders + files_list + + folder_count = len(folders) + file_count = len(files_list) + + logger.info( + f"Listed {len(items)} total items ({folder_count} folders, {file_count} files) for Composio connector {connector_id}" + + (f" in folder {parent_id}" if parent_id else " in ROOT") + ) + + return {"items": items} + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error listing Composio Drive contents: {e!s}", exc_info=True) + raise HTTPException( + status_code=500, detail=f"Failed to list Drive contents: {e!s}" + ) from e diff --git a/surfsense_backend/app/routes/google_drive_add_connector_route.py b/surfsense_backend/app/routes/google_drive_add_connector_route.py index e15aed762..6b4159d29 100644 --- a/surfsense_backend/app/routes/google_drive_add_connector_route.py +++ b/surfsense_backend/app/routes/google_drive_add_connector_route.py @@ -402,7 +402,7 @@ async def list_google_drive_folders( file_count = len(items) - folder_count logger.info( - f"✅ Listed {len(items)} total items ({folder_count} folders, {file_count} files) for connector {connector_id}" + f"Listed {len(items)} total items ({folder_count} folders, {file_count} files) for connector {connector_id}" + (f" in folder {parent_id}" if parent_id else " in ROOT") ) diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 07d1dffe5..191c6f954 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -22,6 +22,8 @@ import logging from datetime import UTC, datetime, timedelta from typing import Any +import pytz +from dateutil.parser import isoparse from fastapi import APIRouter, Body, Depends, HTTPException, Query from pydantic import BaseModel, Field, ValidationError from sqlalchemy.exc import IntegrityError @@ -47,6 +49,7 @@ from app.schemas import ( SearchSourceConnectorRead, SearchSourceConnectorUpdate, ) +from 
app.services.composio_service import ComposioService from app.services.notification_service import NotificationService from app.tasks.connector_indexers import ( index_airtable_records, @@ -529,6 +532,38 @@ async def delete_search_source_connector( f"Failed to delete periodic schedule for connector {connector_id}" ) + # For Composio connectors, also delete the connected account in Composio + composio_connector_types = [ + SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, + ] + if db_connector.connector_type in composio_connector_types: + composio_connected_account_id = db_connector.config.get( + "composio_connected_account_id" + ) + if composio_connected_account_id and ComposioService.is_enabled(): + try: + service = ComposioService() + deleted = await service.delete_connected_account( + composio_connected_account_id + ) + if deleted: + logger.info( + f"Successfully deleted Composio connected account {composio_connected_account_id} " + f"for connector {connector_id}" + ) + else: + logger.warning( + f"Failed to delete Composio connected account {composio_connected_account_id} " + f"for connector {connector_id}" + ) + except Exception as composio_error: + # Log but don't fail the deletion - Composio account may already be deleted + logger.warning( + f"Error deleting Composio connected account {composio_connected_account_id}: {composio_error!s}" + ) + await session.delete(db_connector) await session.commit() return {"message": "Search source connector deleted successfully"} @@ -611,32 +646,59 @@ async def index_connector_content( # Handle different connector types response_message = "" - today_str = datetime.now().strftime("%Y-%m-%d") + # Use UTC for consistency with last_indexed_at storage + today_str = datetime.now(UTC).strftime("%Y-%m-%d") # Determine the actual date range to use if start_date is None: # Use last_indexed_at or default to 365 days ago if connector.last_indexed_at: - today = datetime.now().date() - if connector.last_indexed_at.date() == today: - # If last indexed today, go back 1 day to ensure we don't miss anything - indexing_from = (today - timedelta(days=1)).strftime("%Y-%m-%d") - else: - indexing_from = connector.last_indexed_at.strftime("%Y-%m-%d") - else: - indexing_from = (datetime.now() - timedelta(days=365)).strftime( - "%Y-%m-%d" + # Convert last_indexed_at to timezone-naive for comparison (like calculate_date_range does) + last_indexed_naive = ( + connector.last_indexed_at.replace(tzinfo=None) + if connector.last_indexed_at.tzinfo + else connector.last_indexed_at ) + # Use UTC for "today" to match how last_indexed_at is stored + today_utc = datetime.now(UTC).replace(tzinfo=None).date() + last_indexed_date = last_indexed_naive.date() + + if last_indexed_date == today_utc: + # If last indexed today, go back 1 day to ensure we don't miss anything + indexing_from = (today_utc - timedelta(days=1)).strftime("%Y-%m-%d") + else: + indexing_from = last_indexed_naive.strftime("%Y-%m-%d") + else: + indexing_from = ( + datetime.now(UTC).replace(tzinfo=None) - timedelta(days=365) + ).strftime("%Y-%m-%d") else: indexing_from = start_date # For calendar connectors, default to today but allow future dates if explicitly provided if connector.connector_type in [ SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, SearchSourceConnectorType.LUMA_CONNECTOR, ]: # Default to today if no end_date 
provided (users can manually select future dates) indexing_to = today_str if end_date is None else end_date + + # If start_date and end_date are the same, adjust end_date to be one day later + # to ensure valid date range (start_date must be strictly before end_date) + if indexing_from == indexing_to: + dt = isoparse(indexing_to) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=pytz.UTC) + else: + dt = dt.astimezone(pytz.UTC) + # Add one day to end_date to make it strictly after start_date + dt_end = dt + timedelta(days=1) + indexing_to = dt_end.strftime("%Y-%m-%d") + logger.info( + f"Adjusted end_date from {end_date} to {indexing_to} " + f"to ensure valid date range (start_date must be strictly before end_date)" + ) else: # For non-calendar connectors, cap at today indexing_to = end_date if end_date else today_str @@ -887,11 +949,66 @@ async def index_connector_content( ) response_message = "Obsidian vault indexing started in the background." - elif connector.connector_type == SearchSourceConnectorType.COMPOSIO_CONNECTOR: + elif ( + connector.connector_type + == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR + ): from app.tasks.celery_tasks.connector_tasks import ( index_composio_connector_task, ) + # For Composio Google Drive, if drive_items is provided, update connector config + # This allows the UI to pass folder/file selection like the regular Google Drive connector + if drive_items and drive_items.has_items(): + # Update connector config with the selected folders/files + config = connector.config or {} + config["selected_folders"] = [ + {"id": f.id, "name": f.name} for f in drive_items.folders + ] + config["selected_files"] = [ + {"id": f.id, "name": f.name} for f in drive_items.files + ] + if drive_items.indexing_options: + config["indexing_options"] = { + "max_files_per_folder": drive_items.indexing_options.max_files_per_folder, + "incremental_sync": drive_items.indexing_options.incremental_sync, + "include_subfolders": drive_items.indexing_options.include_subfolders, + } + connector.config = config + from sqlalchemy.orm.attributes import flag_modified + + flag_modified(connector, "config") + await session.commit() + await session.refresh(connector) + + logger.info( + f"Triggering Composio Google Drive indexing for connector {connector_id} into search space {search_space_id}, " + f"folders: {len(drive_items.folders)}, files: {len(drive_items.files)}" + ) + else: + logger.info( + f"Triggering Composio Google Drive indexing for connector {connector_id} into search space {search_space_id} " + f"using existing config (from {indexing_from} to {indexing_to})" + ) + + index_composio_connector_task.delay( + connector_id, search_space_id, str(user.id), indexing_from, indexing_to + ) + response_message = ( + "Composio Google Drive indexing started in the background." 
+ ) + + elif connector.connector_type in [ + SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, + ]: + from app.tasks.celery_tasks.connector_tasks import ( + index_composio_connector_task, + ) + + # For Composio Gmail and Calendar, use the same date calculation logic as normal connectors + # This ensures consistent behavior and uses last_indexed_at to reduce API calls + # (includes special case: if indexed today, go back 1 day to avoid missing data) logger.info( f"Triggering Composio connector indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}" ) @@ -943,7 +1060,9 @@ async def _update_connector_timestamp_by_id(session: AsyncSession, connector_id: connector = result.scalars().first() if connector: - connector.last_indexed_at = datetime.now() + connector.last_indexed_at = datetime.now( + UTC + ) # Use UTC for timezone consistency await session.commit() logger.info(f"Updated last_indexed_at for connector {connector_id}") except Exception as e: @@ -1083,18 +1202,24 @@ async def _run_indexing_with_notifications( ) await update_timestamp_func(session, connector_id) + await session.commit() # Commit timestamp update logger.info( f"Indexing completed successfully: {documents_processed} documents processed" ) - # Update notification on success + # Update notification on success (or partial success with errors) if notification: + # Refresh notification to ensure it's not stale after timestamp update commit + await session.refresh(notification) await NotificationService.connector_indexing.notify_indexing_completed( session=session, notification=notification, indexed_count=documents_processed, - error_message=None, + error_message=error_or_warning, # Show errors even if some documents were indexed ) + await ( + session.commit() + ) # Commit to ensure Electric SQL syncs the notification update elif documents_processed > 0: # Update notification to storing stage if notification: @@ -1110,24 +1235,73 @@ async def _run_indexing_with_notifications( f"Indexing completed successfully: {documents_processed} documents processed" ) if notification: + # Refresh notification to ensure it's not stale after indexing function commits + await session.refresh(notification) await NotificationService.connector_indexing.notify_indexing_completed( session=session, notification=notification, indexed_count=documents_processed, - error_message=None, + error_message=error_or_warning, # Show errors even if some documents were indexed ) + await ( + session.commit() + ) # Commit to ensure Electric SQL syncs the notification update else: # No new documents processed - check if this is an error or just no changes if error_or_warning: - # Actual failure - logger.error(f"Indexing failed: {error_or_warning}") - if notification: - await NotificationService.connector_indexing.notify_indexing_completed( - session=session, - notification=notification, - indexed_count=0, - error_message=error_or_warning, - ) + # Check if this is a duplicate warning or empty result (success cases) or an actual error + # Handle both normal and Composio calendar connectors + error_or_warning_lower = ( + str(error_or_warning).lower() if error_or_warning else "" + ) + is_duplicate_warning = "skipped (duplicate)" in error_or_warning_lower + # "No X found" messages are success cases - sync worked, just found nothing in date range + is_empty_result = ( + "no " in error_or_warning_lower + and "found" in error_or_warning_lower + ) + + if 
is_duplicate_warning or is_empty_result: + # These are success cases - sync worked, just found nothing new + logger.info(f"Indexing completed successfully: {error_or_warning}") + # Still update timestamp so ElectricSQL syncs and clears "Syncing" UI + if update_timestamp_func: + await update_timestamp_func(session, connector_id) + await session.commit() # Commit timestamp update + if notification: + # Refresh notification to ensure it's not stale after timestamp update commit + await session.refresh(notification) + # For empty results, use a cleaner message + notification_message = ( + "No new items found in date range" + if is_empty_result + else error_or_warning + ) + await NotificationService.connector_indexing.notify_indexing_completed( + session=session, + notification=notification, + indexed_count=0, + error_message=notification_message, # Pass as warning, not error + is_warning=True, # Flag to indicate this is a warning, not an error + ) + await ( + session.commit() + ) # Commit to ensure Electric SQL syncs the notification update + else: + # Actual failure + logger.error(f"Indexing failed: {error_or_warning}") + if notification: + # Refresh notification to ensure it's not stale after indexing function commits + await session.refresh(notification) + await NotificationService.connector_indexing.notify_indexing_completed( + session=session, + notification=notification, + indexed_count=0, + error_message=error_or_warning, + ) + await ( + session.commit() + ) # Commit to ensure Electric SQL syncs the notification update else: # Success - just no new documents to index (all skipped/unchanged) logger.info( @@ -1136,13 +1310,19 @@ async def _run_indexing_with_notifications( # Still update timestamp so ElectricSQL syncs and clears "Syncing" UI if update_timestamp_func: await update_timestamp_func(session, connector_id) + await session.commit() # Commit timestamp update if notification: + # Refresh notification to ensure it's not stale after timestamp update commit + await session.refresh(notification) await NotificationService.connector_indexing.notify_indexing_completed( session=session, notification=notification, indexed_count=0, error_message=None, # No error - sync succeeded ) + await ( + session.commit() + ) # Commit to ensure Electric SQL syncs the notification update except Exception as e: logger.error(f"Error in indexing task: {e!s}", exc_info=True) @@ -2157,6 +2337,59 @@ async def run_obsidian_indexing( ) +async def run_composio_indexing_with_new_session( + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str, + end_date: str, +): + """ + Create a new session and run the Composio indexing task. + This prevents session leaks by creating a dedicated session for the background task. + """ + async with async_session_maker() as session: + await run_composio_indexing( + session, connector_id, search_space_id, user_id, start_date, end_date + ) + + +async def run_composio_indexing( + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str | None, + end_date: str | None, +): + """ + Run Composio connector indexing with real-time notifications. + + This wraps the Composio indexer with the notification system so that + Electric SQL can sync indexing progress to the frontend in real-time. 
+ + Args: + session: Database session + connector_id: ID of the Composio connector + search_space_id: ID of the search space + user_id: ID of the user + start_date: Start date for indexing + end_date: End date for indexing + """ + from app.tasks.composio_indexer import index_composio_connector + + await _run_indexing_with_notifications( + session=session, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + start_date=start_date, + end_date=end_date, + indexing_function=index_composio_connector, + update_timestamp_func=_update_connector_timestamp_by_id, + ) + + # ============================================================================= # MCP Connector Routes # ============================================================================= diff --git a/surfsense_backend/app/services/composio_service.py b/surfsense_backend/app/services/composio_service.py index 6046ea2d8..ad7841a8b 100644 --- a/surfsense_backend/app/services/composio_service.py +++ b/surfsense_backend/app/services/composio_service.py @@ -39,21 +39,73 @@ COMPOSIO_TOOLKIT_NAMES = { # Toolkits that support indexing (Phase 1: Google services only) INDEXABLE_TOOLKITS = {"googledrive", "gmail", "googlecalendar"} +# Mapping of toolkit IDs to connector types +TOOLKIT_TO_CONNECTOR_TYPE = { + "googledrive": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "gmail": "COMPOSIO_GMAIL_CONNECTOR", + "googlecalendar": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", +} + +# Mapping of toolkit IDs to document types +TOOLKIT_TO_DOCUMENT_TYPE = { + "googledrive": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "gmail": "COMPOSIO_GMAIL_CONNECTOR", + "googlecalendar": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", +} + +# Mapping of toolkit IDs to their indexer functions +# Format: toolkit_id -> (module_path, function_name, supports_date_filter) +# supports_date_filter: True if the indexer accepts start_date/end_date params +TOOLKIT_TO_INDEXER = { + "googledrive": ( + "app.connectors.composio_google_drive_connector", + "index_composio_google_drive", + False, # Google Drive doesn't use date filtering + ), + "gmail": ( + "app.connectors.composio_gmail_connector", + "index_composio_gmail", + True, # Gmail uses date filtering + ), + "googlecalendar": ( + "app.connectors.composio_google_calendar_connector", + "index_composio_google_calendar", + True, # Calendar uses date filtering + ), +} + class ComposioService: """Service for interacting with Composio API.""" - def __init__(self, api_key: str | None = None): + # Default download directory for files from Composio + DEFAULT_DOWNLOAD_DIR = "/tmp/composio_downloads" + + def __init__( + self, api_key: str | None = None, file_download_dir: str | None = None + ): """ Initialize the Composio service. Args: api_key: Composio API key. If not provided, uses config.COMPOSIO_API_KEY. + file_download_dir: Directory for downloaded files. Defaults to /tmp/composio_downloads. 
""" + import os + self.api_key = api_key or config.COMPOSIO_API_KEY if not self.api_key: raise ValueError("COMPOSIO_API_KEY is required but not configured") - self.client = Composio(api_key=self.api_key) + + # Set up download directory + self.file_download_dir = file_download_dir or self.DEFAULT_DOWNLOAD_DIR + os.makedirs(self.file_download_dir, exist_ok=True) + + # Initialize Composio client with download directory + # Per docs: file_download_dir configures where files are downloaded + self.client = Composio( + api_key=self.api_key, file_download_dir=self.file_download_dir + ) @staticmethod def is_enabled() -> bool: @@ -252,7 +304,6 @@ class ComposioService: } ) - logger.info(f"DEBUG: Found {len(result)} TOTAL connections in Composio") return result except Exception as e: logger.error(f"Failed to list all connections: {e!s}") @@ -269,7 +320,6 @@ class ComposioService: List of connected account details. """ try: - logger.info(f"DEBUG: Calling connected_accounts.list(user_id='{user_id}')") accounts_response = self.client.connected_accounts.list(user_id=user_id) # Handle paginated response (may have .items attribute) or direct list @@ -312,6 +362,30 @@ class ComposioService: logger.error(f"Failed to list connections for user {user_id}: {e!s}") return [] + async def delete_connected_account(self, connected_account_id: str) -> bool: + """ + Delete a connected account from Composio. + + This permanently removes the connected account and revokes access tokens. + + Args: + connected_account_id: The Composio connected account ID to delete. + + Returns: + True if deletion was successful, False otherwise. + """ + try: + self.client.connected_accounts.delete(connected_account_id) + logger.info( + f"Successfully deleted Composio connected account: {connected_account_id}" + ) + return True + except Exception as e: + logger.error( + f"Failed to delete Composio connected account {connected_account_id}: {e!s}" + ) + return False + async def execute_tool( self, connected_account_id: str, @@ -338,7 +412,6 @@ class ComposioService: # - connected_account_id: for authentication # - user_id: user identifier (SDK uses user_id, not entity_id) # - dangerously_skip_version_check: skip version check for manual execution - logger.info(f"DEBUG: Executing tool {tool_name} with params: {params}") result = self.client.tools.execute( slug=tool_name, connected_account_id=connected_account_id, @@ -346,8 +419,6 @@ class ComposioService: arguments=params or {}, dangerously_skip_version_check=True, ) - logger.info(f"DEBUG: Tool {tool_name} raw result type: {type(result)}") - logger.info(f"DEBUG: Tool {tool_name} raw result: {result}") return {"success": True, "data": result} except Exception as e: logger.error(f"Failed to execute tool {tool_name}: {e!s}") @@ -382,7 +453,15 @@ class ComposioService: "page_size": min(page_size, 100), } if folder_id: - params["folder_id"] = folder_id + # List contents of a specific folder (exclude shortcuts - we don't have access to them) + params["q"] = ( + f"'{folder_id}' in parents and trashed = false and mimeType != 'application/vnd.google-apps.shortcut'" + ) + else: + # List root-level items only (My Drive root), exclude shortcuts + params["q"] = ( + "'root' in parents and trashed = false and mimeType != 'application/vnd.google-apps.shortcut'" + ) if page_token: params["page_token"] = page_token @@ -397,9 +476,6 @@ class ComposioService: return [], None, result.get("error", "Unknown error") data = result.get("data", {}) - logger.info( - f"DEBUG: Drive data type: {type(data)}, keys: 
{data.keys() if isinstance(data, dict) else 'N/A'}" - ) # Handle nested response structure from Composio files = [] @@ -415,7 +491,6 @@ class ComposioService: elif isinstance(data, list): files = data - logger.info(f"DEBUG: Extracted {len(files)} drive files") return files, next_token, None except Exception as e: @@ -428,6 +503,10 @@ class ComposioService: """ Download file content from Google Drive via Composio. + Per Composio docs: When tools return files, they are automatically downloaded + to a local directory, and the local file path is provided in the response. + Response includes: file_path, file_name, size fields. + Args: connected_account_id: Composio connected account ID. entity_id: The entity/user ID that owns the connected account. @@ -436,27 +515,264 @@ class ComposioService: Returns: Tuple of (file content bytes, error message). """ + from pathlib import Path + try: result = await self.execute_tool( connected_account_id=connected_account_id, tool_name="GOOGLEDRIVE_DOWNLOAD_FILE", - params={"file_id": file_id}, # snake_case + params={"file_id": file_id}, entity_id=entity_id, ) if not result.get("success"): return None, result.get("error", "Unknown error") - content = result.get("data") - if isinstance(content, str): - content = content.encode("utf-8") + data = result.get("data") + if not data: + return None, "No data returned from Composio" - return content, None + # Per Composio docs, response includes file_path where file was downloaded + # Response structure: {data: {...}, error: ..., successful: ...} + # The actual file info is nested inside data["data"] + file_path = None + + if isinstance(data, dict): + # Handle nested response structure: data contains {data, error, successful} + # The actual file info is in data["data"] + inner_data = data + if "data" in data and isinstance(data["data"], dict): + inner_data = data["data"] + logger.debug( + f"Found nested data structure. 
Inner keys: {list(inner_data.keys())}" + ) + elif "successful" in data and "data" in data: + # Standard Composio response wrapper + inner_data = data["data"] if data["data"] else data + + # Try documented fields: file_path, downloaded_file_content, path, uri + file_path = ( + inner_data.get("file_path") + or inner_data.get("downloaded_file_content") + or inner_data.get("path") + or inner_data.get("uri") + ) + + # Handle nested dict case where downloaded_file_content contains the path + if isinstance(file_path, dict): + file_path = ( + file_path.get("file_path") + or file_path.get("downloaded_file_content") + or file_path.get("path") + or file_path.get("uri") + ) + + # If still no path, check if inner_data itself has the nested structure + if not file_path and isinstance(inner_data, dict): + for key in ["downloaded_file_content", "file_path", "path", "uri"]: + if key in inner_data: + val = inner_data[key] + if isinstance(val, str): + file_path = val + break + elif isinstance(val, dict): + # One more level of nesting + file_path = ( + val.get("file_path") + or val.get("downloaded_file_content") + or val.get("path") + or val.get("uri") + ) + if file_path: + break + + logger.debug( + f"Composio response keys: {list(data.keys())}, inner keys: {list(inner_data.keys()) if isinstance(inner_data, dict) else 'N/A'}, extracted path: {file_path}" + ) + elif isinstance(data, str): + # Direct string response (could be path or content) + file_path = data + elif isinstance(data, bytes): + # Direct bytes response + return data, None + + # Read file from the path + if file_path and isinstance(file_path, str): + path_obj = Path(file_path) + + # Check if it's a valid file path (absolute or in .composio directory) + if path_obj.is_absolute() or ".composio" in str(path_obj): + try: + if path_obj.exists(): + content = path_obj.read_bytes() + logger.info( + f"Successfully read {len(content)} bytes from Composio file: {file_path}" + ) + return content, None + else: + logger.warning( + f"File path from Composio does not exist: {file_path}" + ) + return None, f"File not found at path: {file_path}" + except Exception as e: + logger.error( + f"Failed to read file from Composio path {file_path}: {e!s}" + ) + return None, f"Failed to read file: {e!s}" + else: + # Not a file path - might be base64 encoded content + try: + import base64 + + content = base64.b64decode(file_path) + return content, None + except Exception: + # Not base64, return as UTF-8 bytes + return file_path.encode("utf-8"), None + + # If we got here, couldn't extract file path + if isinstance(data, dict): + # Log full structure for debugging + inner_data = data.get("data", {}) + logger.warning( + f"Could not extract file path from Composio response. " + f"Top keys: {list(data.keys())}, " + f"Inner data keys: {list(inner_data.keys()) if isinstance(inner_data, dict) else type(inner_data).__name__}, " + f"Full inner data: {inner_data}" + ) + return ( + None, + f"No file path in Composio response. Keys: {list(data.keys())}, inner: {list(inner_data.keys()) if isinstance(inner_data, dict) else 'N/A'}", + ) + + return None, f"Unexpected data type from Composio: {type(data).__name__}" except Exception as e: logger.error(f"Failed to get Drive file content: {e!s}") return None, str(e) + async def get_drive_start_page_token( + self, connected_account_id: str, entity_id: str + ) -> tuple[str | None, str | None]: + """ + Get the starting page token for Google Drive change tracking. + + This token represents the current state and is used for future delta syncs. 
+ Per Composio docs: Use GOOGLEDRIVE_GET_CHANGES_START_PAGE_TOKEN to get initial token. + + Args: + connected_account_id: Composio connected account ID. + entity_id: The entity/user ID that owns the connected account. + + Returns: + Tuple of (start_page_token, error message). + """ + try: + result = await self.execute_tool( + connected_account_id=connected_account_id, + tool_name="GOOGLEDRIVE_GET_CHANGES_START_PAGE_TOKEN", + params={}, + entity_id=entity_id, + ) + + if not result.get("success"): + return None, result.get("error", "Unknown error") + + data = result.get("data", {}) + # Handle nested response: {data: {startPageToken: ...}, successful: ...} + if isinstance(data, dict): + inner_data = data.get("data", data) + token = ( + inner_data.get("startPageToken") + or inner_data.get("start_page_token") + or data.get("startPageToken") + or data.get("start_page_token") + ) + if token: + logger.info(f"Got Drive start page token: {token}") + return token, None + + logger.warning(f"Could not extract start page token from response: {data}") + return None, "No start page token in response" + + except Exception as e: + logger.error(f"Failed to get Drive start page token: {e!s}") + return None, str(e) + + async def list_drive_changes( + self, + connected_account_id: str, + entity_id: str, + page_token: str | None = None, + page_size: int = 100, + include_removed: bool = True, + ) -> tuple[list[dict[str, Any]], str | None, str | None]: + """ + List changes in Google Drive since the given page token. + + Per Composio docs: GOOGLEDRIVE_LIST_CHANGES tracks modifications to files/folders. + If pageToken is not provided, it auto-fetches the current start page token. + Response includes nextPageToken for pagination and newStartPageToken for future syncs. + + Args: + connected_account_id: Composio connected account ID. + entity_id: The entity/user ID that owns the connected account. + page_token: Page token from previous sync (optional - will auto-fetch if not provided). + page_size: Number of changes per page. + include_removed: Whether to include removed items in the response. + + Returns: + Tuple of (changes list, new_start_page_token, error message). + """ + try: + params = { + "pageSize": min(page_size, 100), + "includeRemoved": include_removed, + } + if page_token: + params["pageToken"] = page_token + + result = await self.execute_tool( + connected_account_id=connected_account_id, + tool_name="GOOGLEDRIVE_LIST_CHANGES", + params=params, + entity_id=entity_id, + ) + + if not result.get("success"): + return [], None, result.get("error", "Unknown error") + + data = result.get("data", {}) + + # Handle nested response structure + changes = [] + new_start_token = None + + if isinstance(data, dict): + inner_data = data.get("data", data) + changes = inner_data.get("changes", []) or data.get("changes", []) + + # Get the token for next sync + # newStartPageToken is returned when all changes have been fetched + # nextPageToken is for pagination within the current fetch + new_start_token = ( + inner_data.get("newStartPageToken") + or inner_data.get("new_start_page_token") + or inner_data.get("nextPageToken") + or inner_data.get("next_page_token") + or data.get("newStartPageToken") + or data.get("nextPageToken") + ) + + logger.info( + f"Got {len(changes)} Drive changes, new token: {new_start_token[:20] if new_start_token else 'None'}..." 
+            )
+            return changes, new_start_token, None
+
+        except Exception as e:
+            logger.error(f"Failed to list Drive changes: {e!s}")
+            return [], None, str(e)
+
     # ===== Gmail specific methods =====
 
     async def get_gmail_messages(
@@ -464,25 +780,30 @@ class ComposioService:
         connected_account_id: str,
         entity_id: str,
         query: str = "",
-        max_results: int = 100,
-    ) -> tuple[list[dict[str, Any]], str | None]:
+        max_results: int = 50,
+        page_token: str | None = None,
+    ) -> tuple[list[dict[str, Any]], str | None, int | None, str | None]:
         """
-        List Gmail messages via Composio.
+        List Gmail messages via Composio with pagination support.
 
         Args:
             connected_account_id: Composio connected account ID.
             entity_id: The entity/user ID that owns the connected account.
             query: Gmail search query.
-            max_results: Maximum number of messages to return.
+            max_results: Maximum number of messages to return per page (default: 50 to avoid payload size issues).
+            page_token: Optional pagination token for next page.
 
         Returns:
-            Tuple of (messages list, error message).
+            Tuple of (messages list, next_page_token, result_size_estimate, error message).
         """
         try:
-            # Composio uses snake_case for parameters, max is 500
-            params = {"max_results": min(max_results, 500)}
+            # Use smaller batch size to avoid 413 payload too large errors
+            # Composio uses snake_case for parameters
+            params = {"max_results": min(max_results, 50)}  # Reduced from 500 to 50
             if query:
                 params["query"] = query  # Composio uses 'query' not 'q'
+            if page_token:
+                params["page_token"] = page_token
 
             result = await self.execute_tool(
                 connected_account_id=connected_account_id,
@@ -492,31 +813,42 @@ class ComposioService:
             )
 
             if not result.get("success"):
-                return [], result.get("error", "Unknown error")
+                return [], None, None, result.get("error", "Unknown error")
 
             data = result.get("data", {})
-            logger.info(
-                f"DEBUG: Gmail data type: {type(data)}, keys: {data.keys() if isinstance(data, dict) else 'N/A'}"
-            )
-            logger.info(f"DEBUG: Gmail full data: {data}")
 
             # Try different possible response structures
             messages = []
+            next_token = None
+            result_size_estimate = None
             if isinstance(data, dict):
                 messages = (
                     data.get("messages", [])
                     or data.get("data", {}).get("messages", [])
                     or data.get("emails", [])
                 )
+                # Check for pagination token in various possible locations
+                next_token = (
+                    data.get("nextPageToken")
+                    or data.get("next_page_token")
+                    or data.get("data", {}).get("nextPageToken")
+                    or data.get("data", {}).get("next_page_token")
+                )
+                # Extract resultSizeEstimate if available (Gmail API provides this)
+                result_size_estimate = (
+                    data.get("resultSizeEstimate")
+                    or data.get("result_size_estimate")
+                    or data.get("data", {}).get("resultSizeEstimate")
+                    or data.get("data", {}).get("result_size_estimate")
+                )
             elif isinstance(data, list):
                 messages = data
 
-            logger.info(f"DEBUG: Extracted {len(messages)} messages")
-            return messages, None
+            return messages, next_token, result_size_estimate, None
 
         except Exception as e:
             logger.error(f"Failed to list Gmail messages: {e!s}")
-            return [], str(e)
+            return [], None, None, str(e)
 
     async def get_gmail_message_detail(
         self, connected_account_id: str, entity_id: str, message_id: str
@@ -595,10 +927,6 @@ class ComposioService:
                 return [], result.get("error", "Unknown error")
 
             data = result.get("data", {})
-            logger.info(
-                f"DEBUG: Calendar data type: {type(data)}, keys: {data.keys() if isinstance(data, dict) else 'N/A'}"
-            )
-            logger.info(f"DEBUG: Calendar full data: {data}")
 
             # Try different possible response structures
             events = []
@@ -611,7 +939,6 @@ class
ComposioService: elif isinstance(data, list): events = data - logger.info(f"DEBUG: Extracted {len(events)} calendar events") return events, None except Exception as e: diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py index dc43697e7..4c5599815 100644 --- a/surfsense_backend/app/services/connector_service.py +++ b/surfsense_backend/app/services/connector_service.py @@ -2871,3 +2871,350 @@ class ConnectorService: } return result_object, obsidian_docs + + # ========================================================================= + # Composio Connector Search Methods + # ========================================================================= + + async def search_composio_google_drive( + self, + user_query: str, + search_space_id: int, + top_k: int = 20, + start_date: datetime | None = None, + end_date: datetime | None = None, + ) -> tuple: + """ + Search for Composio Google Drive files and return both the source information + and langchain documents. + + Uses combined chunk-level and document-level hybrid search with RRF fusion. + + Args: + user_query: The user's query + search_space_id: The search space ID to search in + top_k: Maximum number of results to return + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at + + Returns: + tuple: (sources_info, langchain_documents) + """ + composio_drive_docs = await self._combined_rrf_search( + query_text=user_query, + search_space_id=search_space_id, + document_type="COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + top_k=top_k, + start_date=start_date, + end_date=end_date, + ) + + # Early return if no results + if not composio_drive_docs: + return { + "id": 54, + "name": "Google Drive (Composio)", + "type": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "sources": [], + }, [] + + def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return ( + doc_info.get("title") + or metadata.get("title") + or metadata.get("file_name") + or "Untitled Document" + ) + + def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return metadata.get("url") or metadata.get("web_view_link") or "" + + def _description_fn( + chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> str: + description = self._chunk_preview(chunk.get("content", ""), limit=200) + info_parts = [] + mime_type = metadata.get("mime_type") + modified_time = metadata.get("modified_time") + if mime_type: + info_parts.append(f"Type: {mime_type}") + if modified_time: + info_parts.append(f"Modified: {modified_time}") + if info_parts: + description = (description + " | " + " | ".join(info_parts)).strip(" |") + return description + + def _extra_fields_fn( + _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> dict[str, Any]: + return { + "mime_type": metadata.get("mime_type", ""), + "file_id": metadata.get("file_id", ""), + "modified_time": metadata.get("modified_time", ""), + } + + sources_list = self._build_chunk_sources_from_documents( + composio_drive_docs, + title_fn=_title_fn, + url_fn=_url_fn, + description_fn=_description_fn, + extra_fields_fn=_extra_fields_fn, + ) + + # Create result object + result_object = { + "id": 54, + "name": "Google Drive (Composio)", + "type": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "sources": sources_list, + } + + return result_object, composio_drive_docs + + async def search_composio_gmail( + self, + user_query: str, + search_space_id: int, + 
top_k: int = 20, + start_date: datetime | None = None, + end_date: datetime | None = None, + ) -> tuple: + """ + Search for Composio Gmail messages and return both the source information + and langchain documents. + + Uses combined chunk-level and document-level hybrid search with RRF fusion. + + Args: + user_query: The user's query + search_space_id: The search space ID to search in + top_k: Maximum number of results to return + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at + + Returns: + tuple: (sources_info, langchain_documents) + """ + composio_gmail_docs = await self._combined_rrf_search( + query_text=user_query, + search_space_id=search_space_id, + document_type="COMPOSIO_GMAIL_CONNECTOR", + top_k=top_k, + start_date=start_date, + end_date=end_date, + ) + + # Early return if no results + if not composio_gmail_docs: + return { + "id": 55, + "name": "Gmail (Composio)", + "type": "COMPOSIO_GMAIL_CONNECTOR", + "sources": [], + }, [] + + def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return ( + doc_info.get("title") + or metadata.get("subject") + or metadata.get("title") + or "Untitled Email" + ) + + def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return metadata.get("url") or "" + + def _description_fn( + chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> str: + description = self._chunk_preview(chunk.get("content", ""), limit=200) + info_parts = [] + sender = metadata.get("from") or metadata.get("sender") + date = metadata.get("date") or metadata.get("received_at") + if sender: + info_parts.append(f"From: {sender}") + if date: + info_parts.append(f"Date: {date}") + if info_parts: + description = (description + " | " + " | ".join(info_parts)).strip(" |") + return description + + def _extra_fields_fn( + _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> dict[str, Any]: + return { + "message_id": metadata.get("message_id", ""), + "thread_id": metadata.get("thread_id", ""), + "from": metadata.get("from", ""), + "to": metadata.get("to", ""), + "date": metadata.get("date", ""), + } + + sources_list = self._build_chunk_sources_from_documents( + composio_gmail_docs, + title_fn=_title_fn, + url_fn=_url_fn, + description_fn=_description_fn, + extra_fields_fn=_extra_fields_fn, + ) + + # Create result object + result_object = { + "id": 55, + "name": "Gmail (Composio)", + "type": "COMPOSIO_GMAIL_CONNECTOR", + "sources": sources_list, + } + + return result_object, composio_gmail_docs + + async def search_composio_google_calendar( + self, + user_query: str, + search_space_id: int, + top_k: int = 20, + start_date: datetime | None = None, + end_date: datetime | None = None, + ) -> tuple: + """ + Search for Composio Google Calendar events and return both the source information + and langchain documents. + + Uses combined chunk-level and document-level hybrid search with RRF fusion. 
+ + Args: + user_query: The user's query + search_space_id: The search space ID to search in + top_k: Maximum number of results to return + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at + + Returns: + tuple: (sources_info, langchain_documents) + """ + composio_calendar_docs = await self._combined_rrf_search( + query_text=user_query, + search_space_id=search_space_id, + document_type="COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", + top_k=top_k, + start_date=start_date, + end_date=end_date, + ) + + # Early return if no results + if not composio_calendar_docs: + return { + "id": 56, + "name": "Google Calendar (Composio)", + "type": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", + "sources": [], + }, [] + + def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return ( + doc_info.get("title") + or metadata.get("summary") + or metadata.get("title") + or "Untitled Event" + ) + + def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return metadata.get("url") or metadata.get("html_link") or "" + + def _description_fn( + chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> str: + description = self._chunk_preview(chunk.get("content", ""), limit=200) + info_parts = [] + start_time = metadata.get("start_time") or metadata.get("start") + end_time = metadata.get("end_time") or metadata.get("end") + if start_time: + info_parts.append(f"Start: {start_time}") + if end_time: + info_parts.append(f"End: {end_time}") + if info_parts: + description = (description + " | " + " | ".join(info_parts)).strip(" |") + return description + + def _extra_fields_fn( + _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> dict[str, Any]: + return { + "event_id": metadata.get("event_id", ""), + "calendar_id": metadata.get("calendar_id", ""), + "start_time": metadata.get("start_time", ""), + "end_time": metadata.get("end_time", ""), + "location": metadata.get("location", ""), + } + + sources_list = self._build_chunk_sources_from_documents( + composio_calendar_docs, + title_fn=_title_fn, + url_fn=_url_fn, + description_fn=_description_fn, + extra_fields_fn=_extra_fields_fn, + ) + + # Create result object + result_object = { + "id": 56, + "name": "Google Calendar (Composio)", + "type": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", + "sources": sources_list, + } + + return result_object, composio_calendar_docs + + # ========================================================================= + # Utility Methods for Connector Discovery + # ========================================================================= + + async def get_available_connectors( + self, + search_space_id: int, + ) -> list[SearchSourceConnectorType]: + """ + Get all available (enabled) connector types for a search space. + + Args: + search_space_id: The search space ID + + Returns: + List of SearchSourceConnectorType enums for enabled connectors + """ + query = ( + select(SearchSourceConnector.connector_type) + .filter( + SearchSourceConnector.search_space_id == search_space_id, + ) + .distinct() + ) + + result = await self.session.execute(query) + connector_types = result.scalars().all() + return list(connector_types) + + async def get_available_document_types( + self, + search_space_id: int, + ) -> list[str]: + """ + Get all document types that have at least one document in the search space. 
+ + Args: + search_space_id: The search space ID + + Returns: + List of document type strings that have documents indexed + """ + from sqlalchemy import distinct + + from app.db import Document + + query = select(distinct(Document.document_type)).filter( + Document.search_space_id == search_space_id, + ) + + result = await self.session.execute(query) + doc_types = result.scalars().all() + return [str(dt) for dt in doc_types] diff --git a/surfsense_backend/app/services/notification_service.py b/surfsense_backend/app/services/notification_service.py index 836daeb9e..04f39d8ef 100644 --- a/surfsense_backend/app/services/notification_service.py +++ b/surfsense_backend/app/services/notification_service.py @@ -335,6 +335,7 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): notification: Notification, indexed_count: int, error_message: str | None = None, + is_warning: bool = False, ) -> Notification: """ Update notification when connector indexing completes. @@ -343,7 +344,8 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): session: Database session notification: Notification to update indexed_count: Total number of items indexed - error_message: Error message if indexing failed (optional) + error_message: Error message if indexing failed, or warning message (optional) + is_warning: If True, treat error_message as a warning (success case) rather than an error Returns: Updated notification @@ -352,10 +354,26 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): "connector_name", "Connector" ) + # If there's an error message but items were indexed, treat it as a warning (partial success) + # If is_warning is True, treat it as success even with 0 items (e.g., duplicates found) + # Otherwise, treat it as a failure if error_message: - title = f"Failed: {connector_name}" - message = f"Sync failed: {error_message}" - status = "failed" + if indexed_count > 0: + # Partial success with warnings (e.g., duplicate content from other connectors) + title = f"Ready: {connector_name}" + item_text = "item" if indexed_count == 1 else "items" + message = f"Now searchable! {indexed_count} {item_text} synced. Note: {error_message}" + status = "completed" + elif is_warning: + # Warning case (e.g., duplicates found) - treat as success + title = f"Ready: {connector_name}" + message = f"Sync completed. 
{error_message}" + status = "completed" + else: + # Complete failure + title = f"Failed: {connector_name}" + message = f"Sync failed: {error_message}" + status = "failed" else: title = f"Ready: {connector_name}" if indexed_count == 0: @@ -367,7 +385,9 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): metadata_updates = { "indexed_count": indexed_count, - "sync_stage": "completed" if not error_message else "failed", + "sync_stage": "completed" + if (not error_message or is_warning or indexed_count > 0) + else "failed", "error_message": error_message, } diff --git a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py index b90ff753f..d0710d246 100644 --- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py @@ -810,8 +810,8 @@ def index_composio_connector_task( connector_id: int, search_space_id: int, user_id: str, - start_date: str, - end_date: str, + start_date: str | None, + end_date: str | None, ): """Celery task to index Composio connector content (Google Drive, Gmail, Calendar via Composio).""" import asyncio @@ -833,14 +833,16 @@ async def _index_composio_connector( connector_id: int, search_space_id: int, user_id: str, - start_date: str, - end_date: str, + start_date: str | None, + end_date: str | None, ): - """Index Composio connector content with new session.""" - # Import from tasks folder (not connector_indexers) to avoid circular import - from app.tasks.composio_indexer import index_composio_connector + """Index Composio connector content with new session and real-time notifications.""" + # Import from routes to use the notification-wrapped version + from app.routes.search_source_connectors_routes import ( + run_composio_indexing, + ) async with get_celery_session_maker()() as session: - await index_composio_connector( + await run_composio_indexing( session, connector_id, search_space_id, user_id, start_date, end_date ) diff --git a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py index 21855f73f..bf80cbe78 100644 --- a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py +++ b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py @@ -66,6 +66,7 @@ async def _check_and_trigger_schedules(): from app.tasks.celery_tasks.connector_tasks import ( index_airtable_records_task, index_clickup_tasks_task, + index_composio_connector_task, index_confluence_pages_task, index_crawled_urls_task, index_discord_messages_task, @@ -98,6 +99,10 @@ async def _check_and_trigger_schedules(): SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task, SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task, SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR: index_google_drive_files_task, + # Composio connector types + SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: index_composio_connector_task, + SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR: index_composio_connector_task, + SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: index_composio_connector_task, } # Trigger indexing for each due connector diff --git a/surfsense_backend/app/tasks/composio_indexer.py b/surfsense_backend/app/tasks/composio_indexer.py index abb238924..f97652114 100644 --- a/surfsense_backend/app/tasks/composio_indexer.py +++ b/surfsense_backend/app/tasks/composio_indexer.py @@ -2,83 +2,76 
@@ Composio connector indexer. Routes indexing requests to toolkit-specific handlers (Google Drive, Gmail, Calendar). +Uses a registry pattern for clean, extensible connector routing. Note: This module is intentionally placed in app/tasks/ (not in connector_indexers/) to avoid circular import issues with the connector_indexers package. """ import logging -from datetime import UTC, datetime +from importlib import import_module from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select -from sqlalchemy.orm import selectinload -from app.config import config -from app.connectors.composio_connector import ComposioConnector from app.db import ( - Document, - DocumentType, SearchSourceConnector, SearchSourceConnectorType, ) -from app.services.composio_service import INDEXABLE_TOOLKITS -from app.services.llm_service import get_user_long_context_llm +from app.services.composio_service import INDEXABLE_TOOLKITS, TOOLKIT_TO_INDEXER from app.services.task_logging_service import TaskLoggingService -from app.utils.document_converters import ( - create_document_chunks, - generate_content_hash, - generate_document_summary, - generate_unique_identifier_hash, -) # Set up logging logger = logging.getLogger(__name__) -# ============ Utility functions (copied from connector_indexers.base to avoid circular imports) ============ +# Valid Composio connector types +COMPOSIO_CONNECTOR_TYPES = { + SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, +} -def get_current_timestamp() -> datetime: - """Get the current timestamp with timezone for updated_at field.""" - return datetime.now(UTC) - - -async def check_document_by_unique_identifier( - session: AsyncSession, unique_identifier_hash: str -) -> Document | None: - """Check if a document with the given unique identifier hash already exists.""" - existing_doc_result = await session.execute( - select(Document) - .options(selectinload(Document.chunks)) - .where(Document.unique_identifier_hash == unique_identifier_hash) - ) - return existing_doc_result.scalars().first() +# ============ Utility functions ============ async def get_connector_by_id( - session: AsyncSession, connector_id: int, connector_type: SearchSourceConnectorType + session: AsyncSession, + connector_id: int, + connector_type: SearchSourceConnectorType | None, ) -> SearchSourceConnector | None: - """Get a connector by ID and type from the database.""" - result = await session.execute( - select(SearchSourceConnector).filter( - SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type == connector_type, - ) + """Get a connector by ID and optionally by type from the database.""" + query = select(SearchSourceConnector).filter( + SearchSourceConnector.id == connector_id ) + if connector_type is not None: + query = query.filter(SearchSourceConnector.connector_type == connector_type) + result = await session.execute(query) return result.scalars().first() -async def update_connector_last_indexed( - session: AsyncSession, - connector: SearchSourceConnector, - update_last_indexed: bool = True, -) -> None: - """Update the last_indexed_at timestamp for a connector.""" - if update_last_indexed: - connector.last_indexed_at = datetime.now() - logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") +def get_indexer_function(toolkit_id: str): + """ + Dynamically import and return the indexer 
function for a toolkit. + + Args: + toolkit_id: The toolkit ID (e.g., "googledrive", "gmail") + + Returns: + Tuple of (indexer_function, supports_date_filter) + + Raises: + ValueError: If toolkit not found in registry + """ + if toolkit_id not in TOOLKIT_TO_INDEXER: + raise ValueError(f"No indexer registered for toolkit: {toolkit_id}") + + module_path, function_name, supports_date_filter = TOOLKIT_TO_INDEXER[toolkit_id] + module = import_module(module_path) + indexer_func = getattr(module, function_name) + return indexer_func, supports_date_filter # ============ Main indexer function ============ @@ -98,6 +91,7 @@ async def index_composio_connector( Index content from a Composio connector. Routes to toolkit-specific indexing based on the connector's toolkit_id. + Uses a registry pattern for clean, extensible connector routing. Args: session: Database session @@ -129,10 +123,16 @@ async def index_composio_connector( ) try: - # Get connector by id - connector = await get_connector_by_id( - session, connector_id, SearchSourceConnectorType.COMPOSIO_CONNECTOR - ) + # Get connector by id - accept any Composio connector type + connector = await get_connector_by_id(session, connector_id, None) + + # Validate it's a Composio connector + if connector and connector.connector_type not in COMPOSIO_CONNECTOR_TYPES: + error_msg = f"Connector {connector_id} is not a Composio connector" + await task_logger.log_task_failure( + log_entry, error_msg, {"error_type": "InvalidConnectorType"} + ) + return 0, error_msg if not connector: error_msg = f"Composio connector with ID {connector_id} not found" @@ -160,53 +160,35 @@ async def index_composio_connector( ) return 0, error_msg - # Route to toolkit-specific indexer - if toolkit_id == "googledrive": - return await _index_composio_google_drive( - session=session, - connector=connector, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - task_logger=task_logger, - log_entry=log_entry, - update_last_indexed=update_last_indexed, - max_items=max_items, - ) - elif toolkit_id == "gmail": - return await _index_composio_gmail( - session=session, - connector=connector, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - start_date=start_date, - end_date=end_date, - task_logger=task_logger, - log_entry=log_entry, - update_last_indexed=update_last_indexed, - max_items=max_items, - ) - elif toolkit_id == "googlecalendar": - return await _index_composio_google_calendar( - session=session, - connector=connector, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - start_date=start_date, - end_date=end_date, - task_logger=task_logger, - log_entry=log_entry, - update_last_indexed=update_last_indexed, - max_items=max_items, - ) - else: - error_msg = f"No indexer implemented for toolkit: {toolkit_id}" + # Get indexer function from registry + try: + indexer_func, supports_date_filter = get_indexer_function(toolkit_id) + except ValueError as e: await task_logger.log_task_failure( - log_entry, error_msg, {"error_type": "NoIndexerImplemented"} + log_entry, str(e), {"error_type": "NoIndexerImplemented"} ) - return 0, error_msg + return 0, str(e) + + # Build kwargs for the indexer function + kwargs = { + "session": session, + "connector": connector, + "connector_id": connector_id, + "search_space_id": search_space_id, + "user_id": user_id, + "task_logger": task_logger, + "log_entry": log_entry, + "update_last_indexed": update_last_indexed, + "max_items": max_items, + } + + # Add date params 
for toolkits that support them + if supports_date_filter: + kwargs["start_date"] = start_date + kwargs["end_date"] = end_date + + # Call the toolkit-specific indexer + return await indexer_func(**kwargs) except SQLAlchemyError as db_error: await session.rollback() @@ -228,714 +210,3 @@ async def index_composio_connector( ) logger.error(f"Failed to index Composio connector: {e!s}", exc_info=True) return 0, f"Failed to index Composio connector: {e!s}" - - -async def _index_composio_google_drive( - session: AsyncSession, - connector, - connector_id: int, - search_space_id: int, - user_id: str, - task_logger: TaskLoggingService, - log_entry, - update_last_indexed: bool = True, - max_items: int = 1000, -) -> tuple[int, str]: - """Index Google Drive files via Composio.""" - try: - composio_connector = ComposioConnector(session, connector_id) - - await task_logger.log_task_progress( - log_entry, - f"Fetching Google Drive files via Composio for connector {connector_id}", - {"stage": "fetching_files"}, - ) - - # Fetch files - all_files = [] - page_token = None - - while len(all_files) < max_items: - files, next_token, error = await composio_connector.list_drive_files( - page_token=page_token, - page_size=min(100, max_items - len(all_files)), - ) - - if error: - await task_logger.log_task_failure( - log_entry, f"Failed to fetch Drive files: {error}", {} - ) - return 0, f"Failed to fetch Drive files: {error}" - - all_files.extend(files) - - if not next_token: - break - page_token = next_token - - if not all_files: - success_msg = "No Google Drive files found" - await task_logger.log_task_success( - log_entry, success_msg, {"files_count": 0} - ) - return 0, success_msg - - logger.info(f"Found {len(all_files)} Google Drive files to index via Composio") - - documents_indexed = 0 - documents_skipped = 0 - - for file_info in all_files: - try: - # Handle both standard Google API and potential Composio variations - file_id = file_info.get("id", "") or file_info.get("fileId", "") - file_name = ( - file_info.get("name", "") - or file_info.get("fileName", "") - or "Untitled" - ) - mime_type = file_info.get("mimeType", "") or file_info.get( - "mime_type", "" - ) - - if not file_id: - documents_skipped += 1 - continue - - # Skip folders - if mime_type == "application/vnd.google-apps.folder": - continue - - # Generate unique identifier hash - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.COMPOSIO_CONNECTOR, f"drive_{file_id}", search_space_id - ) - - # Check if document exists - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - # Get file content - ( - content, - content_error, - ) = await composio_connector.get_drive_file_content(file_id) - - if content_error or not content: - logger.warning( - f"Could not get content for file {file_name}: {content_error}" - ) - # Use metadata as content fallback - markdown_content = f"# {file_name}\n\n" - markdown_content += f"**File ID:** {file_id}\n" - markdown_content += f"**Type:** {mime_type}\n" - else: - try: - markdown_content = content.decode("utf-8") - except UnicodeDecodeError: - markdown_content = f"# {file_name}\n\n[Binary file content]\n" - - content_hash = generate_content_hash(markdown_content, search_space_id) - - if existing_document: - if existing_document.content_hash == content_hash: - documents_skipped += 1 - continue - - # Update existing document - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - 
"file_id": file_id, - "file_name": file_name, - "mime_type": mime_type, - "document_type": "Google Drive File (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Google Drive File: {file_name}\n\nType: {mime_type}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - existing_document.title = f"Drive: {file_name}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "file_id": file_id, - "file_name": file_name, - "mime_type": mime_type, - "connector_id": connector_id, - "source": "composio", - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - continue - - # Create new document - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "file_id": file_id, - "file_name": file_name, - "mime_type": mime_type, - "document_type": "Google Drive File (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Google Drive File: {file_name}\n\nType: {mime_type}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - document = Document( - search_space_id=search_space_id, - title=f"Drive: {file_name}", - document_type=DocumentType.COMPOSIO_CONNECTOR, - document_metadata={ - "file_id": file_id, - "file_name": file_name, - "mime_type": mime_type, - "connector_id": connector_id, - "toolkit_id": "googledrive", - "source": "composio", - }, - content=summary_content, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - ) - session.add(document) - documents_indexed += 1 - - if documents_indexed % 10 == 0: - await session.commit() - - except Exception as e: - logger.error(f"Error processing Drive file: {e!s}", exc_info=True) - documents_skipped += 1 - continue - - if documents_indexed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) - - await session.commit() - - await task_logger.log_task_success( - log_entry, - f"Successfully completed Google Drive indexing via Composio for connector {connector_id}", - { - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, - }, - ) - - return documents_indexed, None - - except Exception as e: - logger.error(f"Failed to index Google Drive via Composio: {e!s}", exc_info=True) - return 0, f"Failed to index Google Drive via Composio: {e!s}" - - -async def _index_composio_gmail( - session: AsyncSession, - connector, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str | None, - end_date: str | None, - task_logger: TaskLoggingService, - log_entry, - update_last_indexed: bool = True, - max_items: int = 1000, -) -> tuple[int, str]: - """Index Gmail messages via Composio.""" - try: - composio_connector = ComposioConnector(session, connector_id) - - await task_logger.log_task_progress( - log_entry, - f"Fetching Gmail messages via Composio for connector 
{connector_id}", - {"stage": "fetching_messages"}, - ) - - # Build query with date range - query_parts = [] - if start_date: - query_parts.append(f"after:{start_date.replace('-', '/')}") - if end_date: - query_parts.append(f"before:{end_date.replace('-', '/')}") - query = " ".join(query_parts) - - messages, error = await composio_connector.list_gmail_messages( - query=query, - max_results=max_items, - ) - - if error: - await task_logger.log_task_failure( - log_entry, f"Failed to fetch Gmail messages: {error}", {} - ) - return 0, f"Failed to fetch Gmail messages: {error}" - - if not messages: - success_msg = "No Gmail messages found in the specified date range" - await task_logger.log_task_success( - log_entry, success_msg, {"messages_count": 0} - ) - return 0, success_msg - - logger.info(f"Found {len(messages)} Gmail messages to index via Composio") - - documents_indexed = 0 - documents_skipped = 0 - - for message in messages: - try: - # Composio uses 'messageId' (camelCase), not 'id' - message_id = message.get("messageId", "") or message.get("id", "") - if not message_id: - documents_skipped += 1 - continue - - # Composio's GMAIL_FETCH_EMAILS already returns full message content - # No need for a separate detail API call - - # Extract message info from Composio response - # Composio structure: messageId, messageText, messageTimestamp, payload.headers, labelIds - payload = message.get("payload", {}) - headers = payload.get("headers", []) - - subject = "No Subject" - sender = "Unknown Sender" - date_str = message.get("messageTimestamp", "Unknown Date") - - for header in headers: - name = header.get("name", "").lower() - value = header.get("value", "") - if name == "subject": - subject = value - elif name == "from": - sender = value - elif name == "date": - date_str = value - - # Format to markdown using the full message data - markdown_content = composio_connector.format_gmail_message_to_markdown( - message - ) - - # Generate unique identifier - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.COMPOSIO_CONNECTOR, - f"gmail_{message_id}", - search_space_id, - ) - - content_hash = generate_content_hash(markdown_content, search_space_id) - - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - # Get label IDs from Composio response - label_ids = message.get("labelIds", []) - - if existing_document: - if existing_document.content_hash == content_hash: - documents_skipped += 1 - continue - - # Update existing - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "message_id": message_id, - "subject": subject, - "sender": sender, - "document_type": "Gmail Message (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - existing_document.title = f"Gmail: {subject}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "message_id": message_id, - "subject": subject, - "sender": sender, - "date": date_str, - "labels": label_ids, - "connector_id": connector_id, - "source": "composio", - } - 
existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - continue - - # Create new document - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "message_id": message_id, - "subject": subject, - "sender": sender, - "document_type": "Gmail Message (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - document = Document( - search_space_id=search_space_id, - title=f"Gmail: {subject}", - document_type=DocumentType.COMPOSIO_CONNECTOR, - document_metadata={ - "message_id": message_id, - "subject": subject, - "sender": sender, - "date": date_str, - "labels": label_ids, - "connector_id": connector_id, - "toolkit_id": "gmail", - "source": "composio", - }, - content=summary_content, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - ) - session.add(document) - documents_indexed += 1 - - if documents_indexed % 10 == 0: - await session.commit() - - except Exception as e: - logger.error(f"Error processing Gmail message: {e!s}", exc_info=True) - documents_skipped += 1 - continue - - if documents_indexed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) - - await session.commit() - - await task_logger.log_task_success( - log_entry, - f"Successfully completed Gmail indexing via Composio for connector {connector_id}", - { - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, - }, - ) - - return documents_indexed, None - - except Exception as e: - logger.error(f"Failed to index Gmail via Composio: {e!s}", exc_info=True) - return 0, f"Failed to index Gmail via Composio: {e!s}" - - -async def _index_composio_google_calendar( - session: AsyncSession, - connector, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str | None, - end_date: str | None, - task_logger: TaskLoggingService, - log_entry, - update_last_indexed: bool = True, - max_items: int = 2500, -) -> tuple[int, str]: - """Index Google Calendar events via Composio.""" - from datetime import datetime, timedelta - - try: - composio_connector = ComposioConnector(session, connector_id) - - await task_logger.log_task_progress( - log_entry, - f"Fetching Google Calendar events via Composio for connector {connector_id}", - {"stage": "fetching_events"}, - ) - - # Build time range - if start_date: - time_min = f"{start_date}T00:00:00Z" - else: - # Default to 365 days ago - default_start = datetime.now() - timedelta(days=365) - time_min = default_start.strftime("%Y-%m-%dT00:00:00Z") - - if end_date: - time_max = f"{end_date}T23:59:59Z" - else: - time_max = datetime.now().strftime("%Y-%m-%dT23:59:59Z") - - events, error = await composio_connector.list_calendar_events( - time_min=time_min, - time_max=time_max, - max_results=max_items, - ) - - if error: - await task_logger.log_task_failure( - log_entry, f"Failed to fetch Calendar events: {error}", {} - ) - return 0, f"Failed to fetch Calendar events: {error}" - - if not events: - success_msg = "No Google Calendar events found in the specified date 
range" - await task_logger.log_task_success( - log_entry, success_msg, {"events_count": 0} - ) - return 0, success_msg - - logger.info(f"Found {len(events)} Google Calendar events to index via Composio") - - documents_indexed = 0 - documents_skipped = 0 - - for event in events: - try: - # Handle both standard Google API and potential Composio variations - event_id = event.get("id", "") or event.get("eventId", "") - summary = ( - event.get("summary", "") or event.get("title", "") or "No Title" - ) - - if not event_id: - documents_skipped += 1 - continue - - # Format to markdown - markdown_content = composio_connector.format_calendar_event_to_markdown( - event - ) - - # Generate unique identifier - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.COMPOSIO_CONNECTOR, - f"calendar_{event_id}", - search_space_id, - ) - - content_hash = generate_content_hash(markdown_content, search_space_id) - - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - # Extract event times - start = event.get("start", {}) - end = event.get("end", {}) - start_time = start.get("dateTime") or start.get("date", "") - end_time = end.get("dateTime") or end.get("date", "") - location = event.get("location", "") - - if existing_document: - if existing_document.content_hash == content_hash: - documents_skipped += 1 - continue - - # Update existing - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "event_id": event_id, - "summary": summary, - "start_time": start_time, - "document_type": "Google Calendar Event (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}" - if location: - summary_content += f"\nLocation: {location}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - existing_document.title = f"Calendar: {summary}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "event_id": event_id, - "summary": summary, - "start_time": start_time, - "end_time": end_time, - "location": location, - "connector_id": connector_id, - "source": "composio", - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - continue - - # Create new document - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "event_id": event_id, - "summary": summary, - "start_time": start_time, - "document_type": "Google Calendar Event (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}" - ) - if location: - summary_content += f"\nLocation: {location}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - document = Document( - search_space_id=search_space_id, - title=f"Calendar: {summary}", - document_type=DocumentType.COMPOSIO_CONNECTOR, - document_metadata={ - 
"event_id": event_id, - "summary": summary, - "start_time": start_time, - "end_time": end_time, - "location": location, - "connector_id": connector_id, - "toolkit_id": "googlecalendar", - "source": "composio", - }, - content=summary_content, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - ) - session.add(document) - documents_indexed += 1 - - if documents_indexed % 10 == 0: - await session.commit() - - except Exception as e: - logger.error(f"Error processing Calendar event: {e!s}", exc_info=True) - documents_skipped += 1 - continue - - if documents_indexed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) - - await session.commit() - - await task_logger.log_task_success( - log_entry, - f"Successfully completed Google Calendar indexing via Composio for connector {connector_id}", - { - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, - }, - ) - - return documents_indexed, None - - except Exception as e: - logger.error( - f"Failed to index Google Calendar via Composio: {e!s}", exc_info=True - ) - return 0, f"Failed to index Google Calendar via Composio: {e!s}" diff --git a/surfsense_backend/app/tasks/connector_indexers/base.py b/surfsense_backend/app/tasks/connector_indexers/base.py index b9a99808e..b390937f0 100644 --- a/surfsense_backend/app/tasks/connector_indexers/base.py +++ b/surfsense_backend/app/tasks/connector_indexers/base.py @@ -112,6 +112,13 @@ def calculate_date_range( Returns: Tuple of (start_date_str, end_date_str) """ + # Normalize "undefined" strings to None (from frontend) + # This prevents parsing errors and ensures consistent behavior across all indexers + if start_date == "undefined" or start_date == "": + start_date = None + if end_date == "undefined" or end_date == "": + end_date = None + if start_date is not None and end_date is not None: return start_date, end_date diff --git a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py index 2793f78db..a1067255d 100644 --- a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py @@ -136,10 +136,9 @@ async def index_bookstack_pages( ) if error: - logger.error(f"Failed to get BookStack pages: {error}") - # Don't treat "No pages found" as an error that should stop indexing if "No pages found" in error: + logger.info(f"No BookStack pages found: {error}") logger.info( "No pages found is not a critical error, continuing with update" ) @@ -159,6 +158,7 @@ async def index_bookstack_pages( ) return 0, None else: + logger.error(f"Failed to get BookStack pages: {error}") await task_logger.log_task_failure( log_entry, f"Failed to get BookStack pages: {error}", diff --git a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py index 7289b0ccd..ddbefafb9 100644 --- a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py @@ -120,10 +120,9 @@ async def index_confluence_pages( ) if error: - logger.error(f"Failed to get Confluence pages: {error}") - # Don't treat "No pages found" as an error that should stop indexing if "No pages found" in error: + logger.info(f"No Confluence pages found: {error}") logger.info( "No 
pages found is not a critical error, continuing with update" ) @@ -147,6 +146,7 @@ async def index_confluence_pages( await confluence_client.close() return 0, None else: + logger.error(f"Failed to get Confluence pages: {error}") await task_logger.log_task_failure( log_entry, f"Failed to get Confluence pages: {error}", diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py index b8c0e564d..2365ff984 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py @@ -4,6 +4,8 @@ Google Calendar connector indexer. from datetime import datetime, timedelta +import pytz +from dateutil.parser import isoparse from google.oauth2.credentials import Credentials from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession @@ -21,6 +23,7 @@ from app.utils.document_converters import ( from .base import ( check_document_by_unique_identifier, + check_duplicate_document_by_hash, get_connector_by_id, get_current_timestamp, logger, @@ -206,6 +209,23 @@ async def index_google_calendar_events( start_date_str = start_date end_date_str = end_date + # If start_date and end_date are the same, adjust end_date to be one day later + # to ensure valid date range (start_date must be strictly before end_date) + if start_date_str == end_date_str: + # Parse the date and add one day to ensure valid range + dt = isoparse(end_date_str) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=pytz.UTC) + else: + dt = dt.astimezone(pytz.UTC) + # Add one day to end_date to make it strictly after start_date + dt_end = dt + timedelta(days=1) + end_date_str = dt_end.strftime("%Y-%m-%d") + logger.info( + f"Adjusted end_date from {end_date} to {end_date_str} " + f"to ensure valid date range (start_date must be strictly before end_date)" + ) + await task_logger.log_task_progress( log_entry, f"Fetching Google Calendar events from {start_date_str} to {end_date_str}", @@ -223,10 +243,9 @@ async def index_google_calendar_events( ) if error: - logger.error(f"Failed to get Google Calendar events: {error}") - # Don't treat "No events found" as an error that should stop indexing if "No events found" in error: + logger.info(f"No Google Calendar events found: {error}") logger.info( "No events found is not a critical error, continuing with update" ) @@ -246,13 +265,25 @@ async def index_google_calendar_events( ) return 0, None else: + logger.error(f"Failed to get Google Calendar events: {error}") + # Check if this is an authentication error that requires re-authentication + error_message = error + error_type = "APIError" + if ( + "re-authenticate" in error.lower() + or "expired or been revoked" in error.lower() + or "authentication failed" in error.lower() + ): + error_message = "Google Calendar authentication failed. Please re-authenticate." 
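# --- Illustrative sketch, not part of the patch: the substring heuristic the Calendar and
# --- Gmail indexers now share for telling re-auth failures apart from ordinary API errors.
# --- The helper name is hypothetical; the marker strings and the user-facing message mirror
# --- the added code around this hunk.
def classify_connector_error(error: str, service_name: str) -> tuple[str, str]:
    """Return (user_facing_message, error_type) for a raw connector error string."""
    auth_markers = ("re-authenticate", "expired or been revoked", "authentication failed")
    if any(marker in error.lower() for marker in auth_markers):
        return (
            f"{service_name} authentication failed. Please re-authenticate.",
            "AuthenticationError",
        )
    return (error, "APIError")
# --- Example: classify_connector_error("Token has expired or been revoked.", "Google Calendar")
# --- returns ("Google Calendar authentication failed. Please re-authenticate.", "AuthenticationError").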
+ error_type = "AuthenticationError" + await task_logger.log_task_failure( log_entry, - f"Failed to get Google Calendar events: {error}", - "API Error", - {"error_type": "APIError"}, + error_message, + error, + {"error_type": error_type}, ) - return 0, f"Failed to get Google Calendar events: {error}" + return 0, error_message logger.info(f"Retrieved {len(events)} events from Google Calendar API") @@ -263,6 +294,9 @@ async def index_google_calendar_events( documents_indexed = 0 documents_skipped = 0 skipped_events = [] + duplicate_content_count = ( + 0 # Track events skipped due to duplicate content_hash + ) for event in events: try: @@ -383,6 +417,27 @@ async def index_google_calendar_events( ) continue + # Document doesn't exist by unique_identifier_hash + # Check if a document with the same content_hash exists (from another connector) + with session.no_autoflush: + duplicate_by_content = await check_duplicate_document_by_hash( + session, content_hash + ) + + if duplicate_by_content: + # A document with the same content already exists (likely from Composio connector) + logger.info( + f"Event {event_summary} already indexed by another connector " + f"(existing document ID: {duplicate_by_content.id}, " + f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content." + ) + duplicate_content_count += 1 + documents_skipped += 1 + skipped_events.append( + f"{event_summary} (already indexed by another connector)" + ) + continue + # Document doesn't exist - create new one # Generate summary with metadata user_llm = await get_user_long_context_llm( @@ -475,7 +530,28 @@ async def index_google_calendar_events( logger.info( f"Final commit: Total {documents_indexed} Google Calendar events processed" ) - await session.commit() + try: + await session.commit() + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same event was indexed by multiple connectors. " + f"Rolling back and continuing. 
Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if duplicates were found + warning_message = None + if duplicate_content_count > 0: + warning_message = f"{duplicate_content_count} skipped (duplicate)" await task_logger.log_task_success( log_entry, @@ -484,14 +560,16 @@ async def index_google_calendar_events( "events_processed": total_processed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "duplicate_content_count": duplicate_content_count, "skipped_events_count": len(skipped_events), }, ) logger.info( - f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped" + f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped " + f"({duplicate_content_count} due to duplicate content from other connectors)" ) - return total_processed, None + return total_processed, warning_message except SQLAlchemyError as db_error: await session.rollback() diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index 48282a1af..f50e149d3 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -578,7 +578,7 @@ async def _check_rename_only_update( - (True, message): Only filename changed, document was updated - (False, None): Content changed or new file, needs full processing """ - from sqlalchemy import select + from sqlalchemy import String, cast, select from sqlalchemy.orm.attributes import flag_modified from app.db import Document @@ -603,7 +603,8 @@ async def _check_rename_only_update( select(Document).where( Document.search_space_id == search_space_id, Document.document_type == DocumentType.GOOGLE_DRIVE_FILE, - Document.document_metadata["google_drive_file_id"].astext == file_id, + cast(Document.document_metadata["google_drive_file_id"], String) + == file_id, ) ) existing_document = result.scalar_one_or_none() @@ -755,7 +756,7 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id: Handles both new (file_id-based) and legacy (filename-based) hash schemes. 
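# --- Illustrative sketch, not part of the patch: the JSONB-metadata lookup pattern the Google
# --- Drive indexer switches to in this hunk. Casting the JSON element to String replaces the
# --- earlier `.astext` accessor; the helper name below is hypothetical.
from sqlalchemy import String, cast, select
from sqlalchemy.ext.asyncio import AsyncSession

async def find_drive_document_by_file_id(
    session: AsyncSession, search_space_id: int, file_id: str
):
    from app.db import Document, DocumentType

    # Filter on the "google_drive_file_id" key stored inside Document.document_metadata (JSONB).
    result = await session.execute(
        select(Document).where(
            Document.search_space_id == search_space_id,
            Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
            cast(Document.document_metadata["google_drive_file_id"], String) == file_id,
        )
    )
    return result.scalar_one_or_none()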
""" - from sqlalchemy import select + from sqlalchemy import String, cast, select from app.db import Document @@ -774,7 +775,8 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id: select(Document).where( Document.search_space_id == search_space_id, Document.document_type == DocumentType.GOOGLE_DRIVE_FILE, - Document.document_metadata["google_drive_file_id"].astext == file_id, + cast(Document.document_metadata["google_drive_file_id"], String) + == file_id, ) ) existing_document = result.scalar_one_or_none() diff --git a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py index e10297057..08d2904d6 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py @@ -170,10 +170,21 @@ async def index_google_gmail_messages( ) if error: + # Check if this is an authentication error that requires re-authentication + error_message = error + error_type = "APIError" + if ( + "re-authenticate" in error.lower() + or "expired or been revoked" in error.lower() + or "authentication failed" in error.lower() + ): + error_message = "Gmail authentication failed. Please re-authenticate." + error_type = "AuthenticationError" + await task_logger.log_task_failure( - log_entry, f"Failed to fetch messages: {error}", {} + log_entry, error_message, error, {"error_type": error_type} ) - return 0, f"Failed to fetch Gmail messages: {error}" + return 0, error_message if not messages: success_msg = "No Google gmail messages found in the specified date range" diff --git a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py index fdbeb93b0..4851a6466 100644 --- a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py @@ -126,10 +126,9 @@ async def index_jira_issues( ) if error: - logger.error(f"Failed to get Jira issues: {error}") - # Don't treat "No issues found" as an error that should stop indexing if "No issues found" in error: + logger.info(f"No Jira issues found: {error}") logger.info( "No issues found is not a critical error, continuing with update" ) @@ -149,6 +148,7 @@ async def index_jira_issues( ) return 0, None else: + logger.error(f"Failed to get Jira issues: {error}") await task_logger.log_task_failure( log_entry, f"Failed to get Jira issues: {error}", diff --git a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py index f1bfd42e8..7d8e0c30e 100644 --- a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py @@ -145,10 +145,9 @@ async def index_linear_issues( ) if error: - logger.error(f"Failed to get Linear issues: {error}") - # Don't treat "No issues found" as an error that should stop indexing if "No issues found" in error: + logger.info(f"No Linear issues found: {error}") logger.info( "No issues found is not a critical error, continuing with update" ) @@ -162,6 +161,7 @@ async def index_linear_issues( ) return 0, None else: + logger.error(f"Failed to get Linear issues: {error}") return 0, f"Failed to get Linear issues: {error}" logger.info(f"Retrieved {len(issues)} issues from Linear API") diff --git a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py 
b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py index 91f81ac20..ead259a44 100644 --- a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py @@ -116,6 +116,13 @@ async def index_luma_events( luma_client = LumaConnector(api_key=api_key) + # Handle 'undefined' string from frontend (treat as None) + # This prevents "time data 'undefined' does not match format" errors + if start_date == "undefined" or start_date == "": + start_date = None + if end_date == "undefined" or end_date == "": + end_date = None + # Calculate date range # For calendar connectors, allow future dates to index upcoming events if start_date is None or end_date is None: @@ -172,10 +179,9 @@ async def index_luma_events( ) if error: - logger.error(f"Failed to get Luma events: {error}") - # Don't treat "No events found" as an error that should stop indexing if "No events found" in error or "no events" in error.lower(): + logger.info(f"No Luma events found: {error}") logger.info( "No events found is not a critical error, continuing with update" ) @@ -195,6 +201,7 @@ async def index_luma_events( ) return 0, None else: + logger.error(f"Failed to get Luma events: {error}") await task_logger.log_task_failure( log_entry, f"Failed to get Luma events: {error}", diff --git a/surfsense_backend/app/utils/connector_naming.py b/surfsense_backend/app/utils/connector_naming.py index a2b748a3a..7d3efc001 100644 --- a/surfsense_backend/app/utils/connector_naming.py +++ b/surfsense_backend/app/utils/connector_naming.py @@ -28,6 +28,9 @@ BASE_NAME_FOR_TYPE = { SearchSourceConnectorType.CONFLUENCE_CONNECTOR: "Confluence", SearchSourceConnectorType.AIRTABLE_CONNECTOR: "Airtable", SearchSourceConnectorType.MCP_CONNECTOR: "Model Context Protocol (MCP)", + SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR: "Gmail", + SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: "Google Drive", + SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: "Google Calendar", } diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml index ffe9e5232..57dbdc7b5 100644 --- a/surfsense_backend/pyproject.toml +++ b/surfsense_backend/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "surf-new-backend" -version = "0.0.11" +version = "0.0.12" description = "SurfSense Backend" requires-python = ">=3.12" dependencies = [ diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock index 18f04288e..16b77a7b2 100644 --- a/surfsense_backend/uv.lock +++ b/surfsense_backend/uv.lock @@ -6545,7 +6545,7 @@ wheels = [ [[package]] name = "surf-new-backend" -version = "0.0.11" +version = "0.0.12" source = { editable = "." 
} dependencies = [ { name = "alembic" }, diff --git a/surfsense_browser_extension/package.json b/surfsense_browser_extension/package.json index b225bc206..bf926d09f 100644 --- a/surfsense_browser_extension/package.json +++ b/surfsense_browser_extension/package.json @@ -1,7 +1,7 @@ { "name": "surfsense_browser_extension", "displayName": "Surfsense Browser Extension", - "version": "0.0.11", + "version": "0.0.12", "description": "Extension to collect Browsing History for SurfSense.", "author": "https://github.com/MODSetter", "engines": { diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx index d277a84ee..d9a894e5a 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx @@ -209,7 +209,7 @@ export function RowActions({ disabled={isDeleting} className="bg-destructive text-destructive-foreground hover:bg-destructive/90" > - {isDeleting ? "Deleting..." : "Delete"} + {isDeleting ? "Deleting" : "Delete"} diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index 72e330770..293d4a243 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -5,33 +5,45 @@ import { Cable, Loader2 } from "lucide-react"; import { useSearchParams } from "next/navigation"; import type { FC } from "react"; import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms"; +import { currentUserAtom } from "@/atoms/user/user-query.atoms"; import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button"; -import { Dialog, DialogContent } from "@/components/ui/dialog"; +import { Dialog, DialogContent, DialogTitle } from "@/components/ui/dialog"; import { Tabs, TabsContent } from "@/components/ui/tabs"; import type { SearchSourceConnector } from "@/contracts/types/connector.types"; import { useConnectorsElectric } from "@/hooks/use-connectors-electric"; import { useDocumentsElectric } from "@/hooks/use-documents-electric"; +import { useInbox } from "@/hooks/use-inbox"; import { cn } from "@/lib/utils"; import { ConnectorDialogHeader } from "./connector-popup/components/connector-dialog-header"; import { ConnectorConnectView } from "./connector-popup/connector-configs/views/connector-connect-view"; import { ConnectorEditView } from "./connector-popup/connector-configs/views/connector-edit-view"; import { IndexingConfigurationView } from "./connector-popup/connector-configs/views/indexing-configuration-view"; -import { OAUTH_CONNECTORS } from "./connector-popup/constants/connector-constants"; +import { + COMPOSIO_CONNECTORS, + OAUTH_CONNECTORS, +} from "./connector-popup/constants/connector-constants"; import { useConnectorDialog } from "./connector-popup/hooks/use-connector-dialog"; import { useIndexingConnectors } from "./connector-popup/hooks/use-indexing-connectors"; import { ActiveConnectorsTab } from "./connector-popup/tabs/active-connectors-tab"; import { AllConnectorsTab } from "./connector-popup/tabs/all-connectors-tab"; -import { ComposioToolkitView } from "./connector-popup/views/composio-toolkit-view"; import { ConnectorAccountsListView } from "./connector-popup/views/connector-accounts-list-view"; import { YouTubeCrawlerView } from 
"./connector-popup/views/youtube-crawler-view"; export const ConnectorIndicator: FC = () => { const searchSpaceId = useAtomValue(activeSearchSpaceIdAtom); const searchParams = useSearchParams(); + const { data: currentUser } = useAtomValue(currentUserAtom); // Fetch document type counts using Electric SQL + PGlite for real-time updates const { documentTypeCounts, loading: documentTypesLoading } = useDocumentsElectric(searchSpaceId); + // Fetch notifications to detect indexing failures + const { inboxItems = [] } = useInbox( + currentUser?.id ?? null, + searchSpaceId ? Number(searchSpaceId) : null, + "connector_indexing" + ); + // Check if YouTube view is active const isYouTubeView = searchParams.get("view") === "youtube"; @@ -88,12 +100,6 @@ export const ConnectorIndicator: FC = () => { setConnectorConfig, setIndexingConnectorConfig, setConnectorName, - // Composio - viewingComposio, - connectingComposioToolkit, - handleOpenComposio, - handleBackFromComposio, - handleConnectComposioToolkit, } = useConnectorDialog(); // Fetch connectors using Electric SQL + PGlite for real-time updates @@ -123,8 +129,10 @@ export const ConnectorIndicator: FC = () => { }; // Track indexing state locally - clears automatically when Electric SQL detects last_indexed_at changed - const { indexingConnectorIds, startIndexing } = useIndexingConnectors( - connectors as SearchSourceConnector[] + // Also clears when failed notifications are detected + const { indexingConnectorIds, startIndexing, stopIndexing } = useIndexingConnectors( + connectors as SearchSourceConnector[], + inboxItems ); const isLoading = connectorsLoading || documentTypesLoading; @@ -142,7 +150,7 @@ export const ConnectorIndicator: FC = () => { // Check which connectors are already connected // Using Electric SQL + PGlite for real-time connector updates - const connectedTypes = new Set( + const connectedTypes = new Set( (connectors || []).map((c: SearchSourceConnector) => c.connector_type) ); @@ -179,22 +187,11 @@ export const ConnectorIndicator: FC = () => { )} - + + Manage Connectors {/* YouTube Crawler View - shown when adding YouTube videos */} {isYouTubeView && searchSpaceId ? ( - ) : viewingComposio && searchSpaceId ? ( - c.connector_type === "COMPOSIO_CONNECTOR") - .map((c: SearchSourceConnector) => c.config?.toolkit_id as string) - .filter(Boolean)} - onBack={handleBackFromComposio} - onConnectToolkit={handleConnectComposioToolkit} - isConnecting={connectingComposioToolkit !== null} - connectingToolkitId={connectingComposioToolkit} - /> ) : viewingMCPList ? ( { onBack={handleBackFromAccountsList} onManage={handleStartEdit} onAddAccount={() => { - const oauthConnector = OAUTH_CONNECTORS.find( - (c) => c.connectorType === viewingAccountsType.connectorType - ); + // Check both OAUTH_CONNECTORS and COMPOSIO_CONNECTORS + const oauthConnector = + OAUTH_CONNECTORS.find( + (c) => c.connectorType === viewingAccountsType.connectorType + ) || + COMPOSIO_CONNECTORS.find( + (c) => c.connectorType === viewingAccountsType.connectorType + ); if (oauthConnector) { handleConnectOAuth(oauthConnector); } @@ -260,7 +262,13 @@ export const ConnectorIndicator: FC = () => { editingConnector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" ? 
() => { startIndexing(editingConnector.id); - handleQuickIndexConnector(editingConnector.id, editingConnector.connector_type); + handleQuickIndexConnector( + editingConnector.id, + editingConnector.connector_type, + stopIndexing, + startDate, + endDate + ); } : undefined } @@ -331,7 +339,6 @@ export const ConnectorIndicator: FC = () => { onCreateYouTubeCrawler={handleCreateYouTubeCrawler} onManage={handleStartEdit} onViewAccountsList={handleViewAccountsList} - onOpenComposio={handleOpenComposio} /> diff --git a/surfsense_web/components/assistant-ui/connector-popup/components/composio-connector-card.tsx b/surfsense_web/components/assistant-ui/connector-popup/components/composio-connector-card.tsx deleted file mode 100644 index 671fc3ce6..000000000 --- a/surfsense_web/components/assistant-ui/connector-popup/components/composio-connector-card.tsx +++ /dev/null @@ -1,78 +0,0 @@ -"use client"; - -import { Zap } from "lucide-react"; -import Image from "next/image"; -import type { FC } from "react"; -import { Button } from "@/components/ui/button"; -import { cn } from "@/lib/utils"; - -interface ComposioConnectorCardProps { - id: string; - title: string; - description: string; - connectorCount?: number; - onConnect: () => void; -} - -export const ComposioConnectorCard: FC = ({ - id, - title, - description, - connectorCount = 0, - onConnect, -}) => { - const hasConnections = connectorCount > 0; - - return ( -
-			{/* Card body (markup unrecoverable): the Composio logo image, the {title} heading,
-			    then either "{connectorCount} connection(s)" when hasConnections or the {description}
-			    text, and a Connect Button wired to onConnect. */}
- ); -}; diff --git a/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json b/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json index b729c3f8b..2c1010b1c 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json +++ b/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json @@ -24,6 +24,16 @@ "enabled": true, "status": "warning", "statusMessage": "Some requests may be blocked if not using Firecrawl." + }, + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": { + "enabled": false, + "status": "disabled", + "statusMessage": "Not available yet." + }, + "GITHUB_CONNECTOR": { + "enabled": false, + "status": "warning", + "statusMessage": "Some issues with indexing repositories." } }, "globalSettings": { diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/obsidian-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/obsidian-connect-form.tsx index 064e10e2f..94839b03b 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/obsidian-connect-form.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/obsidian-connect-form.tsx @@ -1,7 +1,7 @@ "use client"; import { zodResolver } from "@hookform/resolvers/zod"; -import { FolderOpen, Info } from "lucide-react"; +import { Info } from "lucide-react"; import type { FC } from "react"; import { useRef, useState } from "react"; import { useForm } from "react-hook-form"; @@ -109,7 +109,7 @@ export const ObsidianConnectForm: FC = ({ onSubmit, isSubmitti return (
- +
Self-Hosted Only diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx new file mode 100644 index 000000000..6f282d892 --- /dev/null +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx @@ -0,0 +1,14 @@ +"use client"; + +import type { FC } from "react"; +import type { SearchSourceConnector } from "@/contracts/types/connector.types"; + +interface ComposioCalendarConfigProps { + connector: SearchSourceConnector; + onConfigChange?: (config: Record) => void; + onNameChange?: (name: string) => void; +} + +export const ComposioCalendarConfig: FC = () => { + return
; +}; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx deleted file mode 100644 index 6fe37e1e5..000000000 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx +++ /dev/null @@ -1,160 +0,0 @@ -"use client"; - -import { ExternalLink, Info, Zap } from "lucide-react"; -import Image from "next/image"; -import type { FC } from "react"; -import { Badge } from "@/components/ui/badge"; -import type { SearchSourceConnector } from "@/contracts/types/connector.types"; -import { cn } from "@/lib/utils"; - -interface ComposioConfigProps { - connector: SearchSourceConnector; - onConfigChange?: (config: Record) => void; - onNameChange?: (name: string) => void; -} - -// Get toolkit display info -const getToolkitInfo = (toolkitId: string): { name: string; icon: string; description: string } => { - switch (toolkitId) { - case "googledrive": - return { - name: "Google Drive", - icon: "/connectors/google-drive.svg", - description: "Files and documents from Google Drive", - }; - case "gmail": - return { - name: "Gmail", - icon: "/connectors/google-gmail.svg", - description: "Emails from Gmail", - }; - case "googlecalendar": - return { - name: "Google Calendar", - icon: "/connectors/google-calendar.svg", - description: "Events from Google Calendar", - }; - case "slack": - return { - name: "Slack", - icon: "/connectors/slack.svg", - description: "Messages from Slack", - }; - case "notion": - return { - name: "Notion", - icon: "/connectors/notion.svg", - description: "Pages from Notion", - }; - case "github": - return { - name: "GitHub", - icon: "/connectors/github.svg", - description: "Repositories from GitHub", - }; - default: - return { - name: toolkitId, - icon: "/connectors/composio.svg", - description: "Connected via Composio", - }; - } -}; - -export const ComposioConfig: FC = ({ connector }) => { - const toolkitId = connector.config?.toolkit_id as string; - const toolkitName = connector.config?.toolkit_name as string; - const isIndexable = connector.config?.is_indexable as boolean; - const composioAccountId = connector.config?.composio_connected_account_id as string; - - const toolkitInfo = getToolkitInfo(toolkitId); - - return ( -
- {/* Toolkit Info Card */} -
-			{/* Toolkit info card (markup unrecoverable): toolkit icon, {toolkitName || toolkitInfo.name},
-			    a "Composio" badge, and {toolkitInfo.description}. */}
-
-			{/* Connection Details: Toolkit: {toolkitId}; Indexing Supported: {isIndexable ? "Yes" : "Coming Soon"};
-			    Account ID: {composioAccountId}, shown only when present. */}
-
-			{/* Info Banner: "This connection uses Composio's managed OAuth, which means you don't need to
-			    wait for app verification. Your data is securely accessed through Composio."
-			    followed by a "Learn more about Composio" link. */}
- ); -}; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx new file mode 100644 index 000000000..239125565 --- /dev/null +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx @@ -0,0 +1,347 @@ +"use client"; + +import { + File, + FileSpreadsheet, + FileText, + FolderClosed, + Image, + Presentation, + X, +} from "lucide-react"; +import type { FC } from "react"; +import { useEffect, useState } from "react"; +import { ComposioDriveFolderTree } from "@/components/connectors/composio-drive-folder-tree"; +import { Button } from "@/components/ui/button"; +import { Label } from "@/components/ui/label"; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from "@/components/ui/select"; +import { Switch } from "@/components/ui/switch"; +import type { SearchSourceConnector } from "@/contracts/types/connector.types"; + +interface ComposioDriveConfigProps { + connector: SearchSourceConnector; + onConfigChange?: (config: Record) => void; + onNameChange?: (name: string) => void; +} + +interface SelectedFolder { + id: string; + name: string; +} + +interface IndexingOptions { + max_files_per_folder: number; + incremental_sync: boolean; + include_subfolders: boolean; +} + +const DEFAULT_INDEXING_OPTIONS: IndexingOptions = { + max_files_per_folder: 100, + incremental_sync: true, + include_subfolders: true, +}; + +// Helper to get appropriate icon for file type based on file name +function getFileIconFromName(fileName: string, className: string = "size-3.5 shrink-0") { + const lowerName = fileName.toLowerCase(); + // Spreadsheets + if ( + lowerName.endsWith(".xlsx") || + lowerName.endsWith(".xls") || + lowerName.endsWith(".csv") || + lowerName.includes("spreadsheet") + ) { + return ; + } + // Presentations + if ( + lowerName.endsWith(".pptx") || + lowerName.endsWith(".ppt") || + lowerName.includes("presentation") + ) { + return ; + } + // Documents (word, text only - not PDF) + if ( + lowerName.endsWith(".docx") || + lowerName.endsWith(".doc") || + lowerName.endsWith(".txt") || + lowerName.includes("document") || + lowerName.includes("word") || + lowerName.includes("text") + ) { + return ; + } + // Images + if ( + lowerName.endsWith(".png") || + lowerName.endsWith(".jpg") || + lowerName.endsWith(".jpeg") || + lowerName.endsWith(".gif") || + lowerName.endsWith(".webp") || + lowerName.endsWith(".svg") + ) { + return ; + } + // Default (including PDF) + return ; +} + +export const ComposioDriveConfig: FC = ({ + connector, + onConfigChange, +}) => { + const isIndexable = connector.config?.is_indexable as boolean; + + // Initialize with existing selected folders and files from connector config + const existingFolders = + (connector.config?.selected_folders as SelectedFolder[] | undefined) || []; + const existingFiles = (connector.config?.selected_files as SelectedFolder[] | undefined) || []; + const existingIndexingOptions = + (connector.config?.indexing_options as IndexingOptions | undefined) || DEFAULT_INDEXING_OPTIONS; + + const [selectedFolders, setSelectedFolders] = useState(existingFolders); + const [selectedFiles, setSelectedFiles] = useState(existingFiles); + const [showFolderSelector, setShowFolderSelector] = useState(false); + const [indexingOptions, setIndexingOptions] = useState(existingIndexingOptions); + + // Update selected 
folders and files when connector config changes + useEffect(() => { + const folders = (connector.config?.selected_folders as SelectedFolder[] | undefined) || []; + const files = (connector.config?.selected_files as SelectedFolder[] | undefined) || []; + const options = + (connector.config?.indexing_options as IndexingOptions | undefined) || + DEFAULT_INDEXING_OPTIONS; + setSelectedFolders(folders); + setSelectedFiles(files); + setIndexingOptions(options); + }, [connector.config]); + + const updateConfig = ( + folders: SelectedFolder[], + files: SelectedFolder[], + options: IndexingOptions + ) => { + if (onConfigChange) { + onConfigChange({ + ...connector.config, + selected_folders: folders, + selected_files: files, + indexing_options: options, + }); + } + }; + + const handleSelectFolders = (folders: SelectedFolder[]) => { + setSelectedFolders(folders); + updateConfig(folders, selectedFiles, indexingOptions); + }; + + const handleSelectFiles = (files: SelectedFolder[]) => { + setSelectedFiles(files); + updateConfig(selectedFolders, files, indexingOptions); + }; + + const handleIndexingOptionChange = (key: keyof IndexingOptions, value: number | boolean) => { + const newOptions = { ...indexingOptions, [key]: value }; + setIndexingOptions(newOptions); + updateConfig(selectedFolders, selectedFiles, newOptions); + }; + + const handleRemoveFolder = (folderId: string) => { + const newFolders = selectedFolders.filter((folder) => folder.id !== folderId); + setSelectedFolders(newFolders); + updateConfig(newFolders, selectedFiles, indexingOptions); + }; + + const handleRemoveFile = (fileId: string) => { + const newFiles = selectedFiles.filter((file) => file.id !== fileId); + setSelectedFiles(newFiles); + updateConfig(selectedFolders, newFiles, indexingOptions); + }; + + const totalSelected = selectedFolders.length + selectedFiles.length; + + // Only show configuration if the connector is indexable + if (!isIndexable) { + return
; + } + + return ( +
+ {/* Folder & File Selection */} +
+
+

Folder & File Selection

+

+ Select specific folders and/or individual files to index from your Google Drive. +

+
+ + {totalSelected > 0 && ( +
+

+ Selected {totalSelected} item{totalSelected > 1 ? "s" : ""}: {(() => { + const parts: string[] = []; + if (selectedFolders.length > 0) { + parts.push( + `${selectedFolders.length} folder${selectedFolders.length > 1 ? "s" : ""}` + ); + } + if (selectedFiles.length > 0) { + parts.push(`${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}`); + } + return parts.length > 0 ? `(${parts.join(", ")})` : ""; + })()} +
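// Illustrative sketch only, not part of this diff: the inline IIFE above, read as a
// pure helper. `SelectedFolder` mirrors the shape used in composio-drive-config.tsx;
// the helper name is hypothetical.
interface SelectedFolder {
	id: string;
	name: string;
}

function formatSelectionSummary(folders: SelectedFolder[], files: SelectedFolder[]): string {
	const parts: string[] = [];
	if (folders.length > 0) {
		parts.push(`${folders.length} folder${folders.length > 1 ? "s" : ""}`);
	}
	if (files.length > 0) {
		parts.push(`${files.length} file${files.length > 1 ? "s" : ""}`);
	}
	return parts.length > 0 ? `(${parts.join(", ")})` : "";
}

// e.g. formatSelectionSummary([{ id: "1", name: "Docs" }], [{ id: "2", name: "roadmap.pdf" }])
// returns "(1 folder, 1 file)"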

+
+ {selectedFolders.map((folder) => ( +
+ + {folder.name} + +
+ ))} + {selectedFiles.map((file) => ( +
+ {getFileIconFromName(file.name)} + {file.name} + +
+ ))} +
+
+ )} + + {showFolderSelector ? ( +
+ + +
+ ) : ( + + )} +
+ + {/* Indexing Options */} +
+
+

Indexing Options

+

+ Configure how files are indexed from your Google Drive. +
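// Illustrative sketch only, not part of this diff: the connector.config payload this
// panel assembles via updateConfig()/onConfigChange. Field names (selected_folders,
// selected_files, indexing_options) and the defaults come from the component above;
// the example values are made up.
interface SelectedItem {
	id: string;
	name: string;
}

interface IndexingOptions {
	max_files_per_folder: number;
	incremental_sync: boolean;
	include_subfolders: boolean;
}

interface ComposioDriveConfigPayload {
	selected_folders: SelectedItem[];
	selected_files: SelectedItem[];
	indexing_options: IndexingOptions;
}

const exampleConfig: ComposioDriveConfigPayload = {
	selected_folders: [{ id: "folder-123", name: "Project Docs" }],
	selected_files: [{ id: "file-456", name: "roadmap.pdf" }],
	indexing_options: {
		max_files_per_folder: 100, // mirrors DEFAULT_INDEXING_OPTIONS above
		incremental_sync: true,
		include_subfolders: true,
	},
};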

+
+ + {/* Max files per folder */} +
+
+
+ +

+ Maximum number of files to index from each folder +

+
+ +
+
+ + {/* Include subfolders toggle */} +
+
+ +

+ Recursively index files in subfolders of selected folders +

+
+ handleIndexingOptionChange("include_subfolders", checked)} + /> +
+
+
+ ); +}; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx new file mode 100644 index 000000000..494e1362f --- /dev/null +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx @@ -0,0 +1,14 @@ +"use client"; + +import type { FC } from "react"; +import type { SearchSourceConnector } from "@/contracts/types/connector.types"; + +interface ComposioGmailConfigProps { + connector: SearchSourceConnector; + onConfigChange?: (config: Record) => void; + onNameChange?: (name: string) => void; +} + +export const ComposioGmailConfig: FC = () => { + return
; +}; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx index 17f4a49a5..383f6ce0e 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx @@ -1,6 +1,14 @@ "use client"; -import { File, FileSpreadsheet, FileText, FolderClosed, Image, Presentation } from "lucide-react"; +import { + File, + FileSpreadsheet, + FileText, + FolderClosed, + Image, + Presentation, + X, +} from "lucide-react"; import type { FC } from "react"; import { useEffect, useState } from "react"; import { GoogleDriveFolderTree } from "@/components/connectors/google-drive-folder-tree"; @@ -135,6 +143,18 @@ export const GoogleDriveConfig: FC = ({ connector, onConfi updateConfig(selectedFolders, selectedFiles, newOptions); }; + const handleRemoveFolder = (folderId: string) => { + const newFolders = selectedFolders.filter((folder) => folder.id !== folderId); + setSelectedFolders(newFolders); + updateConfig(newFolders, selectedFiles, indexingOptions); + }; + + const handleRemoveFile = (fileId: string) => { + const newFiles = selectedFiles.filter((file) => file.id !== fileId); + setSelectedFiles(newFiles); + updateConfig(selectedFolders, newFiles, indexingOptions); + }; + const totalSelected = selectedFolders.length + selectedFiles.length; return ( @@ -161,29 +181,45 @@ export const GoogleDriveConfig: FC = ({ connector, onConfi if (selectedFiles.length > 0) { parts.push(`${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}`); } - return parts.length > 0 ? `(${parts.join(" ")})` : ""; + return parts.length > 0 ? `(${parts.join(", ")})` : ""; })()}

{selectedFolders.map((folder) => ( -

- {folder.name} -

+ {folder.name} + +
))} {selectedFiles.map((file) => ( -

{getFileIconFromName(file.name)} - {file.name} -

+ {file.name} + +
))}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx index 877b16627..6b4d86b5a 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx @@ -6,7 +6,9 @@ import { BaiduSearchApiConfig } from "./components/baidu-search-api-config"; import { BookStackConfig } from "./components/bookstack-config"; import { CirclebackConfig } from "./components/circleback-config"; import { ClickUpConfig } from "./components/clickup-config"; -import { ComposioConfig } from "./components/composio-config"; +import { ComposioCalendarConfig } from "./components/composio-calendar-config"; +import { ComposioDriveConfig } from "./components/composio-drive-config"; +import { ComposioGmailConfig } from "./components/composio-gmail-config"; import { ConfluenceConfig } from "./components/confluence-config"; import { DiscordConfig } from "./components/discord-config"; import { ElasticsearchConfig } from "./components/elasticsearch-config"; @@ -77,8 +79,12 @@ export function getConnectorConfigComponent( return MCPConfig; case "OBSIDIAN_CONNECTOR": return ObsidianConfig; - case "COMPOSIO_CONNECTOR": - return ComposioConfig; + case "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": + return ComposioDriveConfig; + case "COMPOSIO_GMAIL_CONNECTOR": + return ComposioGmailConfig; + case "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": + return ComposioCalendarConfig; // OAuth connectors (Gmail, Calendar, Airtable, Notion) and others don't need special config UI default: return null; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx index 66afd84a5..5668d398e 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx @@ -9,6 +9,7 @@ import { cn } from "@/lib/utils"; import { DateRangeSelector } from "../../components/date-range-selector"; import { PeriodicSyncConfig } from "../../components/periodic-sync-config"; import { getConnectorConfigComponent } from "../index"; +import { getConnectorDisplayName } from "../../tabs/all-connectors-tab"; interface ConnectorEditViewProps { connector: SearchSourceConnector; @@ -97,12 +98,16 @@ export const ConnectorEditView: FC = ({ }; }, [checkScrollState]); - // Reset local quick indexing state when indexing completes + // Reset local quick indexing state when indexing completes or fails useEffect(() => { - if (!isIndexing) { - setIsQuickIndexing(false); + if (!isIndexing && isQuickIndexing) { + // Small delay to ensure smooth transition + const timer = setTimeout(() => { + setIsQuickIndexing(false); + }, 100); + return () => clearTimeout(timer); } - }, [isIndexing]); + }, [isIndexing, isQuickIndexing]); const handleDisconnectClick = () => { setShowDisconnectConfirm(true); @@ -118,11 +123,11 @@ export const ConnectorEditView: FC = ({ }; const handleQuickIndex = useCallback(() => { - if (onQuickIndex) { + if (onQuickIndex && !isQuickIndexing && !isIndexing) { setIsQuickIndexing(true); onQuickIndex(); } - }, [onQuickIndex]); + }, [onQuickIndex, isQuickIndexing, isIndexing]); return (
@@ -151,7 +156,7 @@ export const ConnectorEditView: FC = ({

-                    {connector.name}
+                    {getConnectorDisplayName(connector.name)}

Manage your connector settings and sync configuration @@ -206,8 +211,9 @@ export const ConnectorEditView: FC = ({ {/* Date range selector and periodic sync - only shown for indexable connectors */} {connector.is_indexable && ( <> - {/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */} + {/* Date range selector - not shown for Google Drive (regular and Composio), Webcrawler, or GitHub (indexes full repo snapshots) */} {connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" && + connector.connector_type !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && connector.connector_type !== "WEBCRAWLER_CONNECTOR" && connector.connector_type !== "GITHUB_CONNECTOR" && ( = ({ onEndDateChange={onEndDateChange} allowFutureDates={ connector.connector_type === "GOOGLE_CALENDAR_CONNECTOR" || + connector.connector_type === "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR" || connector.connector_type === "LUMA_CONNECTOR" } /> @@ -224,8 +231,11 @@ export const ConnectorEditView: FC = ({ {/* Periodic sync - shown for all indexable connectors */} {(() => { - // Check if Google Drive has folders/files selected + // Check if Google Drive (regular or Composio) has folders/files selected const isGoogleDrive = connector.connector_type === "GOOGLE_DRIVE_CONNECTOR"; + const isComposioGoogleDrive = + connector.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR"; + const requiresFolderSelection = isGoogleDrive || isComposioGoogleDrive; const selectedFolders = (connector.config?.selected_folders as | Array<{ id: string; name: string }> @@ -235,7 +245,7 @@ export const ConnectorEditView: FC = ({ | Array<{ id: string; name: string }> | undefined) || []; const hasItemsSelected = selectedFolders.length > 0 || selectedFiles.length > 0; - const isDisabled = isGoogleDrive && !hasItemsSelected; + const isDisabled = requiresFolderSelection && !hasItemsSelected; return ( = ({ Re-indexing runs in the background

-                      You can continue using SurfSense while we sync your data. Check the Active tab
-                      to see progress.
+                      You can continue using SurfSense while we sync your data. Check your inbox for
+                      updates.
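// Illustrative sketch only, not part of this diff: the gating rule this view applies
// before enabling periodic sync or re-indexing for Drive-style connectors (regular and
// Composio Google Drive). Helper names are hypothetical; the logic mirrors the checks
// in the hunk above (folders or files must be selected).
type ConnectorType = string;

interface SelectedItem {
	id: string;
	name: string;
}

function requiresFolderSelection(connectorType: ConnectorType): boolean {
	return (
		connectorType === "GOOGLE_DRIVE_CONNECTOR" ||
		connectorType === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR"
	);
}

function canEnablePeriodicSync(
	connectorType: ConnectorType,
	config: { selected_folders?: SelectedItem[]; selected_files?: SelectedItem[] } | undefined
): boolean {
	// Connectors that index by date range are never gated on folder selection
	if (!requiresFolderSelection(connectorType)) return true;
	const folders = config?.selected_folders ?? [];
	const files = config?.selected_files ?? [];
	return folders.length > 0 || files.length > 0;
}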

@@ -338,7 +348,7 @@ export const ConnectorEditView: FC = ({ {isSaving ? ( <> - Saving... + Saving ) : ( "Save Changes" diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx index ea489aec8..684f03252 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx @@ -9,7 +9,7 @@ import { getConnectorTypeDisplay } from "@/lib/connectors/utils"; import { cn } from "@/lib/utils"; import { DateRangeSelector } from "../../components/date-range-selector"; import { PeriodicSyncConfig } from "../../components/periodic-sync-config"; -import { type IndexingConfigState, OAUTH_CONNECTORS } from "../../constants/connector-constants"; +import type { IndexingConfigState } from "../../constants/connector-constants"; import { getConnectorDisplayName } from "../../tabs/all-connectors-tab"; import { getConnectorConfigComponent } from "../index"; @@ -91,8 +91,6 @@ export const IndexingConfigurationView: FC = ({ }; }, [checkScrollState]); - const authConnector = OAUTH_CONNECTORS.find((c) => c.connectorType === connector?.connector_type); - return (
{/* Fixed Header */} @@ -151,8 +149,9 @@ export const IndexingConfigurationView: FC = ({ {/* Date range selector and periodic sync - only shown for indexable connectors */} {connector?.is_indexable && ( <> - {/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */} + {/* Date range selector - not shown for Google Drive (regular and Composio), Webcrawler, or GitHub (indexes full repo snapshots) */} {config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && + config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && config.connectorType !== "WEBCRAWLER_CONNECTOR" && config.connectorType !== "GITHUB_CONNECTOR" && ( = ({ onEndDateChange={onEndDateChange} allowFutureDates={ config.connectorType === "GOOGLE_CALENDAR_CONNECTOR" || + config.connectorType === "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR" || config.connectorType === "LUMA_CONNECTOR" } /> )} - {/* Periodic sync - not shown for Google Drive */} - {config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && ( - - )} + {/* Periodic sync - not shown for Google Drive (regular and Composio) */} + {config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && + config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && ( + + )} )} @@ -188,8 +189,8 @@ export const IndexingConfigurationView: FC = ({

Indexing runs in the background

-                      You can continue using SurfSense while we sync your data. Check the Active tab
-                      to see progress.
+                      You can continue using SurfSense while we sync your data. Check your inbox for
+                      updates.
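// Illustrative sketch only, not part of this diff: the "check your inbox" copy above
// reflects how indexing progress is surfaced as connector_indexing notifications, which
// the useIndexingConnectors changes later in this diff consume. This is a simplified
// derivation of the spinner state; the metadata fields (connector_id, status,
// error_message) match what that hook reads, everything else here is assumed.
interface ConnectorIndexingMetadata {
	connector_id: number;
	status: "in_progress" | "completed" | "failed";
	error_message?: string;
}

interface InboxItemLike {
	type: string;
	metadata: ConnectorIndexingMetadata;
}

function deriveIndexingConnectorIds(items: InboxItemLike[]): Set<number> {
	const indexing = new Set<number>();
	for (const item of items) {
		if (item.type !== "connector_indexing") continue;
		const { connector_id, status, error_message } = item.metadata;
		if (status === "in_progress") {
			// Restore spinner state for connectors still being indexed
			indexing.add(connector_id);
		} else if (
			status === "completed" ||
			status === "failed" ||
			(error_message?.trim().length ?? 0) > 0
		) {
			// Clear spinner state once indexing finished or errored
			indexing.delete(connector_id);
		}
	}
	return indexing;
}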

diff --git a/surfsense_web/components/assistant-ui/connector-popup/constants/connector-constants.ts b/surfsense_web/components/assistant-ui/connector-popup/constants/connector-constants.ts index dd07386b6..a3e8ae272 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/constants/connector-constants.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/constants/connector-constants.ts @@ -175,14 +175,28 @@ export const OTHER_CONNECTORS = [ }, ] as const; -// Composio Connector (Single entry that opens toolkit selector) +// Composio Connectors - Individual entries for each supported toolkit export const COMPOSIO_CONNECTORS = [ { - id: "composio-connector", - title: "Composio", - description: "Connect 100+ apps via Composio (Google, Slack, Notion, etc.)", - connectorType: EnumConnectorName.COMPOSIO_CONNECTOR, - // No authEndpoint - handled via toolkit selector view + id: "composio-googledrive", + title: "Google Drive", + description: "Search your Drive files via Composio", + connectorType: EnumConnectorName.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, + authEndpoint: "/api/v1/auth/composio/connector/add/?toolkit_id=googledrive", + }, + { + id: "composio-gmail", + title: "Gmail", + description: "Search through your emails via Composio", + connectorType: EnumConnectorName.COMPOSIO_GMAIL_CONNECTOR, + authEndpoint: "/api/v1/auth/composio/connector/add/?toolkit_id=gmail", + }, + { + id: "composio-googlecalendar", + title: "Google Calendar", + description: "Search through your events via Composio", + connectorType: EnumConnectorName.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, + authEndpoint: "/api/v1/auth/composio/connector/add/?toolkit_id=googlecalendar", }, ] as const; diff --git a/surfsense_web/components/assistant-ui/connector-popup/constants/connector-popup.schemas.ts b/surfsense_web/components/assistant-ui/connector-popup/constants/connector-popup.schemas.ts index d74d66203..5a0a8e8c8 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/constants/connector-popup.schemas.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/constants/connector-popup.schemas.ts @@ -7,7 +7,9 @@ import { searchSourceConnectorTypeEnum } from "@/contracts/types/connector.types export const connectorPopupQueryParamsSchema = z.object({ modal: z.enum(["connectors"]).optional(), tab: z.enum(["all", "active"]).optional(), - view: z.enum(["configure", "edit", "connect", "youtube", "accounts", "mcp-list"]).optional(), + view: z + .enum(["configure", "edit", "connect", "youtube", "accounts", "mcp-list", "composio"]) + .optional(), connector: z.string().optional(), connectorId: z.string().optional(), connectorType: z.string().optional(), diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index c6ef1a927..639d0f7ed 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -26,7 +26,11 @@ import { import { cacheKeys } from "@/lib/query-client/cache-keys"; import { queryClient } from "@/lib/query-client/client"; import type { IndexingConfigState } from "../constants/connector-constants"; -import { OAUTH_CONNECTORS, OTHER_CONNECTORS } from "../constants/connector-constants"; +import { + COMPOSIO_CONNECTORS, + OAUTH_CONNECTORS, + OTHER_CONNECTORS, +} from "../constants/connector-constants"; import { dateRangeSchema, frequencyMinutesSchema, @@ 
-83,10 +87,6 @@ export const useConnectorDialog = () => { // MCP list view state (for managing multiple MCP connectors) const [viewingMCPList, setViewingMCPList] = useState(false); - // Composio toolkit view state - const [viewingComposio, setViewingComposio] = useState(false); - const [connectingComposioToolkit, setConnectingComposioToolkit] = useState(null); - // Track if we came from accounts list when entering edit mode const [cameFromAccountsList, setCameFromAccountsList] = useState<{ connectorType: string; @@ -159,32 +159,28 @@ export const useConnectorDialog = () => { setViewingMCPList(true); } - // Clear Composio view if view is not "composio" anymore - if (params.view !== "composio" && viewingComposio) { - setViewingComposio(false); - setConnectingComposioToolkit(null); - } - - // Handle Composio view - if (params.view === "composio" && !viewingComposio) { - setViewingComposio(true); - } - // Handle connect view if (params.view === "connect" && params.connectorType && !connectingConnectorType) { setConnectingConnectorType(params.connectorType); } // Handle accounts view - if (params.view === "accounts" && params.connectorType && !viewingAccountsType) { - const oauthConnector = OAUTH_CONNECTORS.find( - (c) => c.connectorType === params.connectorType - ); - if (oauthConnector) { - setViewingAccountsType({ - connectorType: oauthConnector.connectorType, - connectorTitle: oauthConnector.title, - }); + if (params.view === "accounts" && params.connectorType) { + // Update state if not set, or if connectorType has changed + const needsUpdate = + !viewingAccountsType || viewingAccountsType.connectorType !== params.connectorType; + + if (needsUpdate) { + // Check both OAUTH_CONNECTORS and COMPOSIO_CONNECTORS + const oauthConnector = + OAUTH_CONNECTORS.find((c) => c.connectorType === params.connectorType) || + COMPOSIO_CONNECTORS.find((c) => c.connectorType === params.connectorType); + if (oauthConnector) { + setViewingAccountsType({ + connectorType: oauthConnector.connectorType, + connectorTitle: oauthConnector.title, + }); + } } } @@ -195,7 +191,10 @@ export const useConnectorDialog = () => { // Handle configure view (for page refresh support) if (params.view === "configure" && params.connector && !indexingConfig && allConnectors) { - const oauthConnector = OAUTH_CONNECTORS.find((c) => c.id === params.connector); + // Check both OAUTH_CONNECTORS and COMPOSIO_CONNECTORS + const oauthConnector = + OAUTH_CONNECTORS.find((c) => c.id === params.connector) || + COMPOSIO_CONNECTORS.find((c) => c.id === params.connector); if (oauthConnector) { let existingConnector: SearchSourceConnector | undefined; if (params.connectorId) { @@ -293,6 +292,7 @@ export const useConnectorDialog = () => { indexingConfig, connectingConnectorType, viewingAccountsType, + viewingMCPList, ]); // Detect OAuth success / Failure and transition to config view @@ -328,58 +328,72 @@ export const useConnectorDialog = () => { return; } - if ( - params.success === "true" && - params.connector && - searchSpaceId && - params.modal === "connectors" - ) { - const oauthConnector = OAUTH_CONNECTORS.find((c) => c.id === params.connector); - if (oauthConnector) { - refetchAllConnectors().then((result) => { - if (!result.data) return; + if (params.success === "true" && searchSpaceId && params.modal === "connectors") { + refetchAllConnectors().then((result) => { + if (!result.data) return; - let newConnector: SearchSourceConnector | undefined; - if (params.connectorId) { - const connectorId = parseInt(params.connectorId, 10); - 
newConnector = result.data.find((c: SearchSourceConnector) => c.id === connectorId); - } else { + let newConnector: SearchSourceConnector | undefined; + let oauthConnector: + | (typeof OAUTH_CONNECTORS)[number] + | (typeof COMPOSIO_CONNECTORS)[number] + | undefined; + + // First, try to find connector by connectorId if provided + if (params.connectorId) { + const connectorId = parseInt(params.connectorId, 10); + newConnector = result.data.find((c: SearchSourceConnector) => c.id === connectorId); + + // If we found the connector, find the matching OAuth/Composio connector by type + if (newConnector) { + oauthConnector = + OAUTH_CONNECTORS.find((c) => c.connectorType === newConnector!.connector_type) || + COMPOSIO_CONNECTORS.find((c) => c.connectorType === newConnector!.connector_type); + } + } + + // If we don't have a connector yet, try to find by connector param + if (!newConnector && params.connector) { + oauthConnector = + OAUTH_CONNECTORS.find((c) => c.id === params.connector) || + COMPOSIO_CONNECTORS.find((c) => c.id === params.connector); + + if (oauthConnector) { newConnector = result.data.find( - (c: SearchSourceConnector) => c.connector_type === oauthConnector.connectorType + (c: SearchSourceConnector) => c.connector_type === oauthConnector!.connectorType ); } + } - if (newConnector) { - const connectorValidation = searchSourceConnector.safeParse(newConnector); - if (connectorValidation.success) { - // Track connector connected event for OAuth connectors - trackConnectorConnected( - Number(searchSpaceId), - oauthConnector.connectorType, - newConnector.id - ); + if (newConnector && oauthConnector) { + const connectorValidation = searchSourceConnector.safeParse(newConnector); + if (connectorValidation.success) { + // Track connector connected event for OAuth/Composio connectors + trackConnectorConnected( + Number(searchSpaceId), + oauthConnector.connectorType, + newConnector.id + ); - const config = validateIndexingConfigState({ - connectorType: oauthConnector.connectorType, - connectorId: newConnector.id, - connectorTitle: oauthConnector.title, - }); - setIndexingConfig(config); - setIndexingConnector(newConnector); - setIndexingConnectorConfig(newConnector.config); - setIsOpen(true); - const url = new URL(window.location.href); - url.searchParams.delete("success"); - url.searchParams.set("connectorId", newConnector.id.toString()); - url.searchParams.set("view", "configure"); - window.history.replaceState({}, "", url.toString()); - } else { - console.warn("Invalid connector data after OAuth:", connectorValidation.error); - toast.error("Failed to validate connector data"); - } + const config = validateIndexingConfigState({ + connectorType: oauthConnector.connectorType, + connectorId: newConnector.id, + connectorTitle: oauthConnector.title, + }); + setIndexingConfig(config); + setIndexingConnector(newConnector); + setIndexingConnectorConfig(newConnector.config); + setIsOpen(true); + const url = new URL(window.location.href); + url.searchParams.delete("success"); + url.searchParams.set("connectorId", newConnector.id.toString()); + url.searchParams.set("view", "configure"); + window.history.replaceState({}, "", url.toString()); + } else { + console.warn("Invalid connector data after OAuth:", connectorValidation.error); + toast.error("Failed to validate connector data"); } - }); - } + } + }); } } catch (error) { // Invalid query params - log but don't crash @@ -389,17 +403,18 @@ export const useConnectorDialog = () => { // Handle OAuth connection const handleConnectOAuth = useCallback( 
- async (connector: (typeof OAUTH_CONNECTORS)[number]) => { + async (connector: (typeof OAUTH_CONNECTORS)[number] | (typeof COMPOSIO_CONNECTORS)[number]) => { if (!searchSpaceId || !connector.authEndpoint) return; // Set connecting state immediately to disable button and show spinner setConnectingId(connector.id); try { - const response = await authenticatedFetch( - `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}${connector.authEndpoint}?space_id=${searchSpaceId}`, - { method: "GET" } - ); + // Check if authEndpoint already has query parameters + const separator = connector.authEndpoint.includes("?") ? "&" : "?"; + const url = `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}${connector.authEndpoint}${separator}space_id=${searchSpaceId}`; + + const response = await authenticatedFetch(url, { method: "GET" }); if (!response.ok) { throw new Error(`Failed to initiate ${connector.title} OAuth`); @@ -799,23 +814,19 @@ export const useConnectorDialog = () => { // Handle viewing accounts list for OAuth connector type const handleViewAccountsList = useCallback( - (connectorType: string, connectorTitle: string) => { + (connectorType: string, _connectorTitle?: string) => { if (!searchSpaceId) return; - setViewingAccountsType({ - connectorType, - connectorTitle, - }); - // Update URL to show accounts view, preserving current tab + // The useEffect will handle setting viewingAccountsType based on URL params const url = new URL(window.location.href); url.searchParams.set("modal", "connectors"); url.searchParams.set("view", "accounts"); url.searchParams.set("connectorType", connectorType); // Keep the current tab in URL so we can go back to it - window.history.pushState({ modal: true }, "", url.toString()); + router.replace(url.pathname + url.search, { scroll: false }); }, - [searchSpaceId] + [searchSpaceId, router] ); // Handle going back from accounts list view @@ -839,8 +850,8 @@ export const useConnectorDialog = () => { const url = new URL(window.location.href); url.searchParams.set("modal", "connectors"); url.searchParams.set("view", "mcp-list"); - window.history.pushState({ modal: true }, "", url.toString()); - }, [searchSpaceId]); + router.replace(url.pathname + url.search, { scroll: false }); + }, [searchSpaceId, router]); // Handle going back from MCP list view const handleBackFromMCPList = useCallback(() => { @@ -861,71 +872,15 @@ export const useConnectorDialog = () => { router.replace(url.pathname + url.search, { scroll: false }); }, [router]); - // Handle opening Composio toolkit view - const handleOpenComposio = useCallback(() => { - if (!searchSpaceId) return; - - setViewingComposio(true); - - // Update URL to show Composio view - const url = new URL(window.location.href); - url.searchParams.set("modal", "connectors"); - url.searchParams.set("view", "composio"); - window.history.pushState({ modal: true }, "", url.toString()); - }, [searchSpaceId]); - - // Handle going back from Composio view - const handleBackFromComposio = useCallback(() => { - setViewingComposio(false); - setConnectingComposioToolkit(null); - const url = new URL(window.location.href); - url.searchParams.set("modal", "connectors"); - url.searchParams.delete("view"); - router.replace(url.pathname + url.search, { scroll: false }); - }, [router]); - - // Handle connecting a Composio toolkit - const handleConnectComposioToolkit = useCallback( - async (toolkitId: string) => { - if (!searchSpaceId) return; - - setConnectingComposioToolkit(toolkitId); - - try { - const response = await authenticatedFetch( - 
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/auth/composio/connector/add?space_id=${searchSpaceId}&toolkit_id=${toolkitId}`, - { method: "GET" } - ); - - if (!response.ok) { - throw new Error(`Failed to initiate Composio OAuth for ${toolkitId}`); - } - - const data = await response.json(); - - if (data.auth_url) { - // Redirect to Composio OAuth - window.location.href = data.auth_url; - } else { - throw new Error("No authorization URL received from Composio"); - } - } catch (error) { - console.error("Error connecting Composio toolkit:", error); - toast.error(`Failed to connect ${toolkitId}. Please try again.`); - setConnectingComposioToolkit(null); - } - }, - [searchSpaceId] - ); - // Handle starting indexing const handleStartIndexing = useCallback( async (refreshConnectors: () => void) => { if (!indexingConfig || !searchSpaceId) return; - // Validate date range (skip for Google Drive and Webcrawler) + // Validate date range (skip for Google Drive, Composio Drive, and Webcrawler) if ( indexingConfig.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && + indexingConfig.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && indexingConfig.connectorType !== "WEBCRAWLER_CONNECTOR" ) { const dateRangeValidation = dateRangeSchema.safeParse({ startDate, endDate }); @@ -970,8 +925,12 @@ export const useConnectorDialog = () => { }); } - // Handle Google Drive folder selection - if (indexingConfig.connectorType === "GOOGLE_DRIVE_CONNECTOR" && indexingConnectorConfig) { + // Handle Google Drive folder selection (regular and Composio) + if ( + (indexingConfig.connectorType === "GOOGLE_DRIVE_CONNECTOR" || + indexingConfig.connectorType === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR") && + indexingConnectorConfig + ) { const selectedFolders = indexingConnectorConfig.selected_folders as | Array<{ id: string; name: string }> | undefined; @@ -1191,8 +1150,12 @@ export const useConnectorDialog = () => { return; } - // Prevent periodic indexing for Google Drive without folders/files selected - if (periodicEnabled && editingConnector.connector_type === "GOOGLE_DRIVE_CONNECTOR") { + // Prevent periodic indexing for Google Drive (regular or Composio) without folders/files selected + if ( + periodicEnabled && + (editingConnector.connector_type === "GOOGLE_DRIVE_CONNECTOR" || + editingConnector.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR") + ) { const selectedFolders = (connectorConfig || editingConnector.config)?.selected_folders as | Array<{ id: string; name: string }> | undefined; @@ -1241,8 +1204,11 @@ export const useConnectorDialog = () => { if (!editingConnector.is_indexable) { // Non-indexable connectors (like Tavily API) don't need re-indexing indexingDescription = "Settings saved."; - } else if (editingConnector.connector_type === "GOOGLE_DRIVE_CONNECTOR") { - // Google Drive uses folder selection from config, not date ranges + } else if ( + editingConnector.connector_type === "GOOGLE_DRIVE_CONNECTOR" || + editingConnector.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" + ) { + // Google Drive (both regular and Composio) uses folder selection from config, not date ranges const selectedFolders = (connectorConfig || editingConnector.config)?.selected_folders as | Array<{ id: string; name: string }> | undefined; @@ -1423,13 +1389,24 @@ export const useConnectorDialog = () => { setIsDisconnecting(false); } }, - [editingConnector, searchSpaceId, deleteConnector, router] + [editingConnector, searchSpaceId, deleteConnector, router, cameFromMCPList] ); - // Handle quick index (index without date picker, 
uses backend defaults) + // Handle quick index (index with selected date range, or backend defaults if none selected) const handleQuickIndexConnector = useCallback( - async (connectorId: number, connectorType?: string) => { - if (!searchSpaceId) return; + async ( + connectorId: number, + connectorType?: string, + stopIndexing?: (id: number) => void, + startDate?: Date, + endDate?: Date + ) => { + if (!searchSpaceId) { + if (stopIndexing) { + stopIndexing(connectorId); + } + return; + } // Track quick index clicked event if (connectorType) { @@ -1437,10 +1414,16 @@ export const useConnectorDialog = () => { } try { + // Format dates if provided, otherwise pass undefined (backend will use defaults) + const startDateStr = startDate ? format(startDate, "yyyy-MM-dd") : undefined; + const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined; + await indexConnector({ connector_id: connectorId, queryParams: { search_space_id: searchSpaceId, + start_date: startDateStr, + end_date: endDateStr, }, }); toast.success("Indexing started", { @@ -1451,12 +1434,18 @@ export const useConnectorDialog = () => { queryClient.invalidateQueries({ queryKey: cacheKeys.logs.summary(Number(searchSpaceId)), }); + // Note: Don't call stopIndexing here - let useIndexingConnectors hook + // detect when last_indexed_at changes via Electric SQL } catch (error) { console.error("Error indexing connector content:", error); toast.error(error instanceof Error ? error.message : "Failed to start indexing"); + // Stop indexing state on error + if (stopIndexing) { + stopIndexing(connectorId); + } } }, - [searchSpaceId, indexConnector] + [searchSpaceId, indexConnector, queryClient] ); // Handle going back from edit view @@ -1578,7 +1567,6 @@ export const useConnectorDialog = () => { allConnectors, viewingAccountsType, viewingMCPList, - viewingComposio, // Setters setSearchQuery, @@ -1614,12 +1602,5 @@ export const useConnectorDialog = () => { connectorConfig, setConnectorConfig, setIndexingConnectorConfig, - - // Composio - viewingComposio, - connectingComposioToolkit, - handleOpenComposio, - handleBackFromComposio, - handleConnectComposioToolkit, }; }; diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts index 2ac8d340a..19741e020 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts @@ -2,17 +2,24 @@ import { useCallback, useEffect, useRef, useState } from "react"; import type { SearchSourceConnector } from "@/contracts/types/connector.types"; +import type { InboxItem } from "@/contracts/types/inbox.types"; +import { isConnectorIndexingMetadata } from "@/contracts/types/inbox.types"; /** * Hook to track which connectors are currently indexing using local state. * * This provides a better UX than polling by: * 1. Setting indexing state immediately when user triggers indexing (optimistic) - * 2. Clearing indexing state when Electric SQL detects last_indexed_at changed + * 2. Detecting in_progress notifications from Electric SQL to restore state after remounts + * 3. Clearing indexing state when notifications become completed or failed + * 4. Clearing indexing state when Electric SQL detects last_indexed_at changed * * The actual `last_indexed_at` value comes from Electric SQL/PGlite, not local state. 
*/ -export function useIndexingConnectors(connectors: SearchSourceConnector[]) { +export function useIndexingConnectors( + connectors: SearchSourceConnector[], + inboxItems?: InboxItem[] +) { // Set of connector IDs that are currently indexing const [indexingConnectorIds, setIndexingConnectorIds] = useState>(new Set()); @@ -22,31 +29,71 @@ export function useIndexingConnectors(connectors: SearchSourceConnector[]) { // Detect when last_indexed_at changes (indexing completed) via Electric SQL useEffect(() => { const previousValues = previousLastIndexedAtRef.current; - const newIndexingIds = new Set(indexingConnectorIds); - let hasChanges = false; for (const connector of connectors) { const previousValue = previousValues.get(connector.id); const currentValue = connector.last_indexed_at; - // If last_indexed_at changed and connector was in indexing state, clear it + // If last_indexed_at changed, clear it from indexing state if ( previousValue !== undefined && // We've seen this connector before - previousValue !== currentValue && // Value changed - indexingConnectorIds.has(connector.id) // It was marked as indexing + previousValue !== currentValue // Value changed ) { - newIndexingIds.delete(connector.id); - hasChanges = true; + // Use functional update to access current state + setIndexingConnectorIds((prev) => { + if (prev.has(connector.id)) { + const next = new Set(prev); + next.delete(connector.id); + return next; + } + return prev; + }); } // Update previous value tracking previousValues.set(connector.id, currentValue); } + }, [connectors]); - if (hasChanges) { - setIndexingConnectorIds(newIndexingIds); - } - }, [connectors, indexingConnectorIds]); + // Detect notification status changes and update indexing state accordingly + // This restores spinner state after component remounts and handles all status transitions + useEffect(() => { + if (!inboxItems || inboxItems.length === 0) return; + + setIndexingConnectorIds((prev) => { + const newIndexingIds = new Set(prev); + let hasChanges = false; + + for (const item of inboxItems) { + // Only check connector_indexing notifications + if (item.type !== "connector_indexing") continue; + + const metadata = isConnectorIndexingMetadata(item.metadata) ? item.metadata : null; + if (!metadata) continue; + + // If status is "in_progress", add connector to indexing set + if (metadata.status === "in_progress") { + if (!newIndexingIds.has(metadata.connector_id)) { + newIndexingIds.add(metadata.connector_id); + hasChanges = true; + } + } + // If status is "completed" or "failed", remove connector from indexing set + else if ( + metadata.status === "completed" || + metadata.status === "failed" || + (metadata.error_message && metadata.error_message.trim().length > 0) + ) { + if (newIndexingIds.has(metadata.connector_id)) { + newIndexingIds.delete(metadata.connector_id); + hasChanges = true; + } + } + } + + return hasChanges ? 
newIndexingIds : prev; + }); + }, [inboxItems]); // Add a connector to the indexing set (called when indexing starts) const startIndexing = useCallback((connectorId: number) => { diff --git a/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx b/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx index a518d63a6..2067ca9ad 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx @@ -13,8 +13,9 @@ import type { SearchSourceConnector } from "@/contracts/types/connector.types"; import type { LogActiveTask, LogSummary } from "@/contracts/types/log.types"; import { connectorsApiService } from "@/lib/apis/connectors-api.service"; import { cn } from "@/lib/utils"; -import { OAUTH_CONNECTORS } from "../constants/connector-constants"; +import { COMPOSIO_CONNECTORS, OAUTH_CONNECTORS } from "../constants/connector-constants"; import { getDocumentCountForConnector } from "../utils/connector-document-mapping"; +import { getConnectorDisplayName } from "./all-connectors-tab"; interface ActiveConnectorsTabProps { searchQuery: string; @@ -113,7 +114,10 @@ export const ActiveConnectorsTab: FC = ({ // Get display info for OAuth connector type const getOAuthConnectorTypeInfo = (connectorType: string) => { - const oauthConnector = OAUTH_CONNECTORS.find((c) => c.connectorType === connectorType); + // Check both OAUTH_CONNECTORS and COMPOSIO_CONNECTORS + const oauthConnector = + OAUTH_CONNECTORS.find((c) => c.connectorType === connectorType) || + COMPOSIO_CONNECTORS.find((c) => c.connectorType === connectorType); return { title: oauthConnector?.title || @@ -260,8 +264,8 @@ export const ActiveConnectorsTab: FC = ({
-

- {connector.name} +

+ {getConnectorDisplayName(connector.name)}

{isIndexing ? ( diff --git a/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx b/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx index 65968dea8..0268ab761 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx @@ -4,7 +4,6 @@ import type { FC } from "react"; import { EnumConnectorName } from "@/contracts/enums/connector"; import type { SearchSourceConnector } from "@/contracts/types/connector.types"; import { isSelfHosted } from "@/lib/env-config"; -import { ComposioConnectorCard } from "../components/composio-connector-card"; import { ConnectorCard } from "../components/connector-card"; import { COMPOSIO_CONNECTORS, @@ -35,13 +34,14 @@ interface AllConnectorsTabProps { allConnectors: SearchSourceConnector[] | undefined; documentTypeCounts?: Record; indexingConnectorIds?: Set; - onConnectOAuth: (connector: (typeof OAUTH_CONNECTORS)[number]) => void; + onConnectOAuth: ( + connector: (typeof OAUTH_CONNECTORS)[number] | (typeof COMPOSIO_CONNECTORS)[number] + ) => void; onConnectNonOAuth?: (connectorType: string) => void; onCreateWebcrawler?: () => void; onCreateYouTubeCrawler?: () => void; onManage?: (connector: SearchSourceConnector) => void; onViewAccountsList?: (connectorType: string, connectorTitle: string) => void; - onOpenComposio?: () => void; } export const AllConnectorsTab: FC = ({ @@ -57,7 +57,6 @@ export const AllConnectorsTab: FC = ({ onCreateYouTubeCrawler, onManage, onViewAccountsList, - onOpenComposio, }) => { // Check if self-hosted mode (for showing self-hosted only connectors) const selfHosted = isSelfHosted(); @@ -93,23 +92,18 @@ export const AllConnectorsTab: FC = ({ c.description.toLowerCase().includes(searchQuery.toLowerCase()) ); - // Count Composio connectors - const composioConnectorCount = allConnectors - ? allConnectors.filter( - (c: SearchSourceConnector) => c.connector_type === EnumConnectorName.COMPOSIO_CONNECTOR - ).length - : 0; - return (
- {/* Quick Connect */} - {filteredOAuth.length > 0 && ( + {/* Managed OAuth (Composio Integrations) */} + {filteredComposio.length > 0 && (
-

Quick Connect

+

+ Managed OAuth (Composio) +

- {filteredOAuth.map((connector) => { + {filteredComposio.map((connector) => { const isConnected = connectedTypes.has(connector.connectorType); const isConnecting = connectingId === connector.id; @@ -123,18 +117,6 @@ export const AllConnectorsTab: FC = ({ const accountCount = typeConnectors.length; - // Get the most recent last_indexed_at across all accounts - const mostRecentLastIndexed = typeConnectors.reduce( - (latest, c) => { - if (!c.last_indexed_at) return latest; - if (!latest) return c.last_indexed_at; - return new Date(c.last_indexed_at) > new Date(latest) - ? c.last_indexed_at - : latest; - }, - undefined - ); - const documentCount = getDocumentCountForConnector( connector.connectorType, documentTypeCounts @@ -168,29 +150,59 @@ export const AllConnectorsTab: FC = ({
)} - {/* Composio Integrations */} - {/* {filteredComposio.length > 0 && onOpenComposio && ( + {/* Quick Connect */} + {filteredOAuth.length > 0 && (
-

Managed OAuth

- - No verification needed - +

Quick Connect

- {filteredComposio.map((connector) => ( - - ))} + {filteredOAuth.map((connector) => { + const isConnected = connectedTypes.has(connector.connectorType); + const isConnecting = connectingId === connector.id; + + // Find all connectors of this type + const typeConnectors = + isConnected && allConnectors + ? allConnectors.filter( + (c: SearchSourceConnector) => c.connector_type === connector.connectorType + ) + : []; + + const accountCount = typeConnectors.length; + + const documentCount = getDocumentCountForConnector( + connector.connectorType, + documentTypeCounts + ); + + // Check if any account is currently indexing + const isIndexing = typeConnectors.some((c) => indexingConnectorIds?.has(c.id)); + + return ( + onConnectOAuth(connector)} + onManage={ + isConnected && onViewAccountsList + ? () => onViewAccountsList(connector.connectorType, connector.title) + : undefined + } + /> + ); + })}
- )} */} + )} {/* More Integrations */} {filteredOther.length > 0 && ( diff --git a/surfsense_web/components/assistant-ui/connector-popup/utils/connector-document-mapping.ts b/surfsense_web/components/assistant-ui/connector-popup/utils/connector-document-mapping.ts index 6b721e774..090207bbb 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/utils/connector-document-mapping.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/utils/connector-document-mapping.ts @@ -31,7 +31,10 @@ export const CONNECTOR_TO_DOCUMENT_TYPE: Record = { // Special mappings (connector type differs from document type) GOOGLE_DRIVE_CONNECTOR: "GOOGLE_DRIVE_FILE", WEBCRAWLER_CONNECTOR: "CRAWLED_URL", - COMPOSIO_CONNECTOR: "COMPOSIO_CONNECTOR", + // Composio connectors map to their own document types + COMPOSIO_GOOGLE_DRIVE_CONNECTOR: "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + COMPOSIO_GMAIL_CONNECTOR: "COMPOSIO_GMAIL_CONNECTOR", + COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", }; /** diff --git a/surfsense_web/components/assistant-ui/connector-popup/views/composio-toolkit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/views/composio-toolkit-view.tsx deleted file mode 100644 index 9c0bd7223..000000000 --- a/surfsense_web/components/assistant-ui/connector-popup/views/composio-toolkit-view.tsx +++ /dev/null @@ -1,355 +0,0 @@ -"use client"; - -import { - ArrowLeft, - Calendar, - Check, - ExternalLink, - FileText, - Github, - HardDrive, - Loader2, - Mail, - MessageSquare, - Zap, -} from "lucide-react"; -import Image from "next/image"; -import type { FC } from "react"; -import { useState } from "react"; -import { Badge } from "@/components/ui/badge"; -import { Button } from "@/components/ui/button"; -import { cn } from "@/lib/utils"; - -interface ComposioToolkit { - id: string; - name: string; - description: string; - isIndexable: boolean; -} - -interface ComposioToolkitViewProps { - searchSpaceId: string; - connectedToolkits: string[]; - onBack: () => void; - onConnectToolkit: (toolkitId: string) => void; - isConnecting: boolean; - connectingToolkitId: string | null; -} - -// Available Composio toolkits -const COMPOSIO_TOOLKITS: ComposioToolkit[] = [ - { - id: "googledrive", - name: "Google Drive", - description: "Search your Drive files and documents", - isIndexable: true, - }, - { - id: "gmail", - name: "Gmail", - description: "Search through your emails", - isIndexable: true, - }, - { - id: "googlecalendar", - name: "Google Calendar", - description: "Search through your events", - isIndexable: true, - }, - { - id: "slack", - name: "Slack", - description: "Search Slack messages", - isIndexable: false, - }, - { - id: "notion", - name: "Notion", - description: "Search Notion pages", - isIndexable: false, - }, - { - id: "github", - name: "GitHub", - description: "Search repositories and code", - isIndexable: false, - }, -]; - -// Get icon for toolkit -const getToolkitIcon = (toolkitId: string, className?: string) => { - const iconClass = className || "size-5"; - - switch (toolkitId) { - case "googledrive": - return ( - Google Drive - ); - case "gmail": - return ( - Gmail - ); - case "googlecalendar": - return ( - Google Calendar - ); - case "slack": - return ( - Slack - ); - case "notion": - return ( - Notion - ); - case "github": - return ( - GitHub - ); - default: - return ; - } -}; - -export const ComposioToolkitView: FC = ({ - searchSpaceId, - connectedToolkits, - onBack, - onConnectToolkit, - isConnecting, - connectingToolkitId, -}) => { - const 
[hoveredToolkit, setHoveredToolkit] = useState(null); - - // Separate indexable and non-indexable toolkits - const indexableToolkits = COMPOSIO_TOOLKITS.filter((t) => t.isIndexable); - const nonIndexableToolkits = COMPOSIO_TOOLKITS.filter((t) => !t.isIndexable); - - return ( -
- {/* Header */} -
- {/* Back button */} - - - {/* Header content */} -
-
-
- Composio -
-
-

Composio

-

- Connect 100+ apps with managed OAuth - no verification needed -

-
-
- - Powered by Composio - - -
-
- - {/* Content */} -
- {/* Indexable Toolkits (Google Services) */} -
-
-

Google Services

- - Indexable - -
-

- Connect Google services via Composio's verified OAuth app. Your data will be - indexed and searchable. -

-
- {indexableToolkits.map((toolkit) => { - const isConnected = connectedToolkits.includes(toolkit.id); - const isThisConnecting = connectingToolkitId === toolkit.id; - - return ( -
setHoveredToolkit(toolkit.id)} - onMouseLeave={() => setHoveredToolkit(null)} - className={cn( - "group relative flex flex-col p-4 rounded-xl border transition-all duration-200", - isConnected - ? "border-emerald-500/30 bg-emerald-500/5" - : "border-border bg-card hover:border-violet-500/30 hover:bg-violet-500/5" - )} - > -
-
- {getToolkitIcon(toolkit.id, "size-5")} -
- {isConnected && ( - - - Connected - - )} -
-

{toolkit.name}

-

{toolkit.description}

- -
- ); - })} -
-
- - {/* Non-Indexable Toolkits (Coming Soon) */} -
-
-

More Integrations

- - Coming Soon - -
-

- Connect these services for future indexing support. Currently available for connection - only. -

-
- {nonIndexableToolkits.map((toolkit) => ( -
-
-
- {getToolkitIcon(toolkit.id, "size-5")} -
- - Soon - -
-

{toolkit.name}

-

{toolkit.description}

- -
- ))} -
-
- - {/* Info footer */} -
-
-
- -
-
-

Why use Composio?

-

- Composio provides pre-verified OAuth apps, so you don't need to wait for Google - app verification. Your data is securely processed through Composio's managed - authentication. -

-
-
-
-
-
- ); -}; diff --git a/surfsense_web/components/connectors/composio-drive-folder-tree.tsx b/surfsense_web/components/connectors/composio-drive-folder-tree.tsx new file mode 100644 index 000000000..76ae218cb --- /dev/null +++ b/surfsense_web/components/connectors/composio-drive-folder-tree.tsx @@ -0,0 +1,364 @@ +"use client"; + +import { + ChevronDown, + ChevronRight, + File, + FileSpreadsheet, + FileText, + FolderClosed, + FolderOpen, + HardDrive, + Image, + Loader2, + Presentation, +} from "lucide-react"; +import { useState } from "react"; +import { Checkbox } from "@/components/ui/checkbox"; +import { ScrollArea } from "@/components/ui/scroll-area"; +import { useComposioDriveFolders } from "@/hooks/use-composio-drive-folders"; +import { connectorsApiService } from "@/lib/apis/connectors-api.service"; +import { cn } from "@/lib/utils"; + +interface DriveItem { + id: string; + name: string; + mimeType: string; + isFolder: boolean; + parents?: string[]; + size?: number; + iconLink?: string; +} + +interface ItemTreeNode { + item: DriveItem; + children: DriveItem[] | null; // null = not loaded, [] = loaded but empty + isExpanded: boolean; + isLoading: boolean; +} + +interface SelectedFolder { + id: string; + name: string; +} + +interface ComposioDriveFolderTreeProps { + connectorId: number; + selectedFolders: SelectedFolder[]; + onSelectFolders: (folders: SelectedFolder[]) => void; + selectedFiles?: SelectedFolder[]; + onSelectFiles?: (files: SelectedFolder[]) => void; +} + +// Helper to get appropriate icon for file type +function getFileIcon(mimeType: string, className: string = "h-4 w-4") { + if (mimeType.includes("spreadsheet") || mimeType.includes("excel")) { + return ; + } + if (mimeType.includes("presentation") || mimeType.includes("powerpoint")) { + return ; + } + if (mimeType.includes("document") || mimeType.includes("word") || mimeType.includes("text")) { + return ; + } + if (mimeType.includes("image")) { + return ; + } + return ; +} + +export function ComposioDriveFolderTree({ + connectorId, + selectedFolders, + onSelectFolders, + selectedFiles = [], + onSelectFiles = () => {}, +}: ComposioDriveFolderTreeProps) { + const [itemStates, setItemStates] = useState>(new Map()); + + const { data: rootData, isLoading: isLoadingRoot } = useComposioDriveFolders({ + connectorId, + }); + + const rootItems = rootData?.items || []; + + const isFolderSelected = (folderId: string): boolean => { + return selectedFolders.some((f) => f.id === folderId); + }; + + const isFileSelected = (fileId: string): boolean => { + return selectedFiles.some((f) => f.id === fileId); + }; + + const toggleFolderSelection = (folderId: string, folderName: string) => { + if (isFolderSelected(folderId)) { + onSelectFolders(selectedFolders.filter((f) => f.id !== folderId)); + } else { + onSelectFolders([...selectedFolders, { id: folderId, name: folderName }]); + } + }; + + const toggleFileSelection = (fileId: string, fileName: string) => { + if (isFileSelected(fileId)) { + onSelectFiles(selectedFiles.filter((f) => f.id !== fileId)); + } else { + onSelectFiles([...selectedFiles, { id: fileId, name: fileName }]); + } + }; + + /** + * Find an item by ID across all loaded items (root and nested). 
+ */ + const findItem = (itemId: string): DriveItem | undefined => { + const state = itemStates.get(itemId); + if (state?.item) return state.item; + + const rootItem = rootItems.find((item) => item.id === itemId); + if (rootItem) return rootItem; + + for (const [, nodeState] of itemStates) { + if (nodeState.children) { + const found = nodeState.children.find((child) => child.id === itemId); + if (found) return found; + } + } + + return undefined; + }; + + /** + * Load and display contents of a specific folder. + */ + const loadFolderContents = async (folderId: string) => { + try { + setItemStates((prev) => { + const newMap = new Map(prev); + const existing = newMap.get(folderId); + if (existing) { + newMap.set(folderId, { ...existing, isLoading: true }); + } else { + const item = findItem(folderId); + if (item) { + newMap.set(folderId, { + item, + children: null, + isExpanded: false, + isLoading: true, + }); + } + } + return newMap; + }); + + const data = await connectorsApiService.listComposioDriveFolders({ + connector_id: connectorId, + parent_id: folderId, + }); + const items = data.items || []; + + setItemStates((prev) => { + const newMap = new Map(prev); + const existing = newMap.get(folderId); + const item = existing?.item || findItem(folderId); + + if (item) { + newMap.set(folderId, { + item, + children: items, + isExpanded: true, + isLoading: false, + }); + } else { + console.error(`Could not find item for folderId: ${folderId}`); + } + return newMap; + }); + } catch (error) { + console.error("Error loading folder contents:", error); + setItemStates((prev) => { + const newMap = new Map(prev); + const existing = newMap.get(folderId); + if (existing) { + newMap.set(folderId, { ...existing, isLoading: false }); + } + return newMap; + }); + } + }; + + /** + * Toggle folder expand/collapse state. + */ + const toggleFolder = async (item: DriveItem) => { + if (!item.isFolder) return; + + const state = itemStates.get(item.id); + + if (!state || state.children === null) { + await loadFolderContents(item.id); + } else { + setItemStates((prev) => { + const newMap = new Map(prev); + newMap.set(item.id, { + ...state, + isExpanded: !state.isExpanded, + }); + return newMap; + }); + } + }; + + /** + * Render a single item (folder or file) with its children. + */ + const renderItem = (item: DriveItem, level: number = 0) => { + const state = itemStates.get(item.id); + const isExpanded = state?.isExpanded || false; + const isLoading = state?.isLoading || false; + const children = state?.children; + const isFolder = item.isFolder; + const isSelected = isFolder ? isFolderSelected(item.id) : isFileSelected(item.id); + + const childFolders = children?.filter((c) => c.isFolder) || []; + const childFiles = children?.filter((c) => !c.isFolder) || []; + + const indentSize = 0.75; // Smaller indent for mobile + + return ( +
+
+ {isFolder ? ( + + ) : ( + + )} + + { + if (isFolder) { + toggleFolderSelection(item.id, item.name); + } else { + toggleFileSelection(item.id, item.name); + } + }} + className="shrink-0 h-3.5 w-3.5 sm:h-4 sm:w-4 border-slate-400/20 dark:border-white/20" + onClick={(e) => e.stopPropagation()} + /> + +
+ {isFolder ? ( + isExpanded ? ( + + ) : ( + + ) + ) : ( + getFileIcon(item.mimeType, "h-3 w-3 sm:h-4 sm:w-4") + )} +
+ + {isFolder ? ( + + ) : ( + + {item.name} + + )} +
+ + {isExpanded && isFolder && children && ( +
+ {childFolders.map((child) => renderItem(child, level + 1))} + {childFiles.map((child) => renderItem(child, level + 1))} + + {children.length === 0 && ( +
+ Empty folder +
+ )} +
+ )} +
+ ); + }; + + return ( +
+ +
+
+
+ toggleFolderSelection("root", "My Drive")} + className="shrink-0 h-3.5 w-3.5 sm:h-4 sm:w-4 border-slate-400/20 dark:border-white/20" + /> + + +
+
+ + {isLoadingRoot && ( +
+ +
+ )} + +
+ {!isLoadingRoot && rootItems.map((item) => renderItem(item, 0))} +
+ + {!isLoadingRoot && rootItems.length === 0 && ( +
+ No files or folders found in your Google Drive +
+ )} +
+
+
+ ); +} diff --git a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx index 52dc7196a..732b3099c 100644 --- a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx +++ b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx @@ -1,12 +1,13 @@ "use client"; import { useQuery, useQueryClient } from "@tanstack/react-query"; -import { useAtomValue } from "jotai"; +import { useAtomValue, useSetAtom } from "jotai"; import { Inbox, LogOut, SquareLibrary, Trash2 } from "lucide-react"; import { useParams, usePathname, useRouter } from "next/navigation"; import { useTranslations } from "next-intl"; import { useTheme } from "next-themes"; -import { useCallback, useMemo, useState } from "react"; +import { useCallback, useEffect, useMemo, useState } from "react"; +import { currentThreadAtom, resetCurrentThreadAtom } from "@/atoms/chat/current-thread.atom"; import { deleteSearchSpaceMutationAtom } from "@/atoms/search-spaces/search-space-mutation.atoms"; import { searchSpacesAtom } from "@/atoms/search-spaces/search-space-query.atoms"; import { currentUserAtom } from "@/atoms/user/user-query.atoms"; @@ -38,6 +39,17 @@ interface LayoutDataProviderProps { breadcrumb?: React.ReactNode; } +/** + * Format count for display: shows numbers up to 999, then "1k+", "2k+", etc. + */ +function formatInboxCount(count: number): string { + if (count <= 999) { + return count.toString(); + } + const thousands = Math.floor(count / 1000); + return `${thousands}k+`; +} + export function LayoutDataProvider({ searchSpaceId, children, @@ -55,11 +67,16 @@ export function LayoutDataProvider({ const { data: user } = useAtomValue(currentUserAtom); const { data: searchSpacesData, refetch: refetchSearchSpaces } = useAtomValue(searchSpacesAtom); const { mutateAsync: deleteSearchSpace } = useAtomValue(deleteSearchSpaceMutationAtom); + const currentThreadState = useAtomValue(currentThreadAtom); + const resetCurrentThread = useSetAtom(resetCurrentThreadAtom); - // Current IDs from URL + // State for handling new chat navigation when router is out of sync + const [pendingNewChat, setPendingNewChat] = useState(false); + + // Current IDs from URL, with fallback to atom for replaceState updates const currentChatId = params?.chat_id ? Number(Array.isArray(params.chat_id) ? 
params.chat_id[0] : params.chat_id) - : null; + : currentThreadState.id; // Fetch current search space (for caching purposes) useQuery({ @@ -111,6 +128,17 @@ export function LayoutDataProvider({ const [isDeletingSearchSpace, setIsDeletingSearchSpace] = useState(false); const [isLeavingSearchSpace, setIsLeavingSearchSpace] = useState(false); + // Effect to complete new chat navigation after router syncs + // This runs when handleNewChat detected an out-of-sync state and triggered a sync + useEffect(() => { + if (pendingNewChat && params?.chat_id) { + // Router is now synced (chat_id is in params), complete navigation to new-chat + resetCurrentThread(); + router.push(`/dashboard/${searchSpaceId}/new-chat`); + setPendingNewChat(false); + } + }, [pendingNewChat, params?.chat_id, router, searchSpaceId, resetCurrentThread]); + const searchSpaces: SearchSpace[] = useMemo(() => { if (!searchSpacesData || !Array.isArray(searchSpacesData)) return []; return searchSpacesData.map((space) => ({ @@ -161,18 +189,18 @@ export function LayoutDataProvider({ // Navigation items const navItems: NavItem[] = useMemo( () => [ - { - title: "Documents", - url: `/dashboard/${searchSpaceId}/documents`, - icon: SquareLibrary, - isActive: pathname?.includes("/documents"), - }, { title: "Inbox", url: "#inbox", // Special URL to indicate this is handled differently icon: Inbox, isActive: isInboxSidebarOpen, - badge: unreadCount > 0 ? (unreadCount > 99 ? "99+" : unreadCount) : undefined, + badge: unreadCount > 0 ? formatInboxCount(unreadCount) : undefined, + }, + { + title: "Documents", + url: `/dashboard/${searchSpaceId}/documents`, + icon: SquareLibrary, + isActive: pathname?.includes("/documents"), }, ], [searchSpaceId, pathname, isInboxSidebarOpen, unreadCount] @@ -278,8 +306,20 @@ export function LayoutDataProvider({ ); const handleNewChat = useCallback(() => { - router.push(`/dashboard/${searchSpaceId}/new-chat`); - }, [router, searchSpaceId]); + // Check if router is out of sync (thread created via replaceState but params don't have chat_id) + const isOutOfSync = currentThreadState.id !== null && !params?.chat_id; + + if (isOutOfSync) { + // First sync Next.js router by navigating to the current chat's actual URL + // This updates the router's internal state to match the browser URL + router.replace(`/dashboard/${searchSpaceId}/new-chat/${currentThreadState.id}`); + // Set flag to trigger navigation to new-chat after params update + setPendingNewChat(true); + } else { + // Normal navigation - router is in sync + router.push(`/dashboard/${searchSpaceId}/new-chat`); + } + }, [router, searchSpaceId, currentThreadState.id, params?.chat_id]); const handleChatSelect = useCallback( (chat: ChatItem) => { diff --git a/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx index 39f1b95bc..c094ff44a 100644 --- a/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx @@ -231,7 +231,7 @@ export function AllPrivateChatsSidebar({ initial={{ x: "-100%" }} animate={{ x: 0 }} exit={{ x: "-100%" }} - transition={{ type: "spring", damping: 25, stiffness: 300 }} + transition={{ type: "tween", duration: 0.3, ease: "easeOut" }} className="fixed inset-y-0 left-0 z-70 w-80 bg-background shadow-xl flex flex-col pointer-events-auto isolate" role="dialog" aria-modal="true" diff --git a/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx 
b/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx index 8dd593945..76dbf1aad 100644 --- a/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx @@ -231,7 +231,7 @@ export function AllSharedChatsSidebar({ initial={{ x: "-100%" }} animate={{ x: 0 }} exit={{ x: "-100%" }} - transition={{ type: "spring", damping: 25, stiffness: 300 }} + transition={{ type: "tween", duration: 0.3, ease: "easeOut" }} className="fixed inset-y-0 left-0 z-70 w-80 bg-background shadow-xl flex flex-col pointer-events-auto isolate" role="dialog" aria-modal="true" diff --git a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx index 810e3a22e..e80c6e62d 100644 --- a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx @@ -70,6 +70,17 @@ function getInitials(name: string | null | undefined, email: string | null | und return "U"; } +/** + * Format count for display: shows numbers up to 999, then "1k+", "2k+", etc. + */ +function formatInboxCount(count: number): string { + if (count <= 999) { + return count.toString(); + } + const thousands = Math.floor(count / 1000); + return `${thousands}k+`; +} + /** * Get display name for connector type */ @@ -79,6 +90,9 @@ function getConnectorTypeDisplayName(connectorType: string): string { GOOGLE_CALENDAR_CONNECTOR: "Google Calendar", GOOGLE_GMAIL_CONNECTOR: "Gmail", GOOGLE_DRIVE_CONNECTOR: "Google Drive", + COMPOSIO_GOOGLE_DRIVE_CONNECTOR: "Composio Google Drive", + COMPOSIO_GMAIL_CONNECTOR: "Composio Gmail", + COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: "Composio Google Calendar", LINEAR_CONNECTOR: "Linear", NOTION_CONNECTOR: "Notion", SLACK_CONNECTOR: "Slack", @@ -446,7 +460,7 @@ export function InboxSidebar({ initial={{ x: "-100%" }} animate={{ x: 0 }} exit={{ x: "-100%" }} - transition={{ type: "spring", damping: 25, stiffness: 300 }} + transition={{ type: "tween", duration: 0.3, ease: "easeOut" }} className="fixed inset-y-0 left-0 z-70 w-90 bg-background shadow-xl flex flex-col pointer-events-auto isolate" role="dialog" aria-modal="true" @@ -729,7 +743,7 @@ export function InboxSidebar({ {t("mentions") || "Mentions"} - {unreadMentionsCount} + {formatInboxCount(unreadMentionsCount)} @@ -741,7 +755,7 @@ export function InboxSidebar({ {t("status") || "Status"} - {unreadStatusCount} + {formatInboxCount(unreadStatusCount)} diff --git a/surfsense_web/components/layout/ui/sidebar/NavSection.tsx b/surfsense_web/components/layout/ui/sidebar/NavSection.tsx index d2d926de8..742a27bbc 100644 --- a/surfsense_web/components/layout/ui/sidebar/NavSection.tsx +++ b/surfsense_web/components/layout/ui/sidebar/NavSection.tsx @@ -39,7 +39,7 @@ export function NavSection({ items, onItemClick, isCollapsed = false }: NavSecti > {item.badge && ( - + {item.badge} )} @@ -70,7 +70,7 @@ export function NavSection({ items, onItemClick, isCollapsed = false }: NavSecti {item.title} {item.badge && ( - + {item.badge} )} diff --git a/surfsense_web/components/settings/llm-role-manager.tsx b/surfsense_web/components/settings/llm-role-manager.tsx index ba4c4970c..c41a2d3bf 100644 --- a/surfsense_web/components/settings/llm-role-manager.tsx +++ b/surfsense_web/components/settings/llm-role-manager.tsx @@ -398,7 +398,7 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) { className="flex items-center gap-2 text-xs md:text-sm h-9 md:h-10" > - {isSaving 
? "Saving..." : "Save Changes"} + {isSaving ? "Saving" : "Save Changes"}
diff --git a/surfsense_web/components/ui/dialog.tsx b/surfsense_web/components/ui/dialog.tsx index d04d76520..f3fa856d3 100644 --- a/surfsense_web/components/ui/dialog.tsx +++ b/surfsense_web/components/ui/dialog.tsx @@ -38,7 +38,7 @@ const DialogContent = React.forwardRef< ; case EnumConnectorName.OBSIDIAN_CONNECTOR: return Obsidian; - case EnumConnectorName.COMPOSIO_CONNECTOR: - return Composio; + case EnumConnectorName.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: + return Google Drive; + case EnumConnectorName.COMPOSIO_GMAIL_CONNECTOR: + return Gmail; + case EnumConnectorName.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: + return Google Calendar; // Additional cases for non-enum connector types case "YOUTUBE_CONNECTOR": return YouTube; @@ -89,8 +93,12 @@ export const getConnectorIcon = (connectorType: EnumConnectorName | string, clas return ; case "GOOGLE_DRIVE_FILE": return ; - case "COMPOSIO_CONNECTOR": - return Composio; + case "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": + return Google Drive; + case "COMPOSIO_GMAIL_CONNECTOR": + return Gmail; + case "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": + return Google Calendar; case "NOTE": return ; case "EXTENSION": diff --git a/surfsense_web/contracts/types/connector.types.ts b/surfsense_web/contracts/types/connector.types.ts index 1c81507c6..5082fe49c 100644 --- a/surfsense_web/contracts/types/connector.types.ts +++ b/surfsense_web/contracts/types/connector.types.ts @@ -28,7 +28,9 @@ export const searchSourceConnectorTypeEnum = z.enum([ "CIRCLEBACK_CONNECTOR", "MCP_CONNECTOR", "OBSIDIAN_CONNECTOR", - "COMPOSIO_CONNECTOR", + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "COMPOSIO_GMAIL_CONNECTOR", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", ]); export const searchSourceConnector = z.object({ @@ -150,6 +152,13 @@ export const googleDriveIndexBody = z.object({ name: z.string(), }) ), + indexing_options: z + .object({ + max_files_per_folder: z.number().int().min(1).max(1000), + incremental_sync: z.boolean(), + include_subfolders: z.boolean(), + }) + .optional(), }); /** diff --git a/surfsense_web/contracts/types/document.types.ts b/surfsense_web/contracts/types/document.types.ts index a8f3a3b38..01a58173e 100644 --- a/surfsense_web/contracts/types/document.types.ts +++ b/surfsense_web/contracts/types/document.types.ts @@ -25,7 +25,9 @@ export const documentTypeEnum = z.enum([ "CIRCLEBACK", "SURFSENSE_DOCS", "NOTE", - "COMPOSIO_CONNECTOR", + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "COMPOSIO_GMAIL_CONNECTOR", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", ]); export const document = z.object({ diff --git a/surfsense_web/hooks/use-composio-drive-folders.ts b/surfsense_web/hooks/use-composio-drive-folders.ts new file mode 100644 index 000000000..31e516286 --- /dev/null +++ b/surfsense_web/hooks/use-composio-drive-folders.ts @@ -0,0 +1,28 @@ +import { useQuery } from "@tanstack/react-query"; +import { connectorsApiService } from "@/lib/apis/connectors-api.service"; +import { cacheKeys } from "@/lib/query-client/cache-keys"; + +interface UseComposioDriveFoldersOptions { + connectorId: number; + parentId?: string; + enabled?: boolean; +} + +export function useComposioDriveFolders({ + connectorId, + parentId, + enabled = true, +}: UseComposioDriveFoldersOptions) { + return useQuery({ + queryKey: cacheKeys.connectors.composioDrive.folders(connectorId, parentId), + queryFn: async () => { + return connectorsApiService.listComposioDriveFolders({ + connector_id: connectorId, + parent_id: parentId, + }); + }, + enabled: enabled && !!connectorId, + staleTime: 5 * 60 * 1000, // 5 minutes + retry: 2, + }); +} diff 
--git a/surfsense_web/lib/apis/connectors-api.service.ts b/surfsense_web/lib/apis/connectors-api.service.ts index 0e4f7f4d5..567db38de 100644 --- a/surfsense_web/lib/apis/connectors-api.service.ts +++ b/surfsense_web/lib/apis/connectors-api.service.ts @@ -233,6 +233,29 @@ class ConnectorsApiService { ); }; + /** + * List Composio Google Drive folders and files + */ + listComposioDriveFolders = async (request: ListGoogleDriveFoldersRequest) => { + const parsedRequest = listGoogleDriveFoldersRequest.safeParse(request); + + if (!parsedRequest.success) { + console.error("Invalid request:", parsedRequest.error); + + const errorMessage = parsedRequest.error.issues.map((issue) => issue.message).join(", "); + throw new ValidationError(`Invalid request: ${errorMessage}`); + } + + const { connector_id, parent_id } = parsedRequest.data; + + const queryParams = parent_id ? `?parent_id=${encodeURIComponent(parent_id)}` : ""; + + return baseApiService.get( + `/api/v1/connectors/${connector_id}/composio-drive/folders${queryParams}`, + listGoogleDriveFoldersResponse + ); + }; + // ============================================================================= // MCP Connector Methods // ============================================================================= diff --git a/surfsense_web/lib/connectors/utils.ts b/surfsense_web/lib/connectors/utils.ts index a85b912ed..0ca1c1ea9 100644 --- a/surfsense_web/lib/connectors/utils.ts +++ b/surfsense_web/lib/connectors/utils.ts @@ -16,11 +16,15 @@ export const getConnectorTypeDisplay = (type: string): string => { GOOGLE_CALENDAR_CONNECTOR: "Google Calendar", GOOGLE_GMAIL_CONNECTOR: "Google Gmail", GOOGLE_DRIVE_CONNECTOR: "Google Drive", + COMPOSIO_GOOGLE_DRIVE_CONNECTOR: "Google Drive", + COMPOSIO_GMAIL_CONNECTOR: "Gmail", + COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: "Google Calendar", AIRTABLE_CONNECTOR: "Airtable", LUMA_CONNECTOR: "Luma", ELASTICSEARCH_CONNECTOR: "Elasticsearch", WEBCRAWLER_CONNECTOR: "Web Pages", CIRCLEBACK_CONNECTOR: "Circleback", + OBSIDIAN_CONNECTOR: "Obsidian", }; return typeMap[type] || type; }; diff --git a/surfsense_web/lib/query-client/cache-keys.ts b/surfsense_web/lib/query-client/cache-keys.ts index 72f2bbd54..8ffc3b786 100644 --- a/surfsense_web/lib/query-client/cache-keys.ts +++ b/surfsense_web/lib/query-client/cache-keys.ts @@ -71,6 +71,10 @@ export const cacheKeys = { folders: (connectorId: number, parentId?: string) => ["connectors", "google-drive", connectorId, "folders", parentId] as const, }, + composioDrive: { + folders: (connectorId: number, parentId?: string) => + ["connectors", "composio-drive", connectorId, "folders", parentId] as const, + }, }, comments: { byMessage: (messageId: number) => ["comments", "message", messageId] as const, diff --git a/surfsense_web/messages/en.json b/surfsense_web/messages/en.json index 94e44c8ec..8ca382669 100644 --- a/surfsense_web/messages/en.json +++ b/surfsense_web/messages/en.json @@ -157,7 +157,7 @@ "delete_note": "Delete Note", "delete_note_confirm": "Are you sure you want to delete", "action_cannot_undone": "This action cannot be undone.", - "deleting": "Deleting...", + "deleting": "Deleting", "surfsense_dashboard": "SurfSense Dashboard", "welcome_message": "Welcome to your SurfSense dashboard.", "your_search_spaces": "Your Search Spaces", @@ -498,7 +498,7 @@ "base": "Base", "all_roles_assigned": "All roles are assigned and ready to use! 
Your LLM configuration is complete.", "save_changes": "Save Changes", - "saving": "Saving...", + "saving": "Saving", "reset": "Reset", "status": "Status", "status_ready": "Ready", @@ -548,7 +548,7 @@ "log_deleted_error": "Failed to delete log", "confirm_delete_log_title": "Are you sure?", "confirm_delete_log_desc": "This action cannot be undone. This will permanently delete the log entry.", - "deleting": "Deleting..." + "deleting": "Deleting" }, "onboard": { "welcome_title": "Welcome to SurfSense", diff --git a/surfsense_web/package.json b/surfsense_web/package.json index 7ec05c95d..235f4b9db 100644 --- a/surfsense_web/package.json +++ b/surfsense_web/package.json @@ -1,6 +1,6 @@ { "name": "surfsense_web", - "version": "0.0.11", + "version": "0.0.12", "private": true, "description": "SurfSense Frontend", "scripts": { diff --git a/surfsense_web/public/connectors/obsidian.svg b/surfsense_web/public/connectors/obsidian.svg index 9fe15c4a3..b5afd5724 100644 --- a/surfsense_web/public/connectors/obsidian.svg +++ b/surfsense_web/public/connectors/obsidian.svg @@ -1,12 +1,46 @@
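To round off the Composio Drive picker pieces above, here is a minimal consumer sketch showing how `ComposioDriveFolderTree` might be wired into a connector-configuration screen. The component, its props, and the import path come from this diff; the wrapper component, its name, and the local `SelectedFolder` re-declaration (the diff keeps that interface private to the component file) are illustrative assumptions, not part of the change set.

```tsx
"use client";

import { useState } from "react";
// Path and export name as introduced in this diff.
import { ComposioDriveFolderTree } from "@/components/connectors/composio-drive-folder-tree";

// Re-declared locally for the sketch; the diff defines this privately inside the component file.
interface SelectedFolder {
	id: string;
	name: string;
}

// Hypothetical wrapper; the real settings screen is not part of this diff.
export function ComposioDrivePickerExample({ connectorId }: { connectorId: number }) {
	const [folders, setFolders] = useState<SelectedFolder[]>([]);
	const [files, setFiles] = useState<SelectedFolder[]>([]);

	return (
		<div className="space-y-2">
			<ComposioDriveFolderTree
				connectorId={connectorId}
				selectedFolders={folders}
				onSelectFolders={setFolders}
				selectedFiles={files}
				onSelectFiles={setFiles}
			/>
			{/* The selection would then be submitted to an indexing endpoint; omitted here. */}
			<p className="text-xs text-muted-foreground">
				{folders.length} folders and {files.length} files selected
			</p>
		</div>
	);
}
```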