diff --git a/Dockerfile.allinone b/Dockerfile.allinone
index 2e160d3dc..64e99a14d 100644
--- a/Dockerfile.allinone
+++ b/Dockerfile.allinone
@@ -228,7 +228,7 @@ COPY scripts/docker/init-postgres.sh /app/init-postgres.sh
 RUN dos2unix /app/init-postgres.sh && chmod +x /app/init-postgres.sh

 # Clean up build dependencies to reduce image size
-RUN apt-get purge -y build-essential postgresql-server-dev-14 git \
+RUN apt-get purge -y build-essential postgresql-server-dev-14 \
     && apt-get autoremove -y \
     && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
diff --git a/README.md b/README.md
index 7f50b924c..4dd368c04 100644
--- a/README.md
+++ b/README.md
@@ -29,8 +29,7 @@ SurfSense is a highly customizable AI research agent, connected to external sour

 # Video

-https://github.com/user-attachments/assets/42a29ea1-d4d8-4213-9c69-972b5b806d58
-
+https://github.com/user-attachments/assets/cc0c84d3-1f2f-4f7a-b519-2ecce22310b1

 ## Podcast Sample

@@ -52,8 +51,10 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7
 - Interact in Natural Language and get cited answers.
 ### 📄 **Cited Answers**
 - Get Cited answers just like Perplexity.
+### 🧩 **Universal Compatibility**
+- Connect virtually any inference provider via the OpenAI spec and LiteLLM.
 ### 🔔 **Privacy & Local LLM Support**
-- Works Flawlessly with Ollama local LLMs.
+- Works Flawlessly with local LLMs like vLLM and Ollama.
 ### 🏠 **Self Hostable**
 - Open source and easy to deploy locally.
 ### 👥 **Team Collaboration with RBAC**
@@ -61,6 +62,7 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7
 - Invite team members with customizable roles (Owner, Admin, Editor, Viewer)
 - Granular permissions for documents, chats, connectors, and settings
 - Share knowledge bases securely within your organization
+- Team chats update in real time, and members can "Chat about the chat" in comment threads
 ### 🎙️ Podcasts
 - Blazingly fast podcast generation agent. (Creates a 3-minute podcast in under 20 seconds.)
 - Convert your chat conversations into engaging audio content
@@ -237,6 +239,8 @@ Before self-hosting installation, make sure to complete the [prerequisite setup

 ### **BackEnd**

+- **LiteLLM**: Universal LLM integration supporting 100+ models (OpenAI, Anthropic, Ollama, etc.)
+
 - **FastAPI**: Modern, fast web framework for building APIs with Python

 - **PostgreSQL with pgvector**: Database with vector search capabilities for similarity searches

@@ -253,8 +257,6 @@ Before self-hosting installation, make sure to complete the [prerequisite setup

 - **LangChain**: Framework for developing AI-powered applications.

-- **LiteLLM**: Universal LLM integration supporting 100+ models (OpenAI, Anthropic, Ollama, etc.)
- - **Rerankers**: Advanced result ranking for improved search relevance - **Hybrid Search**: Combines vector similarity and full-text search for optimal results using Reciprocal Rank Fusion (RRF) diff --git a/surfsense_backend/Dockerfile b/surfsense_backend/Dockerfile index fa3aaeae8..9ce6467b3 100644 --- a/surfsense_backend/Dockerfile +++ b/surfsense_backend/Dockerfile @@ -19,6 +19,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libxext6 \ libxrender1 \ dos2unix \ + git \ && rm -rf /var/lib/apt/lists/* # Update certificates and install SSL tools diff --git a/surfsense_backend/alembic/versions/79_add_composio_connector_enums.py b/surfsense_backend/alembic/versions/79_add_composio_connector_enums.py new file mode 100644 index 000000000..0869fdcc3 --- /dev/null +++ b/surfsense_backend/alembic/versions/79_add_composio_connector_enums.py @@ -0,0 +1,95 @@ +"""Add Composio connector types to SearchSourceConnectorType and DocumentType enums + +Revision ID: 79 +Revises: 78 + +This migration adds the Composio connector enum values to both: +- searchsourceconnectortype (for connector type tracking) +- documenttype (for document type tracking) + +Composio is a managed OAuth integration service that allows connecting +to various third-party services (Google Drive, Gmail, Calendar, etc.) +without requiring separate OAuth app verification. + +This migration adds three specific connector types: +- COMPOSIO_GOOGLE_DRIVE_CONNECTOR +- COMPOSIO_GMAIL_CONNECTOR +- COMPOSIO_GOOGLE_CALENDAR_CONNECTOR +""" + +from collections.abc import Sequence + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "79" +down_revision: str | None = "78" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + +# Define the ENUM type names and the new values +CONNECTOR_ENUM = "searchsourceconnectortype" +CONNECTOR_NEW_VALUES = [ + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "COMPOSIO_GMAIL_CONNECTOR", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", +] +DOCUMENT_ENUM = "documenttype" +DOCUMENT_NEW_VALUES = [ + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "COMPOSIO_GMAIL_CONNECTOR", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", +] + + +def upgrade() -> None: + """Upgrade schema - add Composio connector types to connector and document enums safely.""" + # Add each Composio connector type to searchsourceconnectortype only if not exists + for value in CONNECTOR_NEW_VALUES: + op.execute( + f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum e + JOIN pg_type t ON e.enumtypid = t.oid + WHERE t.typname = '{CONNECTOR_ENUM}' AND e.enumlabel = '{value}' + ) THEN + ALTER TYPE {CONNECTOR_ENUM} ADD VALUE '{value}'; + END IF; + END$$; + """ + ) + + # Add each Composio connector type to documenttype only if not exists + for value in DOCUMENT_NEW_VALUES: + op.execute( + f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum e + JOIN pg_type t ON e.enumtypid = t.oid + WHERE t.typname = '{DOCUMENT_ENUM}' AND e.enumlabel = '{value}' + ) THEN + ALTER TYPE {DOCUMENT_ENUM} ADD VALUE '{value}'; + END IF; + END$$; + """ + ) + + +def downgrade() -> None: + """Downgrade schema - remove Composio connector types from connector and document enums. + + Note: PostgreSQL does not support removing enum values directly. + To properly downgrade, you would need to: + 1. Delete any rows using the Composio connector type values + 2. Create new enums without the Composio connector types + 3. Alter the columns to use the new enums + 4. 
Drop the old enums + + This is left as a no-op since removing enum values is complex + and typically not needed in practice. + """ + pass diff --git a/surfsense_backend/alembic/versions/80_add_user_incentive_tasks_table.py b/surfsense_backend/alembic/versions/80_add_user_incentive_tasks_table.py new file mode 100644 index 000000000..7fcadb763 --- /dev/null +++ b/surfsense_backend/alembic/versions/80_add_user_incentive_tasks_table.py @@ -0,0 +1,97 @@ +"""Add user incentive tasks table for earning free pages + +Revision ID: 80 +Revises: 79 + +Changes: +1. Create incentive_task_type enum with GITHUB_STAR value +2. Create user_incentive_tasks table to track completed tasks +""" + +from collections.abc import Sequence + +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "80" +down_revision: str | None = "79" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Create incentive tasks infrastructure.""" + + # Check if enum already exists (handles partial migration recovery) + conn = op.get_bind() + result = conn.execute( + sa.text("SELECT 1 FROM pg_type WHERE typname = 'incentivetasktype'") + ) + enum_exists = result.fetchone() is not None + + # Create the enum type only if it doesn't exist + if not enum_exists: + incentive_task_type_enum = postgresql.ENUM( + "GITHUB_STAR", + name="incentivetasktype", + create_type=False, + ) + incentive_task_type_enum.create(op.get_bind(), checkfirst=True) + + # Check if table already exists (handles partial migration recovery) + result = conn.execute( + sa.text( + "SELECT 1 FROM information_schema.tables WHERE table_name = 'user_incentive_tasks'" + ) + ) + table_exists = result.fetchone() is not None + + if not table_exists: + # Create the user_incentive_tasks table + op.create_table( + "user_incentive_tasks", + sa.Column("id", sa.Integer(), primary_key=True, index=True), + sa.Column( + "user_id", + sa.UUID(as_uuid=True), + sa.ForeignKey("user.id", ondelete="CASCADE"), + nullable=False, + index=True, + ), + sa.Column( + "task_type", + postgresql.ENUM( + "GITHUB_STAR", name="incentivetasktype", create_type=False + ), + nullable=False, + index=True, + ), + sa.Column("pages_awarded", sa.Integer(), nullable=False), + sa.Column( + "completed_at", + sa.TIMESTAMP(timezone=True), + nullable=False, + server_default=sa.func.now(), + ), + sa.Column( + "created_at", + sa.TIMESTAMP(timezone=True), + nullable=False, + server_default=sa.func.now(), + index=True, + ), + sa.UniqueConstraint("user_id", "task_type", name="uq_user_incentive_task"), + ) + + +def downgrade() -> None: + """Remove incentive tasks infrastructure.""" + + # Drop the table + op.drop_table("user_incentive_tasks") + + # Drop the enum type + postgresql.ENUM(name="incentivetasktype").drop(op.get_bind(), checkfirst=True) diff --git a/surfsense_backend/alembic/versions/79_add_public_share_to_chat_threads.py b/surfsense_backend/alembic/versions/81_add_public_share_to_chat_threads.py similarity index 95% rename from surfsense_backend/alembic/versions/79_add_public_share_to_chat_threads.py rename to surfsense_backend/alembic/versions/81_add_public_share_to_chat_threads.py index e1c21a353..33e1a88e9 100644 --- a/surfsense_backend/alembic/versions/79_add_public_share_to_chat_threads.py +++ b/surfsense_backend/alembic/versions/81_add_public_share_to_chat_threads.py @@ -1,7 +1,7 @@ """Add public sharing columns to new_chat_threads 
-Revision ID: 79 -Revises: 78 +Revision ID: 81 +Revises: 80 Create Date: 2026-01-23 Adds public_share_token and public_share_enabled columns to enable @@ -13,8 +13,8 @@ from collections.abc import Sequence from alembic import op # revision identifiers, used by Alembic. -revision: str = "79" -down_revision: str | None = "78" +revision: str = "81" +down_revision: str | None = "80" branch_labels: str | Sequence[str] | None = None depends_on: str | Sequence[str] | None = None diff --git a/surfsense_backend/alembic/versions/80_add_thread_id_to_podcasts.py b/surfsense_backend/alembic/versions/82_add_thread_id_to_podcasts.py similarity index 90% rename from surfsense_backend/alembic/versions/80_add_thread_id_to_podcasts.py rename to surfsense_backend/alembic/versions/82_add_thread_id_to_podcasts.py index ea66a09a1..f08fe32d8 100644 --- a/surfsense_backend/alembic/versions/80_add_thread_id_to_podcasts.py +++ b/surfsense_backend/alembic/versions/82_add_thread_id_to_podcasts.py @@ -1,7 +1,7 @@ """Add thread_id to podcasts -Revision ID: 80 -Revises: 79 +Revision ID: 82 +Revises: 81 Create Date: 2026-01-23 """ @@ -10,8 +10,8 @@ from collections.abc import Sequence from alembic import op -revision: str = "80" -down_revision: str | None = "79" +revision: str = "82" +down_revision: str | None = "81" branch_labels: str | Sequence[str] | None = None depends_on: str | Sequence[str] | None = None diff --git a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py index 1a2029c42..fda22aec3 100644 --- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py +++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py @@ -7,6 +7,7 @@ via NewLLMConfig. """ from collections.abc import Sequence +from typing import Any from deepagents import create_deep_agent from langchain_core.tools import BaseTool @@ -23,6 +24,90 @@ from app.agents.new_chat.system_prompt import ( from app.agents.new_chat.tools.registry import build_tools_async from app.services.connector_service import ConnectorService +# ============================================================================= +# Connector Type Mapping +# ============================================================================= + +# Maps SearchSourceConnectorType enum values to the searchable document/connector types +# used by the knowledge_base tool. Some connectors map to different document types. 
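# For example (illustrative, not part of the patch): a search space with only
# the Google Drive and web crawler connectors enabled would resolve, through
# the mapping below plus the always-available document types, to
#   ["EXTENSION", "FILE", "NOTE", "YOUTUBE_VIDEO", "GOOGLE_DRIVE_FILE", "CRAWLED_URL"]
# once `_map_connectors_to_searchable_types` has run.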
+_CONNECTOR_TYPE_TO_SEARCHABLE: dict[str, str] = { + # Direct mappings (connector type == searchable type) + "TAVILY_API": "TAVILY_API", + "SEARXNG_API": "SEARXNG_API", + "LINKUP_API": "LINKUP_API", + "BAIDU_SEARCH_API": "BAIDU_SEARCH_API", + "SLACK_CONNECTOR": "SLACK_CONNECTOR", + "TEAMS_CONNECTOR": "TEAMS_CONNECTOR", + "NOTION_CONNECTOR": "NOTION_CONNECTOR", + "GITHUB_CONNECTOR": "GITHUB_CONNECTOR", + "LINEAR_CONNECTOR": "LINEAR_CONNECTOR", + "DISCORD_CONNECTOR": "DISCORD_CONNECTOR", + "JIRA_CONNECTOR": "JIRA_CONNECTOR", + "CONFLUENCE_CONNECTOR": "CONFLUENCE_CONNECTOR", + "CLICKUP_CONNECTOR": "CLICKUP_CONNECTOR", + "GOOGLE_CALENDAR_CONNECTOR": "GOOGLE_CALENDAR_CONNECTOR", + "GOOGLE_GMAIL_CONNECTOR": "GOOGLE_GMAIL_CONNECTOR", + "GOOGLE_DRIVE_CONNECTOR": "GOOGLE_DRIVE_FILE", # Connector type differs from document type + "AIRTABLE_CONNECTOR": "AIRTABLE_CONNECTOR", + "LUMA_CONNECTOR": "LUMA_CONNECTOR", + "ELASTICSEARCH_CONNECTOR": "ELASTICSEARCH_CONNECTOR", + "WEBCRAWLER_CONNECTOR": "CRAWLED_URL", # Maps to document type + "BOOKSTACK_CONNECTOR": "BOOKSTACK_CONNECTOR", + "CIRCLEBACK_CONNECTOR": "CIRCLEBACK", # Connector type differs from document type + "OBSIDIAN_CONNECTOR": "OBSIDIAN_CONNECTOR", + # Composio connectors + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "COMPOSIO_GMAIL_CONNECTOR": "COMPOSIO_GMAIL_CONNECTOR", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", +} + +# Document types that don't come from SearchSourceConnector but should always be searchable +_ALWAYS_AVAILABLE_DOC_TYPES: list[str] = [ + "EXTENSION", # Browser extension data + "FILE", # Uploaded files + "NOTE", # User notes + "YOUTUBE_VIDEO", # YouTube videos +] + + +def _map_connectors_to_searchable_types( + connector_types: list[Any], +) -> list[str]: + """ + Map SearchSourceConnectorType enums to searchable document/connector types. + + This function: + 1. Converts connector type enums to their searchable counterparts + 2. Includes always-available document types (EXTENSION, FILE, NOTE, YOUTUBE_VIDEO) + 3. 
Deduplicates while preserving order + + Args: + connector_types: List of SearchSourceConnectorType enum values + + Returns: + List of searchable connector/document type strings + """ + result_set: set[str] = set() + result_list: list[str] = [] + + # Add always-available document types first + for doc_type in _ALWAYS_AVAILABLE_DOC_TYPES: + if doc_type not in result_set: + result_set.add(doc_type) + result_list.append(doc_type) + + # Map each connector type to its searchable equivalent + for ct in connector_types: + # Handle both enum and string types + ct_str = ct.value if hasattr(ct, "value") else str(ct) + searchable = _CONNECTOR_TYPE_TO_SEARCHABLE.get(ct_str) + if searchable and searchable not in result_set: + result_set.add(searchable) + result_list.append(searchable) + + return result_list + + # ============================================================================= # Deep Agent Factory # ============================================================================= @@ -117,6 +202,30 @@ async def create_surfsense_deep_agent( additional_tools=[my_custom_tool] ) """ + # Discover available connectors and document types for this search space + # This enables dynamic tool docstrings that inform the LLM about what's actually available + available_connectors: list[str] | None = None + available_document_types: list[str] | None = None + + try: + # Get enabled search source connectors for this search space + connector_types = await connector_service.get_available_connectors( + search_space_id + ) + if connector_types: + # Convert enum values to strings and also include mapped document types + available_connectors = _map_connectors_to_searchable_types(connector_types) + + # Get document types that have at least one document indexed + available_document_types = await connector_service.get_available_document_types( + search_space_id + ) + except Exception as e: + # Log but don't fail - fall back to all connectors if discovery fails + import logging + + logging.warning(f"Failed to discover available connectors/document types: {e}") + # Build dependencies dict for the tools registry dependencies = { "search_space_id": search_space_id, @@ -125,6 +234,9 @@ async def create_surfsense_deep_agent( "firecrawl_api_key": firecrawl_api_key, "user_id": user_id, # Required for memory tools "thread_id": thread_id, # For podcast tool + # Dynamic connector/document type discovery for knowledge base tool + "available_connectors": available_connectors, + "available_document_types": available_document_types, } # Build tools using the async registry (includes MCP tools) diff --git a/surfsense_backend/app/agents/new_chat/tools/__init__.py b/surfsense_backend/app/agents/new_chat/tools/__init__.py index acbdbcb3a..9e1a4f19c 100644 --- a/surfsense_backend/app/agents/new_chat/tools/__init__.py +++ b/surfsense_backend/app/agents/new_chat/tools/__init__.py @@ -19,6 +19,7 @@ Available tools: # Tool factory exports (for direct use) from .display_image import create_display_image_tool from .knowledge_base import ( + CONNECTOR_DESCRIPTIONS, create_search_knowledge_base_tool, format_documents_for_context, search_knowledge_base_async, @@ -40,6 +41,8 @@ from .user_memory import create_recall_memory_tool, create_save_memory_tool __all__ = [ # Registry "BUILTIN_TOOLS", + # Knowledge base utilities + "CONNECTOR_DESCRIPTIONS", "ToolDefinition", "build_tools", # Tool factories @@ -51,7 +54,6 @@ __all__ = [ "create_scrape_webpage_tool", "create_search_knowledge_base_tool", "create_search_surfsense_docs_tool", - # Knowledge base 
utilities "format_documents_for_context", "get_all_tool_names", "get_default_enabled_tools", diff --git a/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py b/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py index 552019dda..a11e4ac38 100644 --- a/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py +++ b/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py @@ -12,7 +12,8 @@ import json from datetime import datetime from typing import Any -from langchain_core.tools import tool +from langchain_core.tools import StructuredTool +from pydantic import BaseModel, Field from sqlalchemy.ext.asyncio import AsyncSession from app.services.connector_service import ConnectorService @@ -22,6 +23,7 @@ from app.services.connector_service import ConnectorService # ============================================================================= # Canonical connector values used internally by ConnectorService +# Includes all document types and search source connectors _ALL_CONNECTORS: list[str] = [ "EXTENSION", "FILE", @@ -50,41 +52,117 @@ _ALL_CONNECTORS: list[str] = [ "CRAWLED_URL", "CIRCLEBACK", "OBSIDIAN_CONNECTOR", + # Composio connectors + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "COMPOSIO_GMAIL_CONNECTOR", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", ] +# Human-readable descriptions for each connector type +# Used for generating dynamic docstrings and informing the LLM +CONNECTOR_DESCRIPTIONS: dict[str, str] = { + "EXTENSION": "Web content saved via SurfSense browser extension (personal browsing history)", + "FILE": "User-uploaded documents (PDFs, Word, etc.) (personal files)", + "NOTE": "SurfSense Notes (notes created inside SurfSense)", + "SLACK_CONNECTOR": "Slack conversations and shared content (personal workspace communications)", + "TEAMS_CONNECTOR": "Microsoft Teams messages and conversations (personal Teams communications)", + "NOTION_CONNECTOR": "Notion workspace pages and databases (personal knowledge management)", + "YOUTUBE_VIDEO": "YouTube video transcripts and metadata (personally saved videos)", + "GITHUB_CONNECTOR": "GitHub repository content and issues (personal repositories and interactions)", + "ELASTICSEARCH_CONNECTOR": "Elasticsearch indexed documents and data (personal Elasticsearch instances)", + "LINEAR_CONNECTOR": "Linear project issues and discussions (personal project management)", + "JIRA_CONNECTOR": "Jira project issues, tickets, and comments (personal project tracking)", + "CONFLUENCE_CONNECTOR": "Confluence pages and comments (personal project documentation)", + "CLICKUP_CONNECTOR": "ClickUp tasks and project data (personal task management)", + "GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events, meetings, and schedules (personal calendar)", + "GOOGLE_GMAIL_CONNECTOR": "Google Gmail emails and conversations (personal emails)", + "GOOGLE_DRIVE_FILE": "Google Drive files and documents (personal cloud storage)", + "DISCORD_CONNECTOR": "Discord server conversations and shared content (personal community)", + "AIRTABLE_CONNECTOR": "Airtable records, tables, and database content (personal data)", + "TAVILY_API": "Tavily web search API results (real-time web search)", + "SEARXNG_API": "SearxNG search API results (privacy-focused web search)", + "LINKUP_API": "Linkup search API results (web search)", + "BAIDU_SEARCH_API": "Baidu search API results (Chinese web search)", + "LUMA_CONNECTOR": "Luma events and meetings", + "WEBCRAWLER_CONNECTOR": "Webpages indexed by SurfSense (personally selected websites)", + "CRAWLED_URL": "Webpages indexed by 
SurfSense (personally selected websites)", + "BOOKSTACK_CONNECTOR": "BookStack pages (personal documentation)", + "CIRCLEBACK": "Circleback meeting notes, transcripts, and action items", + "OBSIDIAN_CONNECTOR": "Obsidian vault notes and markdown files (personal notes)", + # Composio connectors + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "Google Drive files via Composio (personal cloud storage)", + "COMPOSIO_GMAIL_CONNECTOR": "Gmail emails via Composio (personal emails)", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events via Composio (personal calendar)", +} -def _normalize_connectors(connectors_to_search: list[str] | None) -> list[str]: + +def _normalize_connectors( + connectors_to_search: list[str] | None, + available_connectors: list[str] | None = None, +) -> list[str]: """ Normalize connectors provided by the model. - Accepts user-facing enums like WEBCRAWLER_CONNECTOR and maps them to canonical ConnectorService types. - Drops unknown values. - - If None/empty, defaults to searching across all known connectors. + - If available_connectors is provided, only includes connectors from that list. + - If connectors_to_search is None/empty, defaults to available_connectors or all. + + Args: + connectors_to_search: List of connectors requested by the model + available_connectors: List of connectors actually available in the search space + + Returns: + List of normalized connector strings to search """ + # Determine the set of valid connectors to consider + valid_set = ( + set(available_connectors) if available_connectors else set(_ALL_CONNECTORS) + ) + if not connectors_to_search: - return list(_ALL_CONNECTORS) + # Search all available connectors if none specified + return ( + list(available_connectors) + if available_connectors + else list(_ALL_CONNECTORS) + ) normalized: list[str] = [] for raw in connectors_to_search: c = (raw or "").strip().upper() if not c: continue + # Map user-facing aliases to canonical names if c == "WEBCRAWLER_CONNECTOR": c = "CRAWLED_URL" normalized.append(c) - # de-dupe while preserving order + filter unknown + # de-dupe while preserving order + filter to valid connectors seen: set[str] = set() out: list[str] = [] for c in normalized: if c in seen: continue + # Only include if it's a known connector AND available if c not in _ALL_CONNECTORS: continue + if c not in valid_set: + continue seen.add(c) out.append(c) - return out if out else list(_ALL_CONNECTORS) + + # Fallback to all available if nothing matched + return ( + out + if out + else ( + list(available_connectors) + if available_connectors + else list(_ALL_CONNECTORS) + ) + ) # ============================================================================= @@ -233,6 +311,7 @@ async def search_knowledge_base_async( top_k: int = 10, start_date: datetime | None = None, end_date: datetime | None = None, + available_connectors: list[str] | None = None, ) -> str: """ Search the user's knowledge base for relevant documents. @@ -248,6 +327,8 @@ async def search_knowledge_base_async( top_k: Number of results per connector start_date: Optional start datetime (UTC) for filtering documents end_date: Optional end datetime (UTC) for filtering documents + available_connectors: Optional list of connectors actually available in the search space. + If provided, only these connectors will be searched. 
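# Illustrative sketch (not part of the patch): how alias mapping composes with
# the availability filter in `_normalize_connectors` above.
#
#     _normalize_connectors(
#         ["webcrawler_connector", "SLACK_CONNECTOR", "NOT_A_CONNECTOR"],
#         available_connectors=["CRAWLED_URL", "FILE", "NOTE"],
#     )
#     # -> ["CRAWLED_URL"]
#
# The alias maps to CRAWLED_URL and survives the availability filter;
# SLACK_CONNECTOR is known but unavailable, and NOT_A_CONNECTOR is unknown,
# so both are dropped.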
Returns: Formatted string with search results @@ -262,7 +343,7 @@ async def search_knowledge_base_async( end_date=end_date, ) - connectors = _normalize_connectors(connectors_to_search) + connectors = _normalize_connectors(connectors_to_search, available_connectors) for connector in connectors: try: @@ -316,6 +397,16 @@ async def search_knowledge_base_async( ) all_documents.extend(chunks) + elif connector == "TEAMS_CONNECTOR": + _, chunks = await connector_service.search_teams( + user_query=query, + search_space_id=search_space_id, + top_k=top_k, + start_date=resolved_start_date, + end_date=resolved_end_date, + ) + all_documents.extend(chunks) + elif connector == "NOTION_CONNECTOR": _, chunks = await connector_service.search_notion( user_query=query, @@ -519,6 +610,39 @@ async def search_knowledge_base_async( ) all_documents.extend(chunks) + # ========================================================= + # Composio Connectors + # ========================================================= + elif connector == "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": + _, chunks = await connector_service.search_composio_google_drive( + user_query=query, + search_space_id=search_space_id, + top_k=top_k, + start_date=resolved_start_date, + end_date=resolved_end_date, + ) + all_documents.extend(chunks) + + elif connector == "COMPOSIO_GMAIL_CONNECTOR": + _, chunks = await connector_service.search_composio_gmail( + user_query=query, + search_space_id=search_space_id, + top_k=top_k, + start_date=resolved_start_date, + end_date=resolved_end_date, + ) + all_documents.extend(chunks) + + elif connector == "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": + _, chunks = await connector_service.search_composio_google_calendar( + user_query=query, + search_space_id=search_space_id, + top_k=top_k, + start_date=resolved_start_date, + end_date=resolved_end_date, + ) + all_documents.extend(chunks) + except Exception as e: print(f"Error searching connector {connector}: {e}") continue @@ -543,11 +667,68 @@ async def search_knowledge_base_async( return format_documents_for_context(deduplicated) +def _build_connector_docstring(available_connectors: list[str] | None) -> str: + """ + Build the connector documentation section for the tool docstring. + + Args: + available_connectors: List of available connector types, or None for all + + Returns: + Formatted docstring section listing available connectors + """ + connectors = available_connectors if available_connectors else list(_ALL_CONNECTORS) + + lines = [] + for connector in connectors: + # Skip internal names, prefer user-facing aliases + if connector == "CRAWLED_URL": + # Show as WEBCRAWLER_CONNECTOR for user-facing docs + description = CONNECTOR_DESCRIPTIONS.get(connector, connector) + lines.append(f"- WEBCRAWLER_CONNECTOR: {description}") + else: + description = CONNECTOR_DESCRIPTIONS.get(connector, connector) + lines.append(f"- {connector}: {description}") + + return "\n".join(lines) + + +# ============================================================================= +# Tool Input Schema +# ============================================================================= + + +class SearchKnowledgeBaseInput(BaseModel): + """Input schema for the search_knowledge_base tool.""" + + query: str = Field( + description="The search query - be specific and include key terms" + ) + top_k: int = Field( + default=10, + description="Number of results to retrieve (default: 10)", + ) + start_date: str | None = Field( + default=None, + description="Optional ISO date/datetime (e.g. 
'2025-12-12' or '2025-12-12T00:00:00+00:00')", + ) + end_date: str | None = Field( + default=None, + description="Optional ISO date/datetime (e.g. '2025-12-19' or '2025-12-19T23:59:59+00:00')", + ) + connectors_to_search: list[str] | None = Field( + default=None, + description="Optional list of connector enums to search. If omitted, searches all available.", + ) + + def create_search_knowledge_base_tool( search_space_id: int, db_session: AsyncSession, connector_service: ConnectorService, -): + available_connectors: list[str] | None = None, + available_document_types: list[str] | None = None, +) -> StructuredTool: """ Factory function to create the search_knowledge_base tool with injected dependencies. @@ -555,72 +736,57 @@ def create_search_knowledge_base_tool( search_space_id: The user's search space ID db_session: Database session connector_service: Initialized connector service + available_connectors: Optional list of connector types available in the search space. + Used to dynamically generate the tool docstring. + available_document_types: Optional list of document types that have data in the search space. + Used to inform the LLM about what data exists. Returns: - A configured tool function + A configured StructuredTool instance """ + # Build connector documentation dynamically + connector_docs = _build_connector_docstring(available_connectors) - @tool - async def search_knowledge_base( + # Build context about available document types + doc_types_info = "" + if available_document_types: + doc_types_info = f""" + +## Document types with indexed content in this search space + +The following document types have content available for search: +{", ".join(available_document_types)} + +Focus searches on these types for best results.""" + + # Build the dynamic description for the tool + # This is what the LLM sees when deciding whether/how to use the tool + dynamic_description = f"""Search the user's personal knowledge base for relevant information. + +Use this tool to find documents, notes, files, web pages, and other content that may help answer the user's question. + +IMPORTANT: +- If the user requests a specific source type (e.g. "my notes", "Slack messages"), pass `connectors_to_search=[...]` using the enums below. +- If `connectors_to_search` is omitted/empty, the system will search broadly. +- Only connectors that are enabled/configured for this search space are available.{doc_types_info} + +## Available connector enums for `connectors_to_search` + +{connector_docs} + +NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`.""" + + # Capture for closure + _available_connectors = available_connectors + + async def _search_knowledge_base_impl( query: str, top_k: int = 10, start_date: str | None = None, end_date: str | None = None, connectors_to_search: list[str] | None = None, ) -> str: - """ - Search the user's personal knowledge base for relevant information. - - Use this tool to find documents, notes, files, web pages, and other content - that may help answer the user's question. - - IMPORTANT: - - If the user requests a specific source type (e.g. "my notes", "Slack messages"), - pass `connectors_to_search=[...]` using the enums below. - - If `connectors_to_search` is omitted/empty, the system will search broadly. 
- - ## Available connector enums for `connectors_to_search` - - - EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history) - - FILE: "User-uploaded documents (PDFs, Word, etc.)" (personal files) - - NOTE: "SurfSense Notes" (notes created inside SurfSense) - - SLACK_CONNECTOR: "Slack conversations and shared content" (personal workspace communications) - - TEAMS_CONNECTOR: "Microsoft Teams messages and conversations" (personal Teams communications) - - NOTION_CONNECTOR: "Notion workspace pages and databases" (personal knowledge management) - - YOUTUBE_VIDEO: "YouTube video transcripts and metadata" (personally saved videos) - - GITHUB_CONNECTOR: "GitHub repository content and issues" (personal repositories and interactions) - - ELASTICSEARCH_CONNECTOR: "Elasticsearch indexed documents and data" (personal Elasticsearch instances and custom data sources) - - LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project management) - - JIRA_CONNECTOR: "Jira project issues, tickets, and comments" (personal project tracking) - - CONFLUENCE_CONNECTOR: "Confluence pages and comments" (personal project documentation) - - CLICKUP_CONNECTOR: "ClickUp tasks and project data" (personal task management) - - GOOGLE_CALENDAR_CONNECTOR: "Google Calendar events, meetings, and schedules" (personal calendar and time management) - - GOOGLE_GMAIL_CONNECTOR: "Google Gmail emails and conversations" (personal emails and communications) - - GOOGLE_DRIVE_FILE: "Google Drive files and documents" (personal cloud storage and file management) - - DISCORD_CONNECTOR: "Discord server conversations and shared content" (personal community communications) - - AIRTABLE_CONNECTOR: "Airtable records, tables, and database content" (personal data management and organization) - - TAVILY_API: "Tavily search API results" (personalized search results) - - SEARXNG_API: "SearxNG search API results" (personalized search results) - - LINKUP_API: "Linkup search API results" (personalized search results) - - BAIDU_SEARCH_API: "Baidu search API results" (personalized search results) - - LUMA_CONNECTOR: "Luma events" - - WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense" (personally selected websites) - - BOOKSTACK_CONNECTOR: "BookStack pages" (personal documentation) - - CIRCLEBACK: "Circleback meeting notes, transcripts, and action items" (personal meeting records) - - OBSIDIAN_CONNECTOR: "Obsidian vault notes and markdown files" (personal notes and knowledge management) - - NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`. - - Args: - query: The search query - be specific and include key terms - top_k: Number of results to retrieve (default: 10) - start_date: Optional ISO date/datetime (e.g. "2025-12-12" or "2025-12-12T00:00:00+00:00") - end_date: Optional ISO date/datetime (e.g. "2025-12-19" or "2025-12-19T23:59:59+00:00") - connectors_to_search: Optional list of connector enums to search. If omitted, searches all. 
- - Returns: - Formatted string with relevant documents and their content - """ + """Implementation function for knowledge base search.""" from app.agents.new_chat.utils import parse_date_or_datetime parsed_start: datetime | None = None @@ -640,6 +806,16 @@ def create_search_knowledge_base_tool( top_k=top_k, start_date=parsed_start, end_date=parsed_end, + available_connectors=_available_connectors, ) - return search_knowledge_base + # Create StructuredTool with dynamic description + # This properly sets the description that the LLM sees + tool = StructuredTool( + name="search_knowledge_base", + description=dynamic_description, + coroutine=_search_knowledge_base_impl, + args_schema=SearchKnowledgeBaseInput, + ) + + return tool diff --git a/surfsense_backend/app/agents/new_chat/tools/registry.py b/surfsense_backend/app/agents/new_chat/tools/registry.py index 8eeff18b8..c65445419 100644 --- a/surfsense_backend/app/agents/new_chat/tools/registry.py +++ b/surfsense_backend/app/agents/new_chat/tools/registry.py @@ -85,6 +85,7 @@ class ToolDefinition: # Contributors: Add your new tools here! BUILTIN_TOOLS: list[ToolDefinition] = [ # Core tool - searches the user's knowledge base + # Now supports dynamic connector/document type discovery ToolDefinition( name="search_knowledge_base", description="Search the user's personal knowledge base for relevant information", @@ -92,8 +93,12 @@ BUILTIN_TOOLS: list[ToolDefinition] = [ search_space_id=deps["search_space_id"], db_session=deps["db_session"], connector_service=deps["connector_service"], + # Optional: dynamically discovered connectors/document types + available_connectors=deps.get("available_connectors"), + available_document_types=deps.get("available_document_types"), ), requires=["search_space_id", "db_session", "connector_service"], + # Note: available_connectors and available_document_types are optional ), # Podcast generation tool ToolDefinition( diff --git a/surfsense_backend/app/connectors/composio_connector.py b/surfsense_backend/app/connectors/composio_connector.py index fdf57d8ea..301296378 100644 --- a/surfsense_backend/app/connectors/composio_connector.py +++ b/surfsense_backend/app/connectors/composio_connector.py @@ -1,7 +1,7 @@ """ -Composio Connector Module. +Composio Connector Base Module. -Provides a unified interface for interacting with various services via Composio, +Provides a base class for interacting with various services via Composio, primarily used during indexing operations. """ @@ -19,10 +19,10 @@ logger = logging.getLogger(__name__) class ComposioConnector: """ - Generic Composio connector for data retrieval. + Base Composio connector for data retrieval. Wraps the ComposioService to provide toolkit-specific data access - for indexing operations. + for indexing operations. Subclasses implement toolkit-specific methods. """ def __init__( @@ -89,302 +89,12 @@ class ComposioConnector: toolkit_id = await self.get_toolkit_id() return toolkit_id in INDEXABLE_TOOLKITS - # ===== Google Drive Methods ===== + @property + def session(self) -> AsyncSession: + """Get the database session.""" + return self._session - async def list_drive_files( - self, - folder_id: str | None = None, - page_token: str | None = None, - page_size: int = 100, - ) -> tuple[list[dict[str, Any]], str | None, str | None]: - """ - List files from Google Drive via Composio. - - Args: - folder_id: Optional folder ID to list contents of. - page_token: Pagination token. - page_size: Number of files per page. 
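# Sketch (an assumption, not shown in this excerpt of the diff): the removed
# Google Drive methods presumably move into a toolkit-specific subclass, in the
# same shape as the Gmail and Calendar subclasses added below. The class name
# here is hypothetical; only the base-class helpers it calls appear in this diff.

class ComposioGoogleDriveConnector(ComposioConnector):
    async def list_drive_files(
        self,
        folder_id: str | None = None,
        page_token: str | None = None,
        page_size: int = 100,
    ):
        # Resolve the Composio connected account before making any API call
        connected_account_id = await self.get_connected_account_id()
        if not connected_account_id:
            return [], None, "No connected account ID found"
        service = await self._get_service()
        return await service.get_drive_files(
            connected_account_id=connected_account_id,
            entity_id=await self.get_entity_id(),
            folder_id=folder_id,
            page_token=page_token,
            page_size=page_size,
        )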
- - Returns: - Tuple of (files list, next_page_token, error message). - """ - connected_account_id = await self.get_connected_account_id() - if not connected_account_id: - return [], None, "No connected account ID found" - - entity_id = await self.get_entity_id() - service = await self._get_service() - return await service.get_drive_files( - connected_account_id=connected_account_id, - entity_id=entity_id, - folder_id=folder_id, - page_token=page_token, - page_size=page_size, - ) - - async def get_drive_file_content( - self, file_id: str - ) -> tuple[bytes | None, str | None]: - """ - Download file content from Google Drive via Composio. - - Args: - file_id: Google Drive file ID. - - Returns: - Tuple of (file content bytes, error message). - """ - connected_account_id = await self.get_connected_account_id() - if not connected_account_id: - return None, "No connected account ID found" - - entity_id = await self.get_entity_id() - service = await self._get_service() - return await service.get_drive_file_content( - connected_account_id=connected_account_id, - entity_id=entity_id, - file_id=file_id, - ) - - # ===== Gmail Methods ===== - - async def list_gmail_messages( - self, - query: str = "", - max_results: int = 100, - ) -> tuple[list[dict[str, Any]], str | None]: - """ - List Gmail messages via Composio. - - Args: - query: Gmail search query. - max_results: Maximum number of messages. - - Returns: - Tuple of (messages list, error message). - """ - connected_account_id = await self.get_connected_account_id() - if not connected_account_id: - return [], "No connected account ID found" - - entity_id = await self.get_entity_id() - service = await self._get_service() - return await service.get_gmail_messages( - connected_account_id=connected_account_id, - entity_id=entity_id, - query=query, - max_results=max_results, - ) - - async def get_gmail_message_detail( - self, message_id: str - ) -> tuple[dict[str, Any] | None, str | None]: - """ - Get full details of a Gmail message via Composio. - - Args: - message_id: Gmail message ID. - - Returns: - Tuple of (message details, error message). - """ - connected_account_id = await self.get_connected_account_id() - if not connected_account_id: - return None, "No connected account ID found" - - entity_id = await self.get_entity_id() - service = await self._get_service() - return await service.get_gmail_message_detail( - connected_account_id=connected_account_id, - entity_id=entity_id, - message_id=message_id, - ) - - # ===== Google Calendar Methods ===== - - async def list_calendar_events( - self, - time_min: str | None = None, - time_max: str | None = None, - max_results: int = 250, - ) -> tuple[list[dict[str, Any]], str | None]: - """ - List Google Calendar events via Composio. - - Args: - time_min: Start time (RFC3339 format). - time_max: End time (RFC3339 format). - max_results: Maximum number of events. - - Returns: - Tuple of (events list, error message). - """ - connected_account_id = await self.get_connected_account_id() - if not connected_account_id: - return [], "No connected account ID found" - - entity_id = await self.get_entity_id() - service = await self._get_service() - return await service.get_calendar_events( - connected_account_id=connected_account_id, - entity_id=entity_id, - time_min=time_min, - time_max=time_max, - max_results=max_results, - ) - - # ===== Utility Methods ===== - - def format_gmail_message_to_markdown(self, message: dict[str, Any]) -> str: - """ - Format a Gmail message to markdown. 
- - Args: - message: Message object from Composio's GMAIL_FETCH_EMAILS response. - Composio structure: messageId, messageText, messageTimestamp, - payload.headers, labelIds, attachmentList - - Returns: - Formatted markdown string. - """ - try: - # Composio uses 'messageId' (camelCase) - message_id = message.get("messageId", "") or message.get("id", "") - label_ids = message.get("labelIds", []) - - # Extract headers from payload - payload = message.get("payload", {}) - headers = payload.get("headers", []) - - # Parse headers into a dict - header_dict = {} - for header in headers: - name = header.get("name", "").lower() - value = header.get("value", "") - header_dict[name] = value - - # Extract key information - subject = header_dict.get("subject", "No Subject") - from_email = header_dict.get("from", "Unknown Sender") - to_email = header_dict.get("to", "Unknown Recipient") - # Composio provides messageTimestamp directly - date_str = message.get("messageTimestamp", "") or header_dict.get( - "date", "Unknown Date" - ) - - # Build markdown content - markdown_content = f"# {subject}\n\n" - markdown_content += f"**From:** {from_email}\n" - markdown_content += f"**To:** {to_email}\n" - markdown_content += f"**Date:** {date_str}\n" - - if label_ids: - markdown_content += f"**Labels:** {', '.join(label_ids)}\n" - - markdown_content += "\n---\n\n" - - # Composio provides full message text in 'messageText' - message_text = message.get("messageText", "") - if message_text: - markdown_content += f"## Content\n\n{message_text}\n\n" - else: - # Fallback to snippet if no messageText - snippet = message.get("snippet", "") - if snippet: - markdown_content += f"## Preview\n\n{snippet}\n\n" - - # Add attachment info if present - attachments = message.get("attachmentList", []) - if attachments: - markdown_content += "## Attachments\n\n" - for att in attachments: - att_name = att.get("filename", att.get("name", "Unknown")) - markdown_content += f"- {att_name}\n" - markdown_content += "\n" - - # Add message metadata - markdown_content += "## Message Details\n\n" - markdown_content += f"- **Message ID:** {message_id}\n" - - return markdown_content - - except Exception as e: - return f"Error formatting message to markdown: {e!s}" - - def format_calendar_event_to_markdown(self, event: dict[str, Any]) -> str: - """ - Format a Google Calendar event to markdown. - - Args: - event: Event object from Google Calendar API. - - Returns: - Formatted markdown string. 
- """ - from datetime import datetime - - try: - # Extract basic event information - summary = event.get("summary", "No Title") - description = event.get("description", "") - location = event.get("location", "") - - # Extract start and end times - start = event.get("start", {}) - end = event.get("end", {}) - - start_time = start.get("dateTime") or start.get("date", "") - end_time = end.get("dateTime") or end.get("date", "") - - # Format times for display - def format_time(time_str: str) -> str: - if not time_str: - return "Unknown" - try: - if "T" in time_str: - dt = datetime.fromisoformat(time_str.replace("Z", "+00:00")) - return dt.strftime("%Y-%m-%d %H:%M") - return time_str - except Exception: - return time_str - - start_formatted = format_time(start_time) - end_formatted = format_time(end_time) - - # Extract attendees - attendees = event.get("attendees", []) - attendee_list = [] - for attendee in attendees: - email = attendee.get("email", "") - display_name = attendee.get("displayName", email) - response_status = attendee.get("responseStatus", "") - attendee_list.append(f"- {display_name} ({response_status})") - - # Build markdown content - markdown_content = f"# {summary}\n\n" - markdown_content += f"**Start:** {start_formatted}\n" - markdown_content += f"**End:** {end_formatted}\n" - - if location: - markdown_content += f"**Location:** {location}\n" - - markdown_content += "\n" - - if description: - markdown_content += f"## Description\n\n{description}\n\n" - - if attendee_list: - markdown_content += "## Attendees\n\n" - markdown_content += "\n".join(attendee_list) - markdown_content += "\n\n" - - # Add event metadata - markdown_content += "## Event Details\n\n" - markdown_content += f"- **Event ID:** {event.get('id', 'Unknown')}\n" - markdown_content += f"- **Created:** {event.get('created', 'Unknown')}\n" - markdown_content += f"- **Updated:** {event.get('updated', 'Unknown')}\n" - - return markdown_content - - except Exception as e: - return f"Error formatting event to markdown: {e!s}" + @property + def connector_id(self) -> int: + """Get the connector ID.""" + return self._connector_id diff --git a/surfsense_backend/app/connectors/composio_gmail_connector.py b/surfsense_backend/app/connectors/composio_gmail_connector.py new file mode 100644 index 000000000..953e2e8fc --- /dev/null +++ b/surfsense_backend/app/connectors/composio_gmail_connector.py @@ -0,0 +1,613 @@ +""" +Composio Gmail Connector Module. + +Provides Gmail specific methods for data retrieval and indexing via Composio. 
+""" + +import logging +from datetime import UTC, datetime +from typing import Any + +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select +from sqlalchemy.orm import selectinload + +from app.config import config +from app.connectors.composio_connector import ComposioConnector +from app.db import Document, DocumentType +from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE +from app.services.llm_service import get_user_long_context_llm +from app.services.task_logging_service import TaskLoggingService +from app.tasks.connector_indexers.base import calculate_date_range +from app.utils.document_converters import ( + create_document_chunks, + generate_content_hash, + generate_document_summary, + generate_unique_identifier_hash, +) + +logger = logging.getLogger(__name__) + + +def get_current_timestamp() -> datetime: + """Get the current timestamp with timezone for updated_at field.""" + return datetime.now(UTC) + + +async def check_document_by_unique_identifier( + session: AsyncSession, unique_identifier_hash: str +) -> Document | None: + """Check if a document with the given unique identifier hash already exists.""" + existing_doc_result = await session.execute( + select(Document) + .options(selectinload(Document.chunks)) + .where(Document.unique_identifier_hash == unique_identifier_hash) + ) + return existing_doc_result.scalars().first() + + +async def update_connector_last_indexed( + session: AsyncSession, + connector, + update_last_indexed: bool = True, +) -> None: + """Update the last_indexed_at timestamp for a connector.""" + if update_last_indexed: + connector.last_indexed_at = datetime.now(UTC) + logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") + + +class ComposioGmailConnector(ComposioConnector): + """ + Gmail specific Composio connector. + + Provides methods for listing messages, getting message details, and formatting + Gmail messages from Gmail via Composio. + """ + + async def list_gmail_messages( + self, + query: str = "", + max_results: int = 50, + page_token: str | None = None, + ) -> tuple[list[dict[str, Any]], str | None, int | None, str | None]: + """ + List Gmail messages via Composio with pagination support. + + Args: + query: Gmail search query. + max_results: Maximum number of messages per page (default: 50). + page_token: Optional pagination token for next page. + + Returns: + Tuple of (messages list, next_page_token, result_size_estimate, error message). + """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return [], None, None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_gmail_messages( + connected_account_id=connected_account_id, + entity_id=entity_id, + query=query, + max_results=max_results, + page_token=page_token, + ) + + async def get_gmail_message_detail( + self, message_id: str + ) -> tuple[dict[str, Any] | None, str | None]: + """ + Get full details of a Gmail message via Composio. + + Args: + message_id: Gmail message ID. + + Returns: + Tuple of (message details, error message). 
+ """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_gmail_message_detail( + connected_account_id=connected_account_id, + entity_id=entity_id, + message_id=message_id, + ) + + def format_gmail_message_to_markdown(self, message: dict[str, Any]) -> str: + """ + Format a Gmail message to markdown. + + Args: + message: Message object from Composio's GMAIL_FETCH_EMAILS response. + Composio structure: messageId, messageText, messageTimestamp, + payload.headers, labelIds, attachmentList + + Returns: + Formatted markdown string. + """ + try: + # Composio uses 'messageId' (camelCase) + message_id = message.get("messageId", "") or message.get("id", "") + label_ids = message.get("labelIds", []) + + # Extract headers from payload + payload = message.get("payload", {}) + headers = payload.get("headers", []) + + # Parse headers into a dict + header_dict = {} + for header in headers: + name = header.get("name", "").lower() + value = header.get("value", "") + header_dict[name] = value + + # Extract key information + subject = header_dict.get("subject", "No Subject") + from_email = header_dict.get("from", "Unknown Sender") + to_email = header_dict.get("to", "Unknown Recipient") + # Composio provides messageTimestamp directly + date_str = message.get("messageTimestamp", "") or header_dict.get( + "date", "Unknown Date" + ) + + # Build markdown content + markdown_content = f"# {subject}\n\n" + markdown_content += f"**From:** {from_email}\n" + markdown_content += f"**To:** {to_email}\n" + markdown_content += f"**Date:** {date_str}\n" + + if label_ids: + markdown_content += f"**Labels:** {', '.join(label_ids)}\n" + + markdown_content += "\n---\n\n" + + # Composio provides full message text in 'messageText' + message_text = message.get("messageText", "") + if message_text: + markdown_content += f"## Content\n\n{message_text}\n\n" + else: + # Fallback to snippet if no messageText + snippet = message.get("snippet", "") + if snippet: + markdown_content += f"## Preview\n\n{snippet}\n\n" + + # Add attachment info if present + attachments = message.get("attachmentList", []) + if attachments: + markdown_content += "## Attachments\n\n" + for att in attachments: + att_name = att.get("filename", att.get("name", "Unknown")) + markdown_content += f"- {att_name}\n" + markdown_content += "\n" + + # Add message metadata + markdown_content += "## Message Details\n\n" + markdown_content += f"- **Message ID:** {message_id}\n" + + return markdown_content + + except Exception as e: + return f"Error formatting message to markdown: {e!s}" + + +# ============ Indexer Functions ============ + + +async def _process_gmail_message_batch( + session: AsyncSession, + messages: list[dict[str, Any]], + composio_connector: ComposioGmailConnector, + connector_id: int, + search_space_id: int, + user_id: str, + total_documents_indexed: int = 0, +) -> tuple[int, int]: + """ + Process a batch of Gmail messages and index them. + + Args: + total_documents_indexed: Running total of documents indexed so far (for batch commits). 
+ + Returns: + Tuple of (documents_indexed, documents_skipped) + """ + documents_indexed = 0 + documents_skipped = 0 + + for message in messages: + try: + # Composio uses 'messageId' (camelCase), not 'id' + message_id = message.get("messageId", "") or message.get("id", "") + if not message_id: + documents_skipped += 1 + continue + + # Composio's GMAIL_FETCH_EMAILS already returns full message content + # No need for a separate detail API call + + # Extract message info from Composio response + # Composio structure: messageId, messageText, messageTimestamp, payload.headers, labelIds + payload = message.get("payload", {}) + headers = payload.get("headers", []) + + subject = "No Subject" + sender = "Unknown Sender" + date_str = message.get("messageTimestamp", "Unknown Date") + + for header in headers: + name = header.get("name", "").lower() + value = header.get("value", "") + if name == "subject": + subject = value + elif name == "from": + sender = value + elif name == "date": + date_str = value + + # Format to markdown using the full message data + markdown_content = composio_connector.format_gmail_message_to_markdown( + message + ) + + # Check for empty content (defensive parsing per Composio best practices) + if not markdown_content.strip(): + logger.warning(f"Skipping Gmail message with no content: {subject}") + documents_skipped += 1 + continue + + # Generate unique identifier + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]) + unique_identifier_hash = generate_unique_identifier_hash( + document_type, f"gmail_{message_id}", search_space_id + ) + + content_hash = generate_content_hash(markdown_content, search_space_id) + + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + # Get label IDs from Composio response + label_ids = message.get("labelIds", []) + # Extract thread_id if available (for consistency with non-Composio implementation) + thread_id = message.get("threadId", "") or message.get("thread_id", "") + + if existing_document: + if existing_document.content_hash == content_hash: + documents_skipped += 1 + continue + + # Update existing + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata = { + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "document_type": "Gmail Message (Composio)", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = ( + f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" + ) + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(markdown_content) + + existing_document.title = f"Gmail: {subject}" + existing_document.content = summary_content + existing_document.content_hash = content_hash + existing_document.embedding = summary_embedding + existing_document.document_metadata = { + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "date": date_str, + "labels": label_ids, + "connector_id": connector_id, + "source": "composio", + } + existing_document.chunks = chunks + existing_document.updated_at = get_current_timestamp() + + documents_indexed += 1 + + # Batch commit every 10 documents + current_total = total_documents_indexed + documents_indexed + if current_total % 10 == 0: + logger.info( + f"Committing batch: {current_total} Gmail messages 
processed so far"
+                )
+                await session.commit()
+
+        except Exception as e:
+            logger.error(f"Error processing Gmail message: {e!s}", exc_info=True)
+            documents_skipped += 1
+            # Rollback on error to avoid partial state (per Composio best practices)
+            try:
+                await session.rollback()
+            except Exception as rollback_error:
+                logger.error(
+                    f"Error during rollback: {rollback_error!s}", exc_info=True
+                )
+            continue
+
+    return documents_indexed, documents_skipped
+
+
+async def index_composio_gmail(
+    session: AsyncSession,
+    connector,
+    connector_id: int,
+    search_space_id: int,
+    user_id: str,
+    start_date: str | None,
+    end_date: str | None,
+    task_logger: TaskLoggingService,
+    log_entry,
+    update_last_indexed: bool = True,
+    max_items: int = 1000,
+) -> tuple[int, str | None]:
+    """Index Gmail messages via Composio with pagination and incremental processing."""
+    try:
+        composio_connector = ComposioGmailConnector(session, connector_id)
+
+        # Normalize date values - handle "undefined" strings from frontend
+        if start_date == "undefined" or start_date == "":
+            start_date = None
+        if end_date == "undefined" or end_date == "":
+            end_date = None
+
+        # Use provided dates directly if both are provided, otherwise calculate from last_indexed_at
+        # This ensures user-selected dates are respected (matching non-Composio Gmail connector behavior)
+        if start_date is not None and end_date is not None:
+            # User provided both dates - use them directly
+            start_date_str = start_date
+            end_date_str = end_date
+        else:
+            # Calculate date range with defaults (uses last_indexed_at or 365 days back)
+            # This ensures indexing works even when user doesn't specify dates
+            start_date_str, end_date_str = calculate_date_range(
+                connector, start_date, end_date, default_days_back=365
+            )
+
+        # Build query with date range
+        query_parts = []
+        if start_date_str:
+            query_parts.append(f"after:{start_date_str.replace('-', '/')}")
+        if end_date_str:
+            query_parts.append(f"before:{end_date_str.replace('-', 
'/')}") + query = " ".join(query_parts) if query_parts else "" + + logger.info( + f"Gmail query for connector {connector_id}: '{query}' " + f"(start_date={start_date_str}, end_date={end_date_str})" + ) + + # Use smaller batch size to avoid 413 payload too large errors + batch_size = 50 + page_token = None + total_documents_indexed = 0 + total_documents_skipped = 0 + total_messages_fetched = 0 + result_size_estimate = None # Will be set from first API response + + while total_messages_fetched < max_items: + # Calculate how many messages to fetch in this batch + remaining = max_items - total_messages_fetched + current_batch_size = min(batch_size, remaining) + + # Use result_size_estimate if available, otherwise fall back to max_items + estimated_total = ( + result_size_estimate if result_size_estimate is not None else max_items + ) + # Cap estimated_total at max_items to avoid showing misleading progress + estimated_total = min(estimated_total, max_items) + + await task_logger.log_task_progress( + log_entry, + f"Fetching Gmail messages batch via Composio for connector {connector_id} " + f"({total_messages_fetched}/{estimated_total} fetched, {total_documents_indexed} indexed)", + { + "stage": "fetching_messages", + "batch_size": current_batch_size, + "total_fetched": total_messages_fetched, + "total_indexed": total_documents_indexed, + "estimated_total": estimated_total, + }, + ) + + # Fetch batch of messages + ( + messages, + next_token, + result_size_estimate_batch, + error, + ) = await composio_connector.list_gmail_messages( + query=query, + max_results=current_batch_size, + page_token=page_token, + ) + + if error: + await task_logger.log_task_failure( + log_entry, f"Failed to fetch Gmail messages: {error}", {} + ) + return 0, f"Failed to fetch Gmail messages: {error}" + + if not messages: + # No more messages available + break + + # Update result_size_estimate from first response (Gmail provides this estimate) + if result_size_estimate is None and result_size_estimate_batch is not None: + result_size_estimate = result_size_estimate_batch + logger.info( + f"Gmail API estimated {result_size_estimate} total messages for query: '{query}'" + ) + + total_messages_fetched += len(messages) + # Recalculate estimated_total after potentially updating result_size_estimate + estimated_total = ( + result_size_estimate if result_size_estimate is not None else max_items + ) + estimated_total = min(estimated_total, max_items) + + logger.info( + f"Fetched batch of {len(messages)} Gmail messages " + f"(total: {total_messages_fetched}/{estimated_total})" + ) + + # Process batch incrementally + batch_indexed, batch_skipped = await _process_gmail_message_batch( + session=session, + messages=messages, + composio_connector=composio_connector, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + total_documents_indexed=total_documents_indexed, + ) + + total_documents_indexed += batch_indexed + total_documents_skipped += batch_skipped + + logger.info( + f"Processed batch: {batch_indexed} indexed, {batch_skipped} skipped " + f"(total: {total_documents_indexed} indexed, {total_documents_skipped} skipped)" + ) + + # Batch commits happen in _process_gmail_message_batch every 10 documents + # This ensures progress is saved incrementally, preventing data loss on crashes + + # Check if we should continue + if not next_token: + # No more pages available + break + + if len(messages) < current_batch_size: + # Last page had fewer items than requested, we're done + break + + # Continue with 
next page + page_token = next_token + + if total_messages_fetched == 0: + success_msg = "No Gmail messages found in the specified date range" + await task_logger.log_task_success( + log_entry, success_msg, {"messages_count": 0} + ) + # CRITICAL: Update timestamp even when no messages found so Electric SQL syncs and UI shows indexed status + await update_connector_last_indexed(session, connector, update_last_indexed) + await session.commit() + return 0, None # Return None (not error) when no items found + + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) + + # Final commit to ensure all documents are persisted (safety net) + # This matches the pattern used in non-Composio Gmail indexer + logger.info( + f"Final commit: Total {total_documents_indexed} Gmail messages processed" + ) + await session.commit() + logger.info( + "Successfully committed all Composio Gmail document changes to database" + ) + + await task_logger.log_task_success( + log_entry, + f"Successfully completed Gmail indexing via Composio for connector {connector_id}", + { + "documents_indexed": total_documents_indexed, + "documents_skipped": total_documents_skipped, + "messages_fetched": total_messages_fetched, + }, + ) + + return total_documents_indexed, None + + except Exception as e: + logger.error(f"Failed to index Gmail via Composio: {e!s}", exc_info=True) + return 0, f"Failed to index Gmail via Composio: {e!s}" diff --git a/surfsense_backend/app/connectors/composio_google_calendar_connector.py b/surfsense_backend/app/connectors/composio_google_calendar_connector.py new file mode 100644 index 000000000..ec5b22b7f --- /dev/null +++ b/surfsense_backend/app/connectors/composio_google_calendar_connector.py @@ -0,0 +1,502 @@ +""" +Composio Google Calendar Connector Module. + +Provides Google Calendar specific methods for data retrieval and indexing via Composio. 
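+
+A minimal, illustrative usage sketch (assumes an active ``AsyncSession`` and an
+existing Composio connector row; error handling elided):
+
+    connector = ComposioGoogleCalendarConnector(session, connector_id)
+    events, error = await connector.list_calendar_events(
+        time_min="2024-01-01T00:00:00Z",
+        time_max="2024-12-31T23:59:59Z",
+        max_results=250,
+    )
+    for event in events:
+        markdown = connector.format_calendar_event_to_markdown(event)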
+""" + +import logging +from datetime import UTC, datetime +from typing import Any + +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select +from sqlalchemy.orm import selectinload + +from app.config import config +from app.connectors.composio_connector import ComposioConnector +from app.db import Document, DocumentType +from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE +from app.services.llm_service import get_user_long_context_llm +from app.services.task_logging_service import TaskLoggingService +from app.tasks.connector_indexers.base import ( + calculate_date_range, + check_duplicate_document_by_hash, +) +from app.utils.document_converters import ( + create_document_chunks, + generate_content_hash, + generate_document_summary, + generate_unique_identifier_hash, +) + +logger = logging.getLogger(__name__) + + +def get_current_timestamp() -> datetime: + """Get the current timestamp with timezone for updated_at field.""" + return datetime.now(UTC) + + +async def check_document_by_unique_identifier( + session: AsyncSession, unique_identifier_hash: str +) -> Document | None: + """Check if a document with the given unique identifier hash already exists.""" + existing_doc_result = await session.execute( + select(Document) + .options(selectinload(Document.chunks)) + .where(Document.unique_identifier_hash == unique_identifier_hash) + ) + return existing_doc_result.scalars().first() + + +async def update_connector_last_indexed( + session: AsyncSession, + connector, + update_last_indexed: bool = True, +) -> None: + """Update the last_indexed_at timestamp for a connector.""" + if update_last_indexed: + connector.last_indexed_at = datetime.now(UTC) + logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") + + +class ComposioGoogleCalendarConnector(ComposioConnector): + """ + Google Calendar specific Composio connector. + + Provides methods for listing calendar events and formatting them from + Google Calendar via Composio. + """ + + async def list_calendar_events( + self, + time_min: str | None = None, + time_max: str | None = None, + max_results: int = 250, + ) -> tuple[list[dict[str, Any]], str | None]: + """ + List Google Calendar events via Composio. + + Args: + time_min: Start time (RFC3339 format). + time_max: End time (RFC3339 format). + max_results: Maximum number of events. + + Returns: + Tuple of (events list, error message). + """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return [], "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_calendar_events( + connected_account_id=connected_account_id, + entity_id=entity_id, + time_min=time_min, + time_max=time_max, + max_results=max_results, + ) + + def format_calendar_event_to_markdown(self, event: dict[str, Any]) -> str: + """ + Format a Google Calendar event to markdown. + + Args: + event: Event object from Google Calendar API. + + Returns: + Formatted markdown string. 
+ """ + try: + # Extract basic event information + summary = event.get("summary", "No Title") + description = event.get("description", "") + location = event.get("location", "") + + # Extract start and end times + start = event.get("start", {}) + end = event.get("end", {}) + + start_time = start.get("dateTime") or start.get("date", "") + end_time = end.get("dateTime") or end.get("date", "") + + # Format times for display + def format_time(time_str: str) -> str: + if not time_str: + return "Unknown" + try: + if "T" in time_str: + dt = datetime.fromisoformat(time_str.replace("Z", "+00:00")) + return dt.strftime("%Y-%m-%d %H:%M") + return time_str + except Exception: + return time_str + + start_formatted = format_time(start_time) + end_formatted = format_time(end_time) + + # Extract attendees + attendees = event.get("attendees", []) + attendee_list = [] + for attendee in attendees: + email = attendee.get("email", "") + display_name = attendee.get("displayName", email) + response_status = attendee.get("responseStatus", "") + attendee_list.append(f"- {display_name} ({response_status})") + + # Build markdown content + markdown_content = f"# {summary}\n\n" + markdown_content += f"**Start:** {start_formatted}\n" + markdown_content += f"**End:** {end_formatted}\n" + + if location: + markdown_content += f"**Location:** {location}\n" + + markdown_content += "\n" + + if description: + markdown_content += f"## Description\n\n{description}\n\n" + + if attendee_list: + markdown_content += "## Attendees\n\n" + markdown_content += "\n".join(attendee_list) + markdown_content += "\n\n" + + # Add event metadata + markdown_content += "## Event Details\n\n" + markdown_content += f"- **Event ID:** {event.get('id', 'Unknown')}\n" + markdown_content += f"- **Created:** {event.get('created', 'Unknown')}\n" + markdown_content += f"- **Updated:** {event.get('updated', 'Unknown')}\n" + + return markdown_content + + except Exception as e: + return f"Error formatting event to markdown: {e!s}" + + +# ============ Indexer Functions ============ + + +async def index_composio_google_calendar( + session: AsyncSession, + connector, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str | None, + end_date: str | None, + task_logger: TaskLoggingService, + log_entry, + update_last_indexed: bool = True, + max_items: int = 2500, +) -> tuple[int, str]: + """Index Google Calendar events via Composio.""" + try: + composio_connector = ComposioGoogleCalendarConnector(session, connector_id) + + await task_logger.log_task_progress( + log_entry, + f"Fetching Google Calendar events via Composio for connector {connector_id}", + {"stage": "fetching_events"}, + ) + + # Normalize date values - handle "undefined" strings from frontend + if start_date == "undefined" or start_date == "": + start_date = None + if end_date == "undefined" or end_date == "": + end_date = None + + # Use provided dates directly if both are provided, otherwise calculate from last_indexed_at + # This ensures user-selected dates are respected (matching non-Composio Calendar connector behavior) + if start_date is not None and end_date is not None: + # User provided both dates - use them directly + start_date_str = start_date + end_date_str = end_date + else: + # Calculate date range with defaults (uses last_indexed_at or 365 days back) + # This ensures indexing works even when user doesn't specify dates + start_date_str, end_date_str = calculate_date_range( + connector, start_date, end_date, default_days_back=365 + ) + + # Build time range for API 
call + time_min = f"{start_date_str}T00:00:00Z" + time_max = f"{end_date_str}T23:59:59Z" + + logger.info( + f"Google Calendar query for connector {connector_id}: " + f"(start_date={start_date_str}, end_date={end_date_str})" + ) + + events, error = await composio_connector.list_calendar_events( + time_min=time_min, + time_max=time_max, + max_results=max_items, + ) + + if error: + await task_logger.log_task_failure( + log_entry, f"Failed to fetch Calendar events: {error}", {} + ) + return 0, f"Failed to fetch Calendar events: {error}" + + if not events: + success_msg = "No Google Calendar events found in the specified date range" + await task_logger.log_task_success( + log_entry, success_msg, {"events_count": 0} + ) + # CRITICAL: Update timestamp even when no events found so Electric SQL syncs and UI shows indexed status + await update_connector_last_indexed(session, connector, update_last_indexed) + await session.commit() + return ( + 0, + None, + ) # Return None (not error) when no items found - this is success with 0 items + + logger.info(f"Found {len(events)} Google Calendar events to index via Composio") + + documents_indexed = 0 + documents_skipped = 0 + duplicate_content_count = ( + 0 # Track events skipped due to duplicate content_hash + ) + + for event in events: + try: + # Handle both standard Google API and potential Composio variations + event_id = event.get("id", "") or event.get("eventId", "") + summary = ( + event.get("summary", "") or event.get("title", "") or "No Title" + ) + + if not event_id: + documents_skipped += 1 + continue + + # Format to markdown + markdown_content = composio_connector.format_calendar_event_to_markdown( + event + ) + + # Generate unique identifier + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"]) + unique_identifier_hash = generate_unique_identifier_hash( + document_type, f"calendar_{event_id}", search_space_id + ) + + content_hash = generate_content_hash(markdown_content, search_space_id) + + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + # Extract event times + start = event.get("start", {}) + end = event.get("end", {}) + start_time = start.get("dateTime") or start.get("date", "") + end_time = end.get("dateTime") or end.get("date", "") + location = event.get("location", "") + + if existing_document: + if existing_document.content_hash == content_hash: + documents_skipped += 1 + continue + + # Update existing + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata = { + "event_id": event_id, + "summary": summary, + "start_time": start_time, + "document_type": "Google Calendar Event (Composio)", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}" + if location: + summary_content += f"\nLocation: {location}" + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(markdown_content) + + existing_document.title = f"Calendar: {summary}" + existing_document.content = summary_content + existing_document.content_hash = content_hash + existing_document.embedding = summary_embedding + existing_document.document_metadata = { + "event_id": event_id, + "summary": summary, + "start_time": start_time, + "end_time": end_time, + "location": location, + "connector_id": 
connector_id, + "source": "composio", + } + existing_document.chunks = chunks + existing_document.updated_at = get_current_timestamp() + + documents_indexed += 1 + + # Batch commit every 10 documents + if documents_indexed % 10 == 0: + logger.info( + f"Committing batch: {documents_indexed} Google Calendar events processed so far" + ) + await session.commit() + continue + + # Document doesn't exist by unique_identifier_hash + # Check if a document with the same content_hash exists (from standard connector) + with session.no_autoflush: + duplicate_by_content = await check_duplicate_document_by_hash( + session, content_hash + ) + + if duplicate_by_content: + # A document with the same content already exists (likely from standard connector) + logger.info( + f"Event {summary} already indexed by another connector " + f"(existing document ID: {duplicate_by_content.id}, " + f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content." + ) + duplicate_content_count += 1 + documents_skipped += 1 + continue + + # Create new document + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata = { + "event_id": event_id, + "summary": summary, + "start_time": start_time, + "document_type": "Google Calendar Event (Composio)", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = ( + f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}" + ) + if location: + summary_content += f"\nLocation: {location}" + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(markdown_content) + + document = Document( + search_space_id=search_space_id, + title=f"Calendar: {summary}", + document_type=DocumentType( + TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"] + ), + document_metadata={ + "event_id": event_id, + "summary": summary, + "start_time": start_time, + "end_time": end_time, + "location": location, + "connector_id": connector_id, + "toolkit_id": "googlecalendar", + "source": "composio", + }, + content=summary_content, + content_hash=content_hash, + unique_identifier_hash=unique_identifier_hash, + embedding=summary_embedding, + chunks=chunks, + updated_at=get_current_timestamp(), + ) + session.add(document) + documents_indexed += 1 + + # Batch commit every 10 documents + if documents_indexed % 10 == 0: + logger.info( + f"Committing batch: {documents_indexed} Google Calendar events processed so far" + ) + await session.commit() + + except Exception as e: + logger.error(f"Error processing Calendar event: {e!s}", exc_info=True) + documents_skipped += 1 + continue + + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) + + # Final commit to ensure all documents are persisted (safety net) + # This matches the pattern used in non-Composio Gmail indexer + logger.info( + f"Final commit: Total {documents_indexed} Google Calendar events processed" + ) + try: + await session.commit() + logger.info( + "Successfully committed all Composio Google Calendar document changes to database" + ) + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) 
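+            # The check below matches on the error text because the duplicate-key
+            # failure can surface wrapped by different driver/ORM layers; matching
+            # both common spellings keeps the handling driver-agnostic.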
+ if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same event was indexed by multiple connectors. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if duplicates were found + warning_message = None + if duplicate_content_count > 0: + warning_message = f"{duplicate_content_count} skipped (duplicate)" + + await task_logger.log_task_success( + log_entry, + f"Successfully completed Google Calendar indexing via Composio for connector {connector_id}", + { + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + "duplicate_content_count": duplicate_content_count, + }, + ) + + logger.info( + f"Composio Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped " + f"({duplicate_content_count} due to duplicate content from other connectors)" + ) + return documents_indexed, warning_message + + except Exception as e: + logger.error( + f"Failed to index Google Calendar via Composio: {e!s}", exc_info=True + ) + return 0, f"Failed to index Google Calendar via Composio: {e!s}" diff --git a/surfsense_backend/app/connectors/composio_google_drive_connector.py b/surfsense_backend/app/connectors/composio_google_drive_connector.py new file mode 100644 index 000000000..e3b988676 --- /dev/null +++ b/surfsense_backend/app/connectors/composio_google_drive_connector.py @@ -0,0 +1,1167 @@ +""" +Composio Google Drive Connector Module. + +Provides Google Drive specific methods for data retrieval and indexing via Composio. 
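+
+A minimal, illustrative flow (assumes an active ``AsyncSession`` and an existing
+Composio connector row; error handling elided):
+
+    connector = ComposioGoogleDriveConnector(session, connector_id)
+    files, next_token, error = await connector.list_drive_files(page_size=100)
+    if files:
+        content, error = await connector.get_drive_file_content(files[0]["id"])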
+""" + +import logging +import os +import tempfile +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm.attributes import flag_modified + +from app.config import config +from app.connectors.composio_connector import ComposioConnector +from app.db import Document, DocumentType, Log +from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE +from app.services.llm_service import get_user_long_context_llm +from app.services.task_logging_service import TaskLoggingService +from app.utils.document_converters import ( + create_document_chunks, + generate_content_hash, + generate_document_summary, + generate_unique_identifier_hash, +) + +logger = logging.getLogger(__name__) + + +# Binary file extensions that need file processor +BINARY_FILE_EXTENSIONS = { + ".pdf", + ".doc", + ".docx", + ".xls", + ".xlsx", + ".ppt", + ".pptx", + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".tiff", + ".webp", + ".zip", + ".tar", + ".gz", + ".rar", + ".7z", + ".mp3", + ".mp4", + ".wav", + ".avi", + ".mov", + ".exe", + ".dll", + ".so", + ".bin", +} + +# Text file extensions that can be decoded as UTF-8 +TEXT_FILE_EXTENSIONS = { + ".txt", + ".md", + ".markdown", + ".json", + ".xml", + ".html", + ".htm", + ".css", + ".js", + ".ts", + ".py", + ".java", + ".c", + ".cpp", + ".h", + ".yaml", + ".yml", + ".toml", + ".ini", + ".cfg", + ".conf", + ".sh", + ".bash", + ".zsh", + ".fish", + ".sql", + ".csv", + ".tsv", + ".rst", + ".tex", + ".log", +} + + +def get_current_timestamp() -> datetime: + """Get the current timestamp with timezone for updated_at field.""" + return datetime.now(UTC) + + +def _is_binary_file(file_name: str, mime_type: str) -> bool: + """Check if a file is binary based on extension or mime type.""" + extension = Path(file_name).suffix.lower() + + # Check extension first + if extension in BINARY_FILE_EXTENSIONS: + return True + if extension in TEXT_FILE_EXTENSIONS: + return False + + # Check mime type + if mime_type: + if mime_type.startswith(("image/", "audio/", "video/", "application/pdf")): + return True + if mime_type.startswith(("text/", "application/json", "application/xml")): + return False + # Office documents + if ( + "spreadsheet" in mime_type + or "document" in mime_type + or "presentation" in mime_type + ): + return True + + # Default to text for unknown types + return False + + +class ComposioGoogleDriveConnector(ComposioConnector): + """ + Google Drive specific Composio connector. + + Provides methods for listing files, downloading content, and tracking changes + from Google Drive via Composio. + """ + + async def list_drive_files( + self, + folder_id: str | None = None, + page_token: str | None = None, + page_size: int = 100, + ) -> tuple[list[dict[str, Any]], str | None, str | None]: + """ + List files from Google Drive via Composio. + + Args: + folder_id: Optional folder ID to list contents of. + page_token: Pagination token. + page_size: Number of files per page. + + Returns: + Tuple of (files list, next_page_token, error message). 
+ """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return [], None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_drive_files( + connected_account_id=connected_account_id, + entity_id=entity_id, + folder_id=folder_id, + page_token=page_token, + page_size=page_size, + ) + + async def get_drive_file_content( + self, file_id: str + ) -> tuple[bytes | None, str | None]: + """ + Download file content from Google Drive via Composio. + + Args: + file_id: Google Drive file ID. + + Returns: + Tuple of (file content bytes, error message). + """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_drive_file_content( + connected_account_id=connected_account_id, + entity_id=entity_id, + file_id=file_id, + ) + + async def get_drive_start_page_token(self) -> tuple[str | None, str | None]: + """ + Get the starting page token for Google Drive change tracking. + + Returns: + Tuple of (start_page_token, error message). + """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_drive_start_page_token( + connected_account_id=connected_account_id, + entity_id=entity_id, + ) + + async def list_drive_changes( + self, + page_token: str | None = None, + page_size: int = 100, + include_removed: bool = True, + ) -> tuple[list[dict[str, Any]], str | None, str | None]: + """ + List changes in Google Drive since the given page token. + + Args: + page_token: Page token from previous sync (optional). + page_size: Number of changes per page. + include_removed: Whether to include removed items. + + Returns: + Tuple of (changes list, new_start_page_token, error message). + """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return [], None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.list_drive_changes( + connected_account_id=connected_account_id, + entity_id=entity_id, + page_token=page_token, + page_size=page_size, + include_removed=include_removed, + ) + + +# ============ File Processing Utilities ============ + + +async def _process_file_content( + content: bytes | str, + file_name: str, + file_id: str, + mime_type: str, + search_space_id: int, + user_id: str, + session: AsyncSession, + task_logger: TaskLoggingService, + log_entry: Log, + processing_errors: list[str], +) -> str: + """ + Process file content and return markdown text. + + For binary files (PDFs, images, etc.), uses Surfsense's ETL service. + For text files, decodes as UTF-8. 
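+    If UTF-8 decoding fails, falls back through latin-1, cp1252, and iso-8859-1
+    before returning a metadata-only placeholder.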
+ + Args: + content: File content as bytes or string + file_name: Name of the file + file_id: Google Drive file ID + mime_type: MIME type of the file + search_space_id: Search space ID + user_id: User ID + session: Database session + task_logger: Task logging service + log_entry: Log entry for tracking + processing_errors: List to append errors to + + Returns: + Markdown content string + """ + # Ensure content is bytes + if isinstance(content, str): + content = content.encode("utf-8") + + # Check if this is a binary file + if _is_binary_file(file_name, mime_type): + # Use ETL service for binary files (PDF, Office docs, etc.) + temp_file_path = None + try: + # Get file extension + extension = Path(file_name).suffix or ".bin" + + # Write to temp file + with tempfile.NamedTemporaryFile( + delete=False, suffix=extension + ) as tmp_file: + tmp_file.write(content) + temp_file_path = tmp_file.name + + # Use the configured ETL service to extract text + extracted_text = await _extract_text_with_etl( + temp_file_path, file_name, task_logger, log_entry + ) + + if extracted_text: + return extracted_text + else: + # Fallback if extraction fails + logger.warning(f"Could not extract text from binary file {file_name}") + return f"# {file_name}\n\n[Binary file - text extraction failed]\n\n**File ID:** {file_id}\n**Type:** {mime_type}\n" + + except Exception as e: + error_msg = f"Error processing binary file {file_name}: {e!s}" + logger.error(error_msg) + processing_errors.append(error_msg) + return f"# {file_name}\n\n[Binary file - processing error]\n\n**File ID:** {file_id}\n**Type:** {mime_type}\n" + finally: + # Cleanup temp file + if temp_file_path and os.path.exists(temp_file_path): + try: + os.unlink(temp_file_path) + except Exception as e: + logger.debug(f"Could not delete temp file {temp_file_path}: {e}") + else: + # Text file - try to decode as UTF-8 + try: + return content.decode("utf-8") + except UnicodeDecodeError: + # Try other encodings + for encoding in ["latin-1", "cp1252", "iso-8859-1"]: + try: + return content.decode(encoding) + except UnicodeDecodeError: + continue + + # If all encodings fail, treat as binary + error_msg = f"Could not decode text file {file_name} with any encoding" + logger.warning(error_msg) + processing_errors.append(error_msg) + return f"# {file_name}\n\n[File content could not be decoded]\n\n**File ID:** {file_id}\n**Type:** {mime_type}\n" + + +async def _extract_text_with_etl( + file_path: str, + file_name: str, + task_logger: TaskLoggingService, + log_entry: Log, +) -> str | None: + """ + Extract text from a file using the configured ETL service. 
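+
+    Dispatches on ``config.ETL_SERVICE``: UNSTRUCTURED (langchain loader),
+    LLAMACLOUD (remote parse with retry), or DOCLING (local service).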
+ + Args: + file_path: Path to the file + file_name: Name of the file + task_logger: Task logging service + log_entry: Log entry for tracking + + Returns: + Extracted text as markdown, or None if extraction fails + """ + import warnings + from logging import ERROR, getLogger + + etl_service = config.ETL_SERVICE + + try: + if etl_service == "UNSTRUCTURED": + from langchain_unstructured import UnstructuredLoader + + from app.utils.document_converters import convert_document_to_markdown + + loader = UnstructuredLoader( + file_path, + mode="elements", + post_processors=[], + languages=["eng"], + include_orig_elements=False, + include_metadata=False, + strategy="auto", + ) + + docs = await loader.aload() + if docs: + return await convert_document_to_markdown(docs) + return None + + elif etl_service == "LLAMACLOUD": + from app.tasks.document_processors.file_processors import ( + parse_with_llamacloud_retry, + ) + + # Estimate pages (rough estimate based on file size) + file_size = os.path.getsize(file_path) + estimated_pages = max(1, file_size // (80 * 1024)) + + result = await parse_with_llamacloud_retry( + file_path=file_path, + estimated_pages=estimated_pages, + task_logger=task_logger, + log_entry=log_entry, + ) + + markdown_documents = await result.aget_markdown_documents( + split_by_page=False + ) + if markdown_documents: + return markdown_documents[0].text + return None + + elif etl_service == "DOCLING": + from app.services.docling_service import create_docling_service + + docling_service = create_docling_service() + + # Suppress pdfminer warnings + pdfminer_logger = getLogger("pdfminer") + original_level = pdfminer_logger.level + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", category=UserWarning, module="pdfminer" + ) + warnings.filterwarnings( + "ignore", message=".*Cannot set gray non-stroke color.*" + ) + warnings.filterwarnings("ignore", message=".*invalid float value.*") + + pdfminer_logger.setLevel(ERROR) + + try: + result = await docling_service.process_document( + file_path, file_name + ) + finally: + pdfminer_logger.setLevel(original_level) + + return result.get("content") + else: + logger.warning(f"Unknown ETL service: {etl_service}") + return None + + except Exception as e: + logger.error(f"ETL extraction failed for {file_name}: {e!s}") + return None + + +# ============ Indexer Functions ============ + + +async def check_document_by_unique_identifier( + session: AsyncSession, unique_identifier_hash: str +) -> Document | None: + """Check if a document with the given unique identifier hash already exists.""" + from sqlalchemy.future import select + from sqlalchemy.orm import selectinload + + existing_doc_result = await session.execute( + select(Document) + .options(selectinload(Document.chunks)) + .where(Document.unique_identifier_hash == unique_identifier_hash) + ) + return existing_doc_result.scalars().first() + + +async def update_connector_last_indexed( + session: AsyncSession, + connector, + update_last_indexed: bool = True, +) -> None: + """Update the last_indexed_at timestamp for a connector.""" + if update_last_indexed: + connector.last_indexed_at = datetime.now( + UTC + ) # Use UTC for timezone consistency + logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") + + +async def index_composio_google_drive( + session: AsyncSession, + connector, + connector_id: int, + search_space_id: int, + user_id: str, + task_logger: TaskLoggingService, + log_entry, + update_last_indexed: bool = True, + max_items: int = 1000, +) -> tuple[int, 
str]: + """Index Google Drive files via Composio with delta sync support. + + Delta Sync Flow: + 1. First sync: Full scan + get initial page token + 2. Subsequent syncs: Use LIST_CHANGES to process only changed files + + Supports folder/file selection via connector config: + - selected_folders: List of {id, name} for folders to index + - selected_files: List of {id, name} for individual files to index + - indexing_options: {max_files_per_folder, incremental_sync, include_subfolders} + """ + try: + composio_connector = ComposioGoogleDriveConnector(session, connector_id) + connector_config = await composio_connector.get_config() + + # Get folder/file selection configuration + selected_folders = connector_config.get("selected_folders", []) + selected_files = connector_config.get("selected_files", []) + indexing_options = connector_config.get("indexing_options", {}) + + # Check for stored page token for delta sync + stored_page_token = connector_config.get("drive_page_token") + use_delta_sync = stored_page_token and connector.last_indexed_at + + max_files_per_folder = indexing_options.get("max_files_per_folder", 100) + include_subfolders = indexing_options.get("include_subfolders", True) + + # Route to delta sync or full scan + if use_delta_sync: + logger.info( + f"Using delta sync for Composio Google Drive connector {connector_id}" + ) + await task_logger.log_task_progress( + log_entry, + f"Starting delta sync for Google Drive via Composio (connector {connector_id})", + {"stage": "delta_sync", "token": stored_page_token[:20] + "..."}, + ) + + ( + documents_indexed, + documents_skipped, + processing_errors, + ) = await _index_composio_drive_delta_sync( + session=session, + composio_connector=composio_connector, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + page_token=stored_page_token, + max_items=max_items, + task_logger=task_logger, + log_entry=log_entry, + ) + else: + logger.info( + f"Using full scan for Composio Google Drive connector {connector_id} (first sync or no token)" + ) + await task_logger.log_task_progress( + log_entry, + f"Fetching Google Drive files via Composio for connector {connector_id}", + { + "stage": "full_scan", + "selected_folders": len(selected_folders), + "selected_files": len(selected_files), + }, + ) + + ( + documents_indexed, + documents_skipped, + processing_errors, + ) = await _index_composio_drive_full_scan( + session=session, + composio_connector=composio_connector, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + selected_folders=selected_folders, + selected_files=selected_files, + max_files_per_folder=max_files_per_folder, + include_subfolders=include_subfolders, + max_items=max_items, + task_logger=task_logger, + log_entry=log_entry, + ) + + # Get new page token for next sync (always update after successful sync) + new_token, token_error = await composio_connector.get_drive_start_page_token() + if new_token and not token_error: + # Refresh connector to avoid stale state + await session.refresh(connector) + + if not connector.config: + connector.config = {} + connector.config["drive_page_token"] = new_token + flag_modified(connector, "config") + logger.info(f"Updated drive_page_token for connector {connector_id}") + elif token_error: + logger.warning(f"Failed to get new page token: {token_error}") + + # CRITICAL: Always update timestamp so Electric SQL syncs and UI shows indexed status + await update_connector_last_indexed(session, connector, update_last_indexed) + + # Final commit + 
logger.info( + f"Final commit: Total {documents_indexed} Google Drive files processed" + ) + await session.commit() + logger.info( + "Successfully committed all Composio Google Drive document changes to database" + ) + + # Handle processing errors + error_message = None + if processing_errors: + if len(processing_errors) == 1: + error_message = processing_errors[0] + else: + error_message = f"Failed to process {len(processing_errors)} file(s). First error: {processing_errors[0]}" + await task_logger.log_task_failure( + log_entry, + f"Completed Google Drive indexing with {len(processing_errors)} error(s) for connector {connector_id}", + { + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + "sync_type": "delta" if use_delta_sync else "full", + "errors": processing_errors, + }, + ) + else: + await task_logger.log_task_success( + log_entry, + f"Successfully completed Google Drive indexing via Composio for connector {connector_id}", + { + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + "sync_type": "delta" if use_delta_sync else "full", + }, + ) + + return documents_indexed, error_message + + except Exception as e: + logger.error(f"Failed to index Google Drive via Composio: {e!s}", exc_info=True) + return 0, f"Failed to index Google Drive via Composio: {e!s}" + + +async def _index_composio_drive_delta_sync( + session: AsyncSession, + composio_connector: ComposioGoogleDriveConnector, + connector_id: int, + search_space_id: int, + user_id: str, + page_token: str, + max_items: int, + task_logger: TaskLoggingService, + log_entry, +) -> tuple[int, int, list[str]]: + """Index Google Drive files using delta sync (only changed files). + + Uses GOOGLEDRIVE_LIST_CHANGES to fetch only files that changed since last sync. + Handles: new files, modified files, and deleted files. 
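+
+    Each change entry is expected to carry roughly
+    ``{"fileId": ..., "removed": bool, "file": {"id", "name", "mimeType", "trashed"}}``;
+    removed/trashed entries delete the corresponding document instead of indexing it.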
+ """ + documents_indexed = 0 + documents_skipped = 0 + processing_errors = [] + + # Fetch all changes with pagination + all_changes = [] + current_token = page_token + + while len(all_changes) < max_items: + changes, next_token, error = await composio_connector.list_drive_changes( + page_token=current_token, + page_size=100, + include_removed=True, + ) + + if error: + logger.error(f"Error fetching Drive changes: {error}") + processing_errors.append(f"Failed to fetch changes: {error}") + break + + all_changes.extend(changes) + + if not next_token or next_token == current_token: + break + current_token = next_token + + if not all_changes: + logger.info("No changes detected since last sync") + return 0, 0, [] + + logger.info(f"Processing {len(all_changes)} changes from delta sync") + + for change in all_changes[:max_items]: + try: + # Handle removed files + is_removed = change.get("removed", False) + file_info = change.get("file", {}) + file_id = change.get("fileId") or file_info.get("id", "") + + if not file_id: + documents_skipped += 1 + continue + + # Check if file was trashed or removed + if is_removed or file_info.get("trashed", False): + # Remove document from database + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]) + unique_identifier_hash = generate_unique_identifier_hash( + document_type, f"drive_{file_id}", search_space_id + ) + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + if existing_document: + await session.delete(existing_document) + documents_indexed += 1 + logger.info(f"Deleted document for removed/trashed file: {file_id}") + continue + + # Process changed file + file_name = file_info.get("name", "") or "Untitled" + mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "") + + # Skip folders + if mime_type == "application/vnd.google-apps.folder": + continue + + # Process the file + indexed, skipped, errors = await _process_single_drive_file( + session=session, + composio_connector=composio_connector, + file_id=file_id, + file_name=file_name, + mime_type=mime_type, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + task_logger=task_logger, + log_entry=log_entry, + ) + + documents_indexed += indexed + documents_skipped += skipped + processing_errors.extend(errors) + + # Batch commit every 10 documents + if documents_indexed > 0 and documents_indexed % 10 == 0: + await session.commit() + logger.info(f"Committed batch: {documents_indexed} changes processed") + + except Exception as e: + error_msg = f"Error processing change for file {file_id}: {e!s}" + logger.error(error_msg, exc_info=True) + processing_errors.append(error_msg) + documents_skipped += 1 + + logger.info( + f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped" + ) + return documents_indexed, documents_skipped, processing_errors + + +async def _index_composio_drive_full_scan( + session: AsyncSession, + composio_connector: ComposioGoogleDriveConnector, + connector_id: int, + search_space_id: int, + user_id: str, + selected_folders: list[dict], + selected_files: list[dict], + max_files_per_folder: int, + include_subfolders: bool, + max_items: int, + task_logger: TaskLoggingService, + log_entry, +) -> tuple[int, int, list[str]]: + """Index Google Drive files using full scan (first sync or when no delta token).""" + documents_indexed = 0 + documents_skipped = 0 + processing_errors = [] + + all_files = [] + + # If specific folders/files are selected, fetch from 
those + if selected_folders or selected_files: + # Fetch files from selected folders + for folder in selected_folders: + folder_id = folder.get("id") + folder_name = folder.get("name", "Unknown") + + if not folder_id: + continue + + # Handle special case for "root" folder + actual_folder_id = None if folder_id == "root" else folder_id + + logger.info(f"Fetching files from folder: {folder_name} ({folder_id})") + + # Fetch files from this folder + folder_files = [] + page_token = None + + while len(folder_files) < max_files_per_folder: + ( + files, + next_token, + error, + ) = await composio_connector.list_drive_files( + folder_id=actual_folder_id, + page_token=page_token, + page_size=min(100, max_files_per_folder - len(folder_files)), + ) + + if error: + logger.warning( + f"Failed to fetch files from folder {folder_name}: {error}" + ) + break + + # Process files + for file_info in files: + mime_type = file_info.get("mimeType", "") or file_info.get( + "mime_type", "" + ) + + # If it's a folder and include_subfolders is enabled, recursively fetch + if mime_type == "application/vnd.google-apps.folder": + if include_subfolders: + # Add subfolder files recursively + subfolder_files = await _fetch_folder_files_recursively( + composio_connector, + file_info.get("id"), + max_files=max_files_per_folder, + current_count=len(folder_files), + ) + folder_files.extend(subfolder_files) + else: + folder_files.append(file_info) + + if not next_token: + break + page_token = next_token + + all_files.extend(folder_files[:max_files_per_folder]) + logger.info(f"Found {len(folder_files)} files in folder {folder_name}") + + # Add specifically selected files + for selected_file in selected_files: + file_id = selected_file.get("id") + file_name = selected_file.get("name", "Unknown") + + if not file_id: + continue + + # Add file info (we'll fetch content later during indexing) + all_files.append( + { + "id": file_id, + "name": file_name, + "mimeType": "", # Will be determined later + } + ) + else: + # No selection specified - fetch all files (original behavior) + page_token = None + + while len(all_files) < max_items: + files, next_token, error = await composio_connector.list_drive_files( + page_token=page_token, + page_size=min(100, max_items - len(all_files)), + ) + + if error: + return 0, 0, [f"Failed to fetch Drive files: {error}"] + + all_files.extend(files) + + if not next_token: + break + page_token = next_token + + if not all_files: + logger.info("No Google Drive files found") + return 0, 0, [] + + logger.info( + f"Found {len(all_files)} Google Drive files to index via Composio (full scan)" + ) + + for file_info in all_files: + try: + # Handle both standard Google API and potential Composio variations + file_id = file_info.get("id", "") or file_info.get("fileId", "") + file_name = ( + file_info.get("name", "") or file_info.get("fileName", "") or "Untitled" + ) + mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "") + + if not file_id: + documents_skipped += 1 + continue + + # Skip folders + if mime_type == "application/vnd.google-apps.folder": + continue + + # Process the file + indexed, skipped, errors = await _process_single_drive_file( + session=session, + composio_connector=composio_connector, + file_id=file_id, + file_name=file_name, + mime_type=mime_type, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + task_logger=task_logger, + log_entry=log_entry, + ) + + documents_indexed += indexed + documents_skipped += skipped + 
processing_errors.extend(errors) + + # Batch commit every 10 documents + if documents_indexed > 0 and documents_indexed % 10 == 0: + logger.info( + f"Committing batch: {documents_indexed} Google Drive files processed so far" + ) + await session.commit() + + except Exception as e: + error_msg = f"Error processing Drive file {file_name or 'unknown'}: {e!s}" + logger.error(error_msg, exc_info=True) + processing_errors.append(error_msg) + documents_skipped += 1 + + logger.info( + f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped" + ) + return documents_indexed, documents_skipped, processing_errors + + +async def _process_single_drive_file( + session: AsyncSession, + composio_connector: ComposioGoogleDriveConnector, + file_id: str, + file_name: str, + mime_type: str, + connector_id: int, + search_space_id: int, + user_id: str, + task_logger: TaskLoggingService, + log_entry, +) -> tuple[int, int, list[str]]: + """Process a single Google Drive file for indexing. + + Returns: + Tuple of (documents_indexed, documents_skipped, processing_errors) + """ + processing_errors = [] + + # Generate unique identifier hash + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]) + unique_identifier_hash = generate_unique_identifier_hash( + document_type, f"drive_{file_id}", search_space_id + ) + + # Check if document exists + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + # Get file content + content, content_error = await composio_connector.get_drive_file_content(file_id) + + if content_error or not content: + logger.warning(f"Could not get content for file {file_name}: {content_error}") + # Use metadata as content fallback + markdown_content = f"# {file_name}\n\n" + markdown_content += f"**File ID:** {file_id}\n" + markdown_content += f"**Type:** {mime_type}\n" + elif isinstance(content, dict): + # Safety check: if content is still a dict, log error and use fallback + error_msg = f"Unexpected dict content format for file {file_name}: {list(content.keys())}" + logger.error(error_msg) + processing_errors.append(error_msg) + markdown_content = f"# {file_name}\n\n" + markdown_content += f"**File ID:** {file_id}\n" + markdown_content += f"**Type:** {mime_type}\n" + else: + # Process content based on file type + markdown_content = await _process_file_content( + content=content, + file_name=file_name, + file_id=file_id, + mime_type=mime_type, + search_space_id=search_space_id, + user_id=user_id, + session=session, + task_logger=task_logger, + log_entry=log_entry, + processing_errors=processing_errors, + ) + + content_hash = generate_content_hash(markdown_content, search_space_id) + + if existing_document: + if existing_document.content_hash == content_hash: + return 0, 1, processing_errors # Skipped + + # Update existing document + user_llm = await get_user_long_context_llm(session, user_id, search_space_id) + + if user_llm: + document_metadata = { + "file_id": file_id, + "file_name": file_name, + "mime_type": mime_type, + "document_type": "Google Drive File (Composio)", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = f"Google Drive File: {file_name}\n\nType: {mime_type}" + summary_embedding = config.embedding_model_instance.embed(summary_content) + + chunks = await create_document_chunks(markdown_content) + + existing_document.title = f"Drive: {file_name}" + existing_document.content = 
summary_content + existing_document.content_hash = content_hash + existing_document.embedding = summary_embedding + existing_document.document_metadata = { + "file_id": file_id, + "file_name": file_name, + "FILE_NAME": file_name, # For compatibility + "mime_type": mime_type, + "connector_id": connector_id, + "source": "composio", + } + existing_document.chunks = chunks + existing_document.updated_at = get_current_timestamp() + + return 1, 0, processing_errors # Indexed + + # Create new document + user_llm = await get_user_long_context_llm(session, user_id, search_space_id) + + if user_llm: + document_metadata = { + "file_id": file_id, + "file_name": file_name, + "mime_type": mime_type, + "document_type": "Google Drive File (Composio)", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = f"Google Drive File: {file_name}\n\nType: {mime_type}" + summary_embedding = config.embedding_model_instance.embed(summary_content) + + chunks = await create_document_chunks(markdown_content) + + document = Document( + search_space_id=search_space_id, + title=f"Drive: {file_name}", + document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]), + document_metadata={ + "file_id": file_id, + "file_name": file_name, + "FILE_NAME": file_name, # For compatibility + "mime_type": mime_type, + "connector_id": connector_id, + "toolkit_id": "googledrive", + "source": "composio", + }, + content=summary_content, + content_hash=content_hash, + unique_identifier_hash=unique_identifier_hash, + embedding=summary_embedding, + chunks=chunks, + updated_at=get_current_timestamp(), + ) + session.add(document) + + return 1, 0, processing_errors # Indexed + + +async def _fetch_folder_files_recursively( + composio_connector: ComposioGoogleDriveConnector, + folder_id: str, + max_files: int = 100, + current_count: int = 0, + depth: int = 0, + max_depth: int = 10, +) -> list[dict[str, Any]]: + """ + Recursively fetch files from a Google Drive folder via Composio. 
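+
+    Recursion is bounded both by ``max_depth`` and by ``max_files``, so deeply
+    nested or self-referencing folder structures cannot loop indefinitely.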
+ + Args: + composio_connector: The Composio connector instance + folder_id: Google Drive folder ID + max_files: Maximum number of files to fetch + current_count: Current number of files already fetched + depth: Current recursion depth + max_depth: Maximum recursion depth to prevent infinite loops + + Returns: + List of file info dictionaries + """ + if depth >= max_depth: + logger.warning(f"Max recursion depth reached for folder {folder_id}") + return [] + + if current_count >= max_files: + return [] + + all_files = [] + page_token = None + + try: + while len(all_files) + current_count < max_files: + files, next_token, error = await composio_connector.list_drive_files( + folder_id=folder_id, + page_token=page_token, + page_size=min(100, max_files - len(all_files) - current_count), + ) + + if error: + logger.warning( + f"Error fetching files from subfolder {folder_id}: {error}" + ) + break + + for file_info in files: + mime_type = file_info.get("mimeType", "") or file_info.get( + "mime_type", "" + ) + + if mime_type == "application/vnd.google-apps.folder": + # Recursively fetch from subfolders + subfolder_files = await _fetch_folder_files_recursively( + composio_connector, + file_info.get("id"), + max_files=max_files, + current_count=current_count + len(all_files), + depth=depth + 1, + max_depth=max_depth, + ) + all_files.extend(subfolder_files) + else: + all_files.append(file_info) + + if len(all_files) + current_count >= max_files: + break + + if not next_token: + break + page_token = next_token + + return all_files[: max_files - current_count] + + except Exception as e: + logger.error(f"Error in recursive folder fetch: {e!s}") + return all_files diff --git a/surfsense_backend/app/connectors/google_calendar_connector.py b/surfsense_backend/app/connectors/google_calendar_connector.py index 6d389ddd5..d8160cf25 100644 --- a/surfsense_backend/app/connectors/google_calendar_connector.py +++ b/surfsense_backend/app/connectors/google_calendar_connector.py @@ -142,6 +142,15 @@ class GoogleCalendarConnector: flag_modified(connector, "config") await self._session.commit() except Exception as e: + error_str = str(e) + # Check if this is an invalid_grant error (token expired/revoked) + if ( + "invalid_grant" in error_str.lower() + or "token has been expired or revoked" in error_str.lower() + ): + raise Exception( + "Google Calendar authentication failed. Please re-authenticate." 
+ ) from e raise Exception( f"Failed to refresh Google OAuth credentials: {e!s}" ) from e @@ -165,6 +174,14 @@ class GoogleCalendarConnector: self.service = build("calendar", "v3", credentials=credentials) return self.service except Exception as e: + error_str = str(e) + # If the error already contains a user-friendly re-authentication message, preserve it + if ( + "re-authenticate" in error_str.lower() + or "expired or been revoked" in error_str.lower() + or "authentication failed" in error_str.lower() + ): + raise Exception(error_str) from e raise Exception(f"Failed to create Google Calendar service: {e!s}") from e async def get_calendars(self) -> tuple[list[dict[str, Any]], str | None]: @@ -271,6 +288,14 @@ class GoogleCalendarConnector: return events, None except Exception as e: + error_str = str(e) + # If the error already contains a user-friendly re-authentication message, preserve it + if ( + "re-authenticate" in error_str.lower() + or "expired or been revoked" in error_str.lower() + or "authentication failed" in error_str.lower() + ): + return [], error_str return [], f"Error fetching events: {e!s}" def format_event_to_markdown(self, event: dict[str, Any]) -> str: diff --git a/surfsense_backend/app/connectors/google_gmail_connector.py b/surfsense_backend/app/connectors/google_gmail_connector.py index 10008ad73..7c7262bff 100644 --- a/surfsense_backend/app/connectors/google_gmail_connector.py +++ b/surfsense_backend/app/connectors/google_gmail_connector.py @@ -141,6 +141,15 @@ class GoogleGmailConnector: flag_modified(connector, "config") await self._session.commit() except Exception as e: + error_str = str(e) + # Check if this is an invalid_grant error (token expired/revoked) + if ( + "invalid_grant" in error_str.lower() + or "token has been expired or revoked" in error_str.lower() + ): + raise Exception( + "Gmail authentication failed. Please re-authenticate." 
+ ) from e raise Exception( f"Failed to refresh Google OAuth credentials: {e!s}" ) from e @@ -164,6 +173,14 @@ class GoogleGmailConnector: self.service = build("gmail", "v1", credentials=credentials) return self.service except Exception as e: + error_str = str(e) + # If the error already contains a user-friendly re-authentication message, preserve it + if ( + "re-authenticate" in error_str.lower() + or "expired or been revoked" in error_str.lower() + or "authentication failed" in error_str.lower() + ): + raise Exception(error_str) from e raise Exception(f"Failed to create Gmail service: {e!s}") from e async def get_user_profile(self) -> tuple[dict[str, Any], str | None]: @@ -225,6 +242,14 @@ class GoogleGmailConnector: return messages, None except Exception as e: + error_str = str(e) + # If the error already contains a user-friendly re-authentication message, preserve it + if ( + "re-authenticate" in error_str.lower() + or "expired or been revoked" in error_str.lower() + or "authentication failed" in error_str.lower() + ): + return [], error_str return [], f"Error fetching messages list: {e!s}" async def get_message_details( @@ -271,6 +296,13 @@ class GoogleGmailConnector: try: from datetime import datetime, timedelta + # Normalize date values - handle "undefined" strings from frontend + # This prevents "time data 'undefined' does not match format" errors + if start_date == "undefined" or start_date == "": + start_date = None + if end_date == "undefined" or end_date == "": + end_date = None + # Build date query query_parts = [] diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 7018e613c..e3b077ff0 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -55,7 +55,9 @@ class DocumentType(str, Enum): CIRCLEBACK = "CIRCLEBACK" OBSIDIAN_CONNECTOR = "OBSIDIAN_CONNECTOR" NOTE = "NOTE" - COMPOSIO_CONNECTOR = "COMPOSIO_CONNECTOR" # Generic Composio integration + COMPOSIO_GOOGLE_DRIVE_CONNECTOR = "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" + COMPOSIO_GMAIL_CONNECTOR = "COMPOSIO_GMAIL_CONNECTOR" + COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR" class SearchSourceConnectorType(str, Enum): @@ -86,9 +88,9 @@ class SearchSourceConnectorType(str, Enum): "OBSIDIAN_CONNECTOR" # Self-hosted only - Local Obsidian vault indexing ) MCP_CONNECTOR = "MCP_CONNECTOR" # Model Context Protocol - User-defined API tools - COMPOSIO_CONNECTOR = ( - "COMPOSIO_CONNECTOR" # Generic Composio integration (Google, Slack, etc.) - ) + COMPOSIO_GOOGLE_DRIVE_CONNECTOR = "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" + COMPOSIO_GMAIL_CONNECTOR = "COMPOSIO_GMAIL_CONNECTOR" + COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR" class LiteLLMProvider(str, Enum): @@ -142,6 +144,43 @@ class LogStatus(str, Enum): FAILED = "FAILED" +class IncentiveTaskType(str, Enum): + """ + Enum for incentive task types that users can complete to earn free pages. + Each task can only be completed once per user. + + When adding new tasks: + 1. Add a new enum value here + 2. Add the task configuration to INCENTIVE_TASKS_CONFIG below + 3. 
Create an Alembic migration to add the enum value to PostgreSQL + """ + + GITHUB_STAR = "GITHUB_STAR" + # Future tasks can be added here: + # GITHUB_ISSUE = "GITHUB_ISSUE" + # SOCIAL_SHARE = "SOCIAL_SHARE" + # REFER_FRIEND = "REFER_FRIEND" + + +# Centralized configuration for incentive tasks +# This makes it easy to add new tasks without changing code in multiple places +INCENTIVE_TASKS_CONFIG = { + IncentiveTaskType.GITHUB_STAR: { + "title": "Star our GitHub repository", + "description": "Show your support by starring SurfSense on GitHub", + "pages_reward": 100, + "action_url": "https://github.com/MODSetter/SurfSense", + }, + # Future tasks can be configured here: + # IncentiveTaskType.GITHUB_ISSUE: { + # "title": "Create an issue", + # "description": "Help improve SurfSense by reporting bugs or suggesting features", + # "pages_reward": 50, + # "action_url": "https://github.com/MODSetter/SurfSense/issues/new/choose", + # }, +} + + class Permission(str, Enum): """ Granular permissions for search space resources. @@ -936,6 +975,39 @@ class Notification(BaseModel, TimestampMixin): search_space = relationship("SearchSpace", back_populates="notifications") +class UserIncentiveTask(BaseModel, TimestampMixin): + """ + Tracks completed incentive tasks for users. + Each user can only complete each task type once. + When a task is completed, the user's pages_limit is increased. + """ + + __tablename__ = "user_incentive_tasks" + __table_args__ = ( + UniqueConstraint( + "user_id", + "task_type", + name="uq_user_incentive_task", + ), + ) + + user_id = Column( + UUID(as_uuid=True), + ForeignKey("user.id", ondelete="CASCADE"), + nullable=False, + index=True, + ) + task_type = Column(SQLAlchemyEnum(IncentiveTaskType), nullable=False, index=True) + pages_awarded = Column(Integer, nullable=False) + completed_at = Column( + TIMESTAMP(timezone=True), + nullable=False, + default=lambda: datetime.now(UTC), + ) + + user = relationship("User", back_populates="incentive_tasks") + + class SearchSpaceRole(BaseModel, TimestampMixin): """ Custom roles that can be defined per search space. 
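# A minimal sketch of how the uq_user_incentive_task constraint above backs the
# "once per user" rule, assuming the async session and user come from the app's
# usual dependencies. award_once is a hypothetical helper; UserIncentiveTask,
# IncentiveTaskType, and pages_limit are the names defined in this file.
from sqlalchemy.exc import IntegrityError

async def award_once(session, user, task_type: IncentiveTaskType, pages_reward: int) -> bool:
    # Record the completion and bump the limit in one transaction.
    session.add(
        UserIncentiveTask(
            user_id=user.id,
            task_type=task_type,
            pages_awarded=pages_reward,
        )
    )
    user.pages_limit += pages_reward
    try:
        await session.commit()
        return True
    except IntegrityError:
        # A (user_id, task_type) row already exists, so the task was completed
        # earlier (or concurrently); roll back instead of awarding twice.
        await session.rollback()
        return False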
@@ -1114,6 +1186,13 @@ if config.AUTH_TYPE == "GOOGLE": cascade="all, delete-orphan", ) + # Incentive tasks completed by this user + incentive_tasks = relationship( + "UserIncentiveTask", + back_populates="user", + cascade="all, delete-orphan", + ) + # Page usage tracking for ETL services pages_limit = Column( Integer, @@ -1165,6 +1244,13 @@ else: cascade="all, delete-orphan", ) + # Incentive tasks completed by this user + incentive_tasks = relationship( + "UserIncentiveTask", + back_populates="user", + cascade="all, delete-orphan", + ) + # Page usage tracking for ETL services pages_limit = Column( Integer, diff --git a/surfsense_backend/app/routes/__init__.py b/surfsense_backend/app/routes/__init__.py index 81bd887a5..746c18c6d 100644 --- a/surfsense_backend/app/routes/__init__.py +++ b/surfsense_backend/app/routes/__init__.py @@ -20,6 +20,7 @@ from .google_drive_add_connector_route import ( from .google_gmail_add_connector_route import ( router as google_gmail_add_connector_router, ) +from .incentive_tasks_routes import router as incentive_tasks_router from .jira_add_connector_route import router as jira_add_connector_router from .linear_add_connector_route import router as linear_add_connector_router from .logs_routes import router as logs_router @@ -69,3 +70,4 @@ router.include_router(surfsense_docs_router) # Surfsense documentation for cita router.include_router(notifications_router) # Notifications with Electric SQL sync router.include_router(composio_router) # Composio OAuth and toolkit management router.include_router(public_chat_router) # Public chat sharing and cloning +router.include_router(incentive_tasks_router) # Incentive tasks for earning free pages diff --git a/surfsense_backend/app/routes/composio_routes.py b/surfsense_backend/app/routes/composio_routes.py index eecbaf598..a28361132 100644 --- a/surfsense_backend/app/routes/composio_routes.py +++ b/surfsense_backend/app/routes/composio_routes.py @@ -8,16 +8,18 @@ Endpoints: - GET /composio/toolkits - List available Composio toolkits - GET /auth/composio/connector/add - Initiate OAuth for a specific toolkit - GET /auth/composio/connector/callback - Handle OAuth callback +- GET /connectors/{connector_id}/composio-drive/folders - List folders/files for Composio Google Drive """ import logging from uuid import UUID -from fastapi import APIRouter, Depends, HTTPException, Query +from fastapi import APIRouter, Depends, HTTPException, Query, Request from fastapi.responses import RedirectResponse from pydantic import ValidationError from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select from app.config import config from app.db import ( @@ -29,19 +31,31 @@ from app.db import ( from app.services.composio_service import ( COMPOSIO_TOOLKIT_NAMES, INDEXABLE_TOOLKITS, + TOOLKIT_TO_CONNECTOR_TYPE, ComposioService, ) from app.users import current_active_user from app.utils.connector_naming import ( - check_duplicate_connector, - generate_unique_connector_name, + count_connectors_of_type, + get_base_name_for_type, ) from app.utils.oauth_security import OAuthStateManager +# Note: We no longer use check_duplicate_connector for Composio connectors because +# Composio generates a new connected_account_id each time, even for the same Google account. +# Instead, we check for existing connectors by type/space/user and update them. 
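+# Illustration with made-up IDs: a first Gmail connect stores
+# connected_account_id "ca_111" in connector.config. Reconnecting the same
+# Google account yields "ca_222", so an ID-based duplicate check would wrongly
+# create a second connector; the callback below instead finds the existing
+# connector for (connector_type, search_space_id, user_id), deletes "ca_111"
+# in Composio, and stores "ca_222".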
+ logger = logging.getLogger(__name__) router = APIRouter() +# Map toolkit_id to frontend connector ID +TOOLKIT_TO_FRONTEND_CONNECTOR_ID = { + "googledrive": "composio-googledrive", + "gmail": "composio-gmail", + "googlecalendar": "composio-googlecalendar", +} + # Initialize security utilities _state_manager = None @@ -166,11 +180,8 @@ async def initiate_composio_auth( @router.get("/auth/composio/connector/callback") async def composio_callback( + request: Request, state: str | None = None, - composio_connected_account_id: str | None = Query( - None, alias="connectedAccountId" - ), # Composio sends camelCase - connected_account_id: str | None = None, # Fallback snake_case error: str | None = None, session: AsyncSession = Depends(get_async_session), ): @@ -236,16 +247,17 @@ async def composio_callback( ) # Initialize Composio service - ComposioService() + service = ComposioService() - # Use camelCase param if provided (Composio's format), fallback to snake_case - final_connected_account_id = ( - composio_connected_account_id or connected_account_id - ) + # Extract connected_account_id from query params (accepts both camelCase and snake_case) + query_params = request.query_params + final_connected_account_id = query_params.get( + "connectedAccountId" + ) or query_params.get("connected_account_id") - # DEBUG: Log all query parameters received + # DEBUG: Log query parameter received logger.info( - f"DEBUG: Callback received - connectedAccountId: {composio_connected_account_id}, connected_account_id: {connected_account_id}, using: {final_connected_account_id}" + f"DEBUG: Callback received - connectedAccountId: {query_params.get('connectedAccountId')}, connected_account_id: {query_params.get('connected_account_id')}, using: {final_connected_account_id}" ) # If we still don't have a connected_account_id, warn but continue @@ -268,38 +280,89 @@ async def composio_callback( "is_indexable": toolkit_id in INDEXABLE_TOOLKITS, } - # Check for duplicate connector - # For Composio, we use toolkit_id + connected_account_id as unique identifier - identifier = final_connected_account_id or f"{toolkit_id}_{user_id}" + # Get the specific connector type for this toolkit + connector_type_str = TOOLKIT_TO_CONNECTOR_TYPE.get(toolkit_id) + if not connector_type_str: + raise HTTPException( + status_code=400, + detail=f"Unknown toolkit: {toolkit_id}. 
Available: {list(TOOLKIT_TO_CONNECTOR_TYPE.keys())}", + ) + connector_type = SearchSourceConnectorType(connector_type_str) - is_duplicate = await check_duplicate_connector( - session, - SearchSourceConnectorType.COMPOSIO_CONNECTOR, - space_id, - user_id, - identifier, + # Check for existing connector of the same type for this user/space + # When reconnecting, Composio gives a new connected_account_id, so we need to + # check by connector_type, user_id, and search_space_id instead of connected_account_id + existing_connector_result = await session.execute( + select(SearchSourceConnector).where( + SearchSourceConnector.connector_type == connector_type, + SearchSourceConnector.search_space_id == space_id, + SearchSourceConnector.user_id == user_id, + ) ) - if is_duplicate: - logger.warning( - f"Duplicate Composio connector detected for user {user_id} with toolkit {toolkit_id}" + existing_connector = existing_connector_result.scalars().first() + + if existing_connector: + # Delete the old Composio connected account before updating + old_connected_account_id = existing_connector.config.get( + "composio_connected_account_id" + ) + if ( + old_connected_account_id + and old_connected_account_id != final_connected_account_id + ): + try: + deleted = await service.delete_connected_account( + old_connected_account_id + ) + if deleted: + logger.info( + f"Deleted old Composio connected account {old_connected_account_id} " + f"before updating connector {existing_connector.id}" + ) + else: + logger.warning( + f"Failed to delete old Composio connected account {old_connected_account_id}" + ) + except Exception as delete_error: + # Log but don't fail - the old account may already be deleted + logger.warning( + f"Error deleting old Composio connected account {old_connected_account_id}: {delete_error!s}" + ) + + # Update existing connector with new connected_account_id + logger.info( + f"Updating existing Composio connector {existing_connector.id} with new connected_account_id {final_connected_account_id}" + ) + existing_connector.config = connector_config + await session.commit() + await session.refresh(existing_connector) + + # Get the frontend connector ID based on toolkit_id + frontend_connector_id = TOOLKIT_TO_FRONTEND_CONNECTOR_ID.get( + toolkit_id, "composio-connector" ) return RedirectResponse( - url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&error=duplicate_account&connector=composio-connector" + url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&success=true&connector={frontend_connector_id}&connectorId={existing_connector.id}" ) try: - # Generate a unique, user-friendly connector name - connector_name = await generate_unique_connector_name( - session, - SearchSourceConnectorType.COMPOSIO_CONNECTOR, - space_id, - user_id, - f"{toolkit_name} (Composio)", + # Count existing connectors of this type to determine the number + count = await count_connectors_of_type( + session, connector_type, space_id, user_id ) + # Generate base name (e.g., "Gmail", "Google Drive") + base_name = get_base_name_for_type(connector_type) + + # Format: "Gmail (Composio) 1", "Gmail (Composio) 2", etc. 
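+        # count_connectors_of_type returns how many connectors of this type
+        # the user already has in this space, so the new connector is simply
+        # numbered count + 1 ("Gmail (Composio) 1" when none exist yet).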
+        connector_name = f"{base_name} (Composio) {count + 1}"
+
        db_connector = SearchSourceConnector(
            name=connector_name,
-            connector_type=SearchSourceConnectorType.COMPOSIO_CONNECTOR,
+            connector_type=connector_type,
            config=connector_config,
            search_space_id=space_id,
            user_id=user_id,
@@ -314,8 +377,12 @@ async def composio_callback(
            f"Successfully created Composio connector {db_connector.id} for user {user_id}, toolkit {toolkit_id}"
        )
+        # Get the frontend connector ID based on toolkit_id
+        frontend_connector_id = TOOLKIT_TO_FRONTEND_CONNECTOR_ID.get(
+            toolkit_id, "composio-connector"
+        )
        return RedirectResponse(
-            url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&success=true&connector=composio-connector&connectorId={db_connector.id}"
+            url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&success=true&connector={frontend_connector_id}&connectorId={db_connector.id}"
        )

    except IntegrityError as e:
@@ -339,3 +406,136 @@ async def composio_callback(
        raise HTTPException(
            status_code=500, detail=f"Failed to complete Composio OAuth: {e!s}"
        ) from e
+
+
+@router.get("/connectors/{connector_id}/composio-drive/folders")
+async def list_composio_drive_folders(
+    connector_id: int,
+    parent_id: str | None = None,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    """
+    List folders AND files in user's Google Drive via Composio with hierarchical support.
+
+    This is called at index time from the manage connector page to display
+    the complete file system (folders and files). Only folders are selectable.
+
+    Args:
+        connector_id: ID of the Composio Google Drive connector
+        parent_id: Optional parent folder ID to list contents (None for root)
+
+    Returns:
+        JSON with list of items: {
+            "items": [
+                {"id": str, "name": str, "mimeType": str, "isFolder": bool, ...},
+                ...
+            ]
+        }
+    """
+    if not ComposioService.is_enabled():
+        raise HTTPException(
+            status_code=503,
+            detail="Composio integration is not enabled.",
+        )
+
+    try:
+        # Get connector and verify ownership
+        result = await session.execute(
+            select(SearchSourceConnector).filter(
+                SearchSourceConnector.id == connector_id,
+                SearchSourceConnector.user_id == user.id,
+                SearchSourceConnector.connector_type
+                == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
+            )
+        )
+        connector = result.scalars().first()
+
+        if not connector:
+            raise HTTPException(
+                status_code=404,
+                detail="Composio Google Drive connector not found or access denied",
+            )
+
+        # Get Composio connected account ID from config
+        composio_connected_account_id = connector.config.get(
+            "composio_connected_account_id"
+        )
+        if not composio_connected_account_id:
+            raise HTTPException(
+                status_code=400,
+                detail="Composio connected account not found. 
Please reconnect the connector.", + ) + + # Initialize Composio service and fetch files + service = ComposioService() + entity_id = f"surfsense_{user.id}" + + # Fetch files/folders from Composio Google Drive + files, _next_token, error = await service.get_drive_files( + connected_account_id=composio_connected_account_id, + entity_id=entity_id, + folder_id=parent_id, + page_size=100, + ) + + if error: + logger.error(f"Failed to list Composio Drive files: {error}") + raise HTTPException( + status_code=500, detail=f"Failed to list folder contents: {error}" + ) + + # Transform files to match the expected format with isFolder field + items = [] + for file_info in files: + file_id = file_info.get("id", "") or file_info.get("fileId", "") + file_name = ( + file_info.get("name", "") or file_info.get("fileName", "") or "Untitled" + ) + mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "") + + if not file_id: + continue + + is_folder = mime_type == "application/vnd.google-apps.folder" + + items.append( + { + "id": file_id, + "name": file_name, + "mimeType": mime_type, + "isFolder": is_folder, + "parents": file_info.get("parents", []), + "size": file_info.get("size"), + "iconLink": file_info.get("iconLink"), + } + ) + + # Sort: folders first, then files, both alphabetically + folders = sorted( + [item for item in items if item["isFolder"]], + key=lambda x: x["name"].lower(), + ) + files_list = sorted( + [item for item in items if not item["isFolder"]], + key=lambda x: x["name"].lower(), + ) + items = folders + files_list + + folder_count = len(folders) + file_count = len(files_list) + + logger.info( + f"Listed {len(items)} total items ({folder_count} folders, {file_count} files) for Composio connector {connector_id}" + + (f" in folder {parent_id}" if parent_id else " in ROOT") + ) + + return {"items": items} + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error listing Composio Drive contents: {e!s}", exc_info=True) + raise HTTPException( + status_code=500, detail=f"Failed to list Drive contents: {e!s}" + ) from e diff --git a/surfsense_backend/app/routes/google_drive_add_connector_route.py b/surfsense_backend/app/routes/google_drive_add_connector_route.py index e15aed762..6b4159d29 100644 --- a/surfsense_backend/app/routes/google_drive_add_connector_route.py +++ b/surfsense_backend/app/routes/google_drive_add_connector_route.py @@ -402,7 +402,7 @@ async def list_google_drive_folders( file_count = len(items) - folder_count logger.info( - f"✅ Listed {len(items)} total items ({folder_count} folders, {file_count} files) for connector {connector_id}" + f"Listed {len(items)} total items ({folder_count} folders, {file_count} files) for connector {connector_id}" + (f" in folder {parent_id}" if parent_id else " in ROOT") ) diff --git a/surfsense_backend/app/routes/incentive_tasks_routes.py b/surfsense_backend/app/routes/incentive_tasks_routes.py new file mode 100644 index 000000000..93e54c153 --- /dev/null +++ b/surfsense_backend/app/routes/incentive_tasks_routes.py @@ -0,0 +1,131 @@ +""" +Incentive Tasks API routes. +Allows users to complete tasks (like starring GitHub repo) to earn free pages. +Each task can only be completed once per user. 
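+Completing a task inserts a UserIncentiveTask row and raises the user's
+pages_limit by the task's pages_reward in the same transaction.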
+""" + +from fastapi import APIRouter, Depends, HTTPException, status +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import ( + INCENTIVE_TASKS_CONFIG, + IncentiveTaskType, + User, + UserIncentiveTask, + get_async_session, +) +from app.schemas.incentive_tasks import ( + CompleteTaskResponse, + IncentiveTaskInfo, + IncentiveTasksResponse, + TaskAlreadyCompletedResponse, +) +from app.users import current_active_user + +router = APIRouter(prefix="/incentive-tasks", tags=["incentive-tasks"]) + + +@router.get("", response_model=IncentiveTasksResponse) +async def get_incentive_tasks( + user: User = Depends(current_active_user), + session: AsyncSession = Depends(get_async_session), +) -> IncentiveTasksResponse: + """ + Get all available incentive tasks with the user's completion status. + """ + # Get all completed tasks for this user + result = await session.execute( + select(UserIncentiveTask).where(UserIncentiveTask.user_id == user.id) + ) + completed_tasks = {task.task_type: task for task in result.scalars().all()} + + # Build task list with completion status + tasks = [] + total_pages_earned = 0 + + for task_type, config in INCENTIVE_TASKS_CONFIG.items(): + completed_task = completed_tasks.get(task_type) + is_completed = completed_task is not None + + if is_completed: + total_pages_earned += completed_task.pages_awarded + + tasks.append( + IncentiveTaskInfo( + task_type=task_type, + title=config["title"], + description=config["description"], + pages_reward=config["pages_reward"], + action_url=config["action_url"], + completed=is_completed, + completed_at=completed_task.completed_at if completed_task else None, + ) + ) + + return IncentiveTasksResponse( + tasks=tasks, + total_pages_earned=total_pages_earned, + ) + + +@router.post( + "/{task_type}/complete", + response_model=CompleteTaskResponse | TaskAlreadyCompletedResponse, +) +async def complete_task( + task_type: IncentiveTaskType, + user: User = Depends(current_active_user), + session: AsyncSession = Depends(get_async_session), +) -> CompleteTaskResponse | TaskAlreadyCompletedResponse: + """ + Mark an incentive task as completed and award pages to the user. + + Each task can only be completed once. If the task was already completed, + returns the existing completion information without awarding additional pages. + """ + # Validate task type exists in config + task_config = INCENTIVE_TASKS_CONFIG.get(task_type) + if not task_config: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Unknown task type: {task_type}", + ) + + # Check if task was already completed + existing_task = await session.execute( + select(UserIncentiveTask).where( + UserIncentiveTask.user_id == user.id, + UserIncentiveTask.task_type == task_type, + ) + ) + existing = existing_task.scalar_one_or_none() + + if existing: + return TaskAlreadyCompletedResponse( + success=False, + message="Task already completed", + completed_at=existing.completed_at, + ) + + # Create the task completion record + pages_reward = task_config["pages_reward"] + new_task = UserIncentiveTask( + user_id=user.id, + task_type=task_type, + pages_awarded=pages_reward, + ) + session.add(new_task) + + # Update user's pages_limit + user.pages_limit += pages_reward + + await session.commit() + await session.refresh(user) + + return CompleteTaskResponse( + success=True, + message=f"Task completed! 
You earned {pages_reward} pages.", + pages_awarded=pages_reward, + new_pages_limit=user.pages_limit, + ) diff --git a/surfsense_backend/app/routes/rbac_routes.py b/surfsense_backend/app/routes/rbac_routes.py index 84e95f7ca..5070a2724 100644 --- a/surfsense_backend/app/routes/rbac_routes.py +++ b/surfsense_backend/app/routes/rbac_routes.py @@ -59,6 +59,58 @@ router = APIRouter() # ============ Permissions Endpoints ============ +# Human-readable descriptions for each permission +PERMISSION_DESCRIPTIONS = { + # Documents + "documents:create": "Add new documents, files, and content to the search space", + "documents:read": "View and search documents in the search space", + "documents:update": "Edit existing documents and their metadata", + "documents:delete": "Remove documents from the search space", + # Chats + "chats:create": "Start new AI chat conversations", + "chats:read": "View chat history and conversations", + "chats:update": "Edit chat titles and settings", + "chats:delete": "Delete chat conversations", + # Comments + "comments:create": "Add comments and annotations to documents", + "comments:read": "View comments on documents", + "comments:delete": "Remove comments from documents", + # LLM Configs + "llm_configs:create": "Add new AI model configurations", + "llm_configs:read": "View AI model settings and configurations", + "llm_configs:update": "Modify AI model configurations", + "llm_configs:delete": "Remove AI model configurations", + # Podcasts + "podcasts:create": "Generate new AI podcasts from content", + "podcasts:read": "Listen to and view generated podcasts", + "podcasts:update": "Edit podcast settings and metadata", + "podcasts:delete": "Remove generated podcasts", + # Connectors + "connectors:create": "Set up new data source integrations", + "connectors:read": "View configured data sources and their status", + "connectors:update": "Modify data source configurations", + "connectors:delete": "Remove data source integrations", + # Logs + "logs:read": "View activity logs and audit trail", + "logs:delete": "Clear activity logs", + # Members + "members:invite": "Send invitations to new team members", + "members:view": "View the list of team members", + "members:remove": "Remove members from the search space", + "members:manage_roles": "Assign and change member roles", + # Roles + "roles:create": "Create new custom roles", + "roles:read": "View available roles and their permissions", + "roles:update": "Modify role permissions", + "roles:delete": "Remove custom roles", + # Settings + "settings:view": "View search space settings", + "settings:update": "Modify search space settings", + "settings:delete": "Delete the entire search space", + # Full access + "*": "Full access to all features and settings", +} + @router.get("/permissions", response_model=PermissionsListResponse) async def list_all_permissions( @@ -71,12 +123,14 @@ async def list_all_permissions( for perm in Permission: # Extract category from permission value (e.g., "documents:read" -> "documents") category = perm.value.split(":")[0] if ":" in perm.value else "general" + description = PERMISSION_DESCRIPTIONS.get(perm.value, f"Permission for {perm.value}") permissions.append( PermissionInfo( value=perm.value, name=perm.name, category=category, + description=description, ) ) diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 07d1dffe5..191c6f954 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ 
b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -22,6 +22,8 @@ import logging from datetime import UTC, datetime, timedelta from typing import Any +import pytz +from dateutil.parser import isoparse from fastapi import APIRouter, Body, Depends, HTTPException, Query from pydantic import BaseModel, Field, ValidationError from sqlalchemy.exc import IntegrityError @@ -47,6 +49,7 @@ from app.schemas import ( SearchSourceConnectorRead, SearchSourceConnectorUpdate, ) +from app.services.composio_service import ComposioService from app.services.notification_service import NotificationService from app.tasks.connector_indexers import ( index_airtable_records, @@ -529,6 +532,38 @@ async def delete_search_source_connector( f"Failed to delete periodic schedule for connector {connector_id}" ) + # For Composio connectors, also delete the connected account in Composio + composio_connector_types = [ + SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, + ] + if db_connector.connector_type in composio_connector_types: + composio_connected_account_id = db_connector.config.get( + "composio_connected_account_id" + ) + if composio_connected_account_id and ComposioService.is_enabled(): + try: + service = ComposioService() + deleted = await service.delete_connected_account( + composio_connected_account_id + ) + if deleted: + logger.info( + f"Successfully deleted Composio connected account {composio_connected_account_id} " + f"for connector {connector_id}" + ) + else: + logger.warning( + f"Failed to delete Composio connected account {composio_connected_account_id} " + f"for connector {connector_id}" + ) + except Exception as composio_error: + # Log but don't fail the deletion - Composio account may already be deleted + logger.warning( + f"Error deleting Composio connected account {composio_connected_account_id}: {composio_error!s}" + ) + await session.delete(db_connector) await session.commit() return {"message": "Search source connector deleted successfully"} @@ -611,32 +646,59 @@ async def index_connector_content( # Handle different connector types response_message = "" - today_str = datetime.now().strftime("%Y-%m-%d") + # Use UTC for consistency with last_indexed_at storage + today_str = datetime.now(UTC).strftime("%Y-%m-%d") # Determine the actual date range to use if start_date is None: # Use last_indexed_at or default to 365 days ago if connector.last_indexed_at: - today = datetime.now().date() - if connector.last_indexed_at.date() == today: - # If last indexed today, go back 1 day to ensure we don't miss anything - indexing_from = (today - timedelta(days=1)).strftime("%Y-%m-%d") - else: - indexing_from = connector.last_indexed_at.strftime("%Y-%m-%d") - else: - indexing_from = (datetime.now() - timedelta(days=365)).strftime( - "%Y-%m-%d" + # Convert last_indexed_at to timezone-naive for comparison (like calculate_date_range does) + last_indexed_naive = ( + connector.last_indexed_at.replace(tzinfo=None) + if connector.last_indexed_at.tzinfo + else connector.last_indexed_at ) + # Use UTC for "today" to match how last_indexed_at is stored + today_utc = datetime.now(UTC).replace(tzinfo=None).date() + last_indexed_date = last_indexed_naive.date() + + if last_indexed_date == today_utc: + # If last indexed today, go back 1 day to ensure we don't miss anything + indexing_from = (today_utc - timedelta(days=1)).strftime("%Y-%m-%d") + else: + indexing_from = 
last_indexed_naive.strftime("%Y-%m-%d") + else: + indexing_from = ( + datetime.now(UTC).replace(tzinfo=None) - timedelta(days=365) + ).strftime("%Y-%m-%d") else: indexing_from = start_date # For calendar connectors, default to today but allow future dates if explicitly provided if connector.connector_type in [ SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, SearchSourceConnectorType.LUMA_CONNECTOR, ]: # Default to today if no end_date provided (users can manually select future dates) indexing_to = today_str if end_date is None else end_date + + # If start_date and end_date are the same, adjust end_date to be one day later + # to ensure valid date range (start_date must be strictly before end_date) + if indexing_from == indexing_to: + dt = isoparse(indexing_to) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=pytz.UTC) + else: + dt = dt.astimezone(pytz.UTC) + # Add one day to end_date to make it strictly after start_date + dt_end = dt + timedelta(days=1) + indexing_to = dt_end.strftime("%Y-%m-%d") + logger.info( + f"Adjusted end_date from {end_date} to {indexing_to} " + f"to ensure valid date range (start_date must be strictly before end_date)" + ) else: # For non-calendar connectors, cap at today indexing_to = end_date if end_date else today_str @@ -887,11 +949,66 @@ async def index_connector_content( ) response_message = "Obsidian vault indexing started in the background." - elif connector.connector_type == SearchSourceConnectorType.COMPOSIO_CONNECTOR: + elif ( + connector.connector_type + == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR + ): from app.tasks.celery_tasks.connector_tasks import ( index_composio_connector_task, ) + # For Composio Google Drive, if drive_items is provided, update connector config + # This allows the UI to pass folder/file selection like the regular Google Drive connector + if drive_items and drive_items.has_items(): + # Update connector config with the selected folders/files + config = connector.config or {} + config["selected_folders"] = [ + {"id": f.id, "name": f.name} for f in drive_items.folders + ] + config["selected_files"] = [ + {"id": f.id, "name": f.name} for f in drive_items.files + ] + if drive_items.indexing_options: + config["indexing_options"] = { + "max_files_per_folder": drive_items.indexing_options.max_files_per_folder, + "incremental_sync": drive_items.indexing_options.incremental_sync, + "include_subfolders": drive_items.indexing_options.include_subfolders, + } + connector.config = config + from sqlalchemy.orm.attributes import flag_modified + + flag_modified(connector, "config") + await session.commit() + await session.refresh(connector) + + logger.info( + f"Triggering Composio Google Drive indexing for connector {connector_id} into search space {search_space_id}, " + f"folders: {len(drive_items.folders)}, files: {len(drive_items.files)}" + ) + else: + logger.info( + f"Triggering Composio Google Drive indexing for connector {connector_id} into search space {search_space_id} " + f"using existing config (from {indexing_from} to {indexing_to})" + ) + + index_composio_connector_task.delay( + connector_id, search_space_id, str(user.id), indexing_from, indexing_to + ) + response_message = ( + "Composio Google Drive indexing started in the background." 
+ ) + + elif connector.connector_type in [ + SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, + ]: + from app.tasks.celery_tasks.connector_tasks import ( + index_composio_connector_task, + ) + + # For Composio Gmail and Calendar, use the same date calculation logic as normal connectors + # This ensures consistent behavior and uses last_indexed_at to reduce API calls + # (includes special case: if indexed today, go back 1 day to avoid missing data) logger.info( f"Triggering Composio connector indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}" ) @@ -943,7 +1060,9 @@ async def _update_connector_timestamp_by_id(session: AsyncSession, connector_id: connector = result.scalars().first() if connector: - connector.last_indexed_at = datetime.now() + connector.last_indexed_at = datetime.now( + UTC + ) # Use UTC for timezone consistency await session.commit() logger.info(f"Updated last_indexed_at for connector {connector_id}") except Exception as e: @@ -1083,18 +1202,24 @@ async def _run_indexing_with_notifications( ) await update_timestamp_func(session, connector_id) + await session.commit() # Commit timestamp update logger.info( f"Indexing completed successfully: {documents_processed} documents processed" ) - # Update notification on success + # Update notification on success (or partial success with errors) if notification: + # Refresh notification to ensure it's not stale after timestamp update commit + await session.refresh(notification) await NotificationService.connector_indexing.notify_indexing_completed( session=session, notification=notification, indexed_count=documents_processed, - error_message=None, + error_message=error_or_warning, # Show errors even if some documents were indexed ) + await ( + session.commit() + ) # Commit to ensure Electric SQL syncs the notification update elif documents_processed > 0: # Update notification to storing stage if notification: @@ -1110,24 +1235,73 @@ async def _run_indexing_with_notifications( f"Indexing completed successfully: {documents_processed} documents processed" ) if notification: + # Refresh notification to ensure it's not stale after indexing function commits + await session.refresh(notification) await NotificationService.connector_indexing.notify_indexing_completed( session=session, notification=notification, indexed_count=documents_processed, - error_message=None, + error_message=error_or_warning, # Show errors even if some documents were indexed ) + await ( + session.commit() + ) # Commit to ensure Electric SQL syncs the notification update else: # No new documents processed - check if this is an error or just no changes if error_or_warning: - # Actual failure - logger.error(f"Indexing failed: {error_or_warning}") - if notification: - await NotificationService.connector_indexing.notify_indexing_completed( - session=session, - notification=notification, - indexed_count=0, - error_message=error_or_warning, - ) + # Check if this is a duplicate warning or empty result (success cases) or an actual error + # Handle both normal and Composio calendar connectors + error_or_warning_lower = ( + str(error_or_warning).lower() if error_or_warning else "" + ) + is_duplicate_warning = "skipped (duplicate)" in error_or_warning_lower + # "No X found" messages are success cases - sync worked, just found nothing in date range + is_empty_result = ( + "no " in error_or_warning_lower + and "found" in error_or_warning_lower + ) + + if 
is_duplicate_warning or is_empty_result: + # These are success cases - sync worked, just found nothing new + logger.info(f"Indexing completed successfully: {error_or_warning}") + # Still update timestamp so ElectricSQL syncs and clears "Syncing" UI + if update_timestamp_func: + await update_timestamp_func(session, connector_id) + await session.commit() # Commit timestamp update + if notification: + # Refresh notification to ensure it's not stale after timestamp update commit + await session.refresh(notification) + # For empty results, use a cleaner message + notification_message = ( + "No new items found in date range" + if is_empty_result + else error_or_warning + ) + await NotificationService.connector_indexing.notify_indexing_completed( + session=session, + notification=notification, + indexed_count=0, + error_message=notification_message, # Pass as warning, not error + is_warning=True, # Flag to indicate this is a warning, not an error + ) + await ( + session.commit() + ) # Commit to ensure Electric SQL syncs the notification update + else: + # Actual failure + logger.error(f"Indexing failed: {error_or_warning}") + if notification: + # Refresh notification to ensure it's not stale after indexing function commits + await session.refresh(notification) + await NotificationService.connector_indexing.notify_indexing_completed( + session=session, + notification=notification, + indexed_count=0, + error_message=error_or_warning, + ) + await ( + session.commit() + ) # Commit to ensure Electric SQL syncs the notification update else: # Success - just no new documents to index (all skipped/unchanged) logger.info( @@ -1136,13 +1310,19 @@ async def _run_indexing_with_notifications( # Still update timestamp so ElectricSQL syncs and clears "Syncing" UI if update_timestamp_func: await update_timestamp_func(session, connector_id) + await session.commit() # Commit timestamp update if notification: + # Refresh notification to ensure it's not stale after timestamp update commit + await session.refresh(notification) await NotificationService.connector_indexing.notify_indexing_completed( session=session, notification=notification, indexed_count=0, error_message=None, # No error - sync succeeded ) + await ( + session.commit() + ) # Commit to ensure Electric SQL syncs the notification update except Exception as e: logger.error(f"Error in indexing task: {e!s}", exc_info=True) @@ -2157,6 +2337,59 @@ async def run_obsidian_indexing( ) +async def run_composio_indexing_with_new_session( + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str, + end_date: str, +): + """ + Create a new session and run the Composio indexing task. + This prevents session leaks by creating a dedicated session for the background task. + """ + async with async_session_maker() as session: + await run_composio_indexing( + session, connector_id, search_space_id, user_id, start_date, end_date + ) + + +async def run_composio_indexing( + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str | None, + end_date: str | None, +): + """ + Run Composio connector indexing with real-time notifications. + + This wraps the Composio indexer with the notification system so that + Electric SQL can sync indexing progress to the frontend in real-time. 
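+
+    Background tasks should go through run_composio_indexing_with_new_session
+    above, which creates a dedicated session and prevents session leaks.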
+ + Args: + session: Database session + connector_id: ID of the Composio connector + search_space_id: ID of the search space + user_id: ID of the user + start_date: Start date for indexing + end_date: End date for indexing + """ + from app.tasks.composio_indexer import index_composio_connector + + await _run_indexing_with_notifications( + session=session, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + start_date=start_date, + end_date=end_date, + indexing_function=index_composio_connector, + update_timestamp_func=_update_connector_timestamp_by_id, + ) + + # ============================================================================= # MCP Connector Routes # ============================================================================= diff --git a/surfsense_backend/app/routes/search_spaces_routes.py b/surfsense_backend/app/routes/search_spaces_routes.py index bc52a52b1..147f515b3 100644 --- a/surfsense_backend/app/routes/search_spaces_routes.py +++ b/surfsense_backend/app/routes/search_spaces_routes.py @@ -129,6 +129,7 @@ async def read_search_spaces( result = await session.execute( select(SearchSpace) .filter(SearchSpace.user_id == user.id) + .order_by(SearchSpace.id.asc()) .offset(skip) .limit(limit) ) @@ -138,6 +139,7 @@ async def read_search_spaces( select(SearchSpace) .join(SearchSpaceMembership) .filter(SearchSpaceMembership.user_id == user.id) + .order_by(SearchSpace.id.asc()) .offset(skip) .limit(limit) ) diff --git a/surfsense_backend/app/schemas/incentive_tasks.py b/surfsense_backend/app/schemas/incentive_tasks.py new file mode 100644 index 000000000..52c2a5182 --- /dev/null +++ b/surfsense_backend/app/schemas/incentive_tasks.py @@ -0,0 +1,61 @@ +""" +Schemas for incentive tasks API. +""" + +from datetime import datetime + +from pydantic import BaseModel + +from app.db import INCENTIVE_TASKS_CONFIG, IncentiveTaskType + + +class IncentiveTaskInfo(BaseModel): + """Information about an available incentive task.""" + + task_type: IncentiveTaskType + title: str + description: str + pages_reward: int + action_url: str + completed: bool + completed_at: datetime | None = None + + +class IncentiveTasksResponse(BaseModel): + """Response containing all available incentive tasks with completion status.""" + + tasks: list[IncentiveTaskInfo] + total_pages_earned: int + + +class CompleteTaskRequest(BaseModel): + """Request to mark a task as completed.""" + + task_type: IncentiveTaskType + + +class CompleteTaskResponse(BaseModel): + """Response after completing a task.""" + + success: bool + message: str + pages_awarded: int + new_pages_limit: int + + +class TaskAlreadyCompletedResponse(BaseModel): + """Response when task was already completed.""" + + success: bool + message: str + completed_at: datetime + + +def get_task_info(task_type: IncentiveTaskType) -> dict | None: + """Get task configuration by type.""" + return INCENTIVE_TASKS_CONFIG.get(task_type) + + +def get_all_task_types() -> list[IncentiveTaskType]: + """Get all configured task types.""" + return list(INCENTIVE_TASKS_CONFIG.keys()) diff --git a/surfsense_backend/app/schemas/rbac_schemas.py b/surfsense_backend/app/schemas/rbac_schemas.py index a51f3bc28..031eef3d2 100644 --- a/surfsense_backend/app/schemas/rbac_schemas.py +++ b/surfsense_backend/app/schemas/rbac_schemas.py @@ -167,6 +167,7 @@ class PermissionInfo(BaseModel): value: str name: str category: str + description: str class PermissionsListResponse(BaseModel): diff --git a/surfsense_backend/app/services/composio_service.py 
b/surfsense_backend/app/services/composio_service.py index 6046ea2d8..ad7841a8b 100644 --- a/surfsense_backend/app/services/composio_service.py +++ b/surfsense_backend/app/services/composio_service.py @@ -39,21 +39,73 @@ COMPOSIO_TOOLKIT_NAMES = { # Toolkits that support indexing (Phase 1: Google services only) INDEXABLE_TOOLKITS = {"googledrive", "gmail", "googlecalendar"} +# Mapping of toolkit IDs to connector types +TOOLKIT_TO_CONNECTOR_TYPE = { + "googledrive": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "gmail": "COMPOSIO_GMAIL_CONNECTOR", + "googlecalendar": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", +} + +# Mapping of toolkit IDs to document types +TOOLKIT_TO_DOCUMENT_TYPE = { + "googledrive": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "gmail": "COMPOSIO_GMAIL_CONNECTOR", + "googlecalendar": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", +} + +# Mapping of toolkit IDs to their indexer functions +# Format: toolkit_id -> (module_path, function_name, supports_date_filter) +# supports_date_filter: True if the indexer accepts start_date/end_date params +TOOLKIT_TO_INDEXER = { + "googledrive": ( + "app.connectors.composio_google_drive_connector", + "index_composio_google_drive", + False, # Google Drive doesn't use date filtering + ), + "gmail": ( + "app.connectors.composio_gmail_connector", + "index_composio_gmail", + True, # Gmail uses date filtering + ), + "googlecalendar": ( + "app.connectors.composio_google_calendar_connector", + "index_composio_google_calendar", + True, # Calendar uses date filtering + ), +} + class ComposioService: """Service for interacting with Composio API.""" - def __init__(self, api_key: str | None = None): + # Default download directory for files from Composio + DEFAULT_DOWNLOAD_DIR = "/tmp/composio_downloads" + + def __init__( + self, api_key: str | None = None, file_download_dir: str | None = None + ): """ Initialize the Composio service. Args: api_key: Composio API key. If not provided, uses config.COMPOSIO_API_KEY. + file_download_dir: Directory for downloaded files. Defaults to /tmp/composio_downloads. """ + import os + self.api_key = api_key or config.COMPOSIO_API_KEY if not self.api_key: raise ValueError("COMPOSIO_API_KEY is required but not configured") - self.client = Composio(api_key=self.api_key) + + # Set up download directory + self.file_download_dir = file_download_dir or self.DEFAULT_DOWNLOAD_DIR + os.makedirs(self.file_download_dir, exist_ok=True) + + # Initialize Composio client with download directory + # Per docs: file_download_dir configures where files are downloaded + self.client = Composio( + api_key=self.api_key, file_download_dir=self.file_download_dir + ) @staticmethod def is_enabled() -> bool: @@ -252,7 +304,6 @@ class ComposioService: } ) - logger.info(f"DEBUG: Found {len(result)} TOTAL connections in Composio") return result except Exception as e: logger.error(f"Failed to list all connections: {e!s}") @@ -269,7 +320,6 @@ class ComposioService: List of connected account details. """ try: - logger.info(f"DEBUG: Calling connected_accounts.list(user_id='{user_id}')") accounts_response = self.client.connected_accounts.list(user_id=user_id) # Handle paginated response (may have .items attribute) or direct list @@ -312,6 +362,30 @@ class ComposioService: logger.error(f"Failed to list connections for user {user_id}: {e!s}") return [] + async def delete_connected_account(self, connected_account_id: str) -> bool: + """ + Delete a connected account from Composio. + + This permanently removes the connected account and revokes access tokens. 
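+
+        Failures are logged and surface as a False return value rather than an
+        exception, since the account may already have been removed upstream.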
+ + Args: + connected_account_id: The Composio connected account ID to delete. + + Returns: + True if deletion was successful, False otherwise. + """ + try: + self.client.connected_accounts.delete(connected_account_id) + logger.info( + f"Successfully deleted Composio connected account: {connected_account_id}" + ) + return True + except Exception as e: + logger.error( + f"Failed to delete Composio connected account {connected_account_id}: {e!s}" + ) + return False + async def execute_tool( self, connected_account_id: str, @@ -338,7 +412,6 @@ class ComposioService: # - connected_account_id: for authentication # - user_id: user identifier (SDK uses user_id, not entity_id) # - dangerously_skip_version_check: skip version check for manual execution - logger.info(f"DEBUG: Executing tool {tool_name} with params: {params}") result = self.client.tools.execute( slug=tool_name, connected_account_id=connected_account_id, @@ -346,8 +419,6 @@ class ComposioService: arguments=params or {}, dangerously_skip_version_check=True, ) - logger.info(f"DEBUG: Tool {tool_name} raw result type: {type(result)}") - logger.info(f"DEBUG: Tool {tool_name} raw result: {result}") return {"success": True, "data": result} except Exception as e: logger.error(f"Failed to execute tool {tool_name}: {e!s}") @@ -382,7 +453,15 @@ class ComposioService: "page_size": min(page_size, 100), } if folder_id: - params["folder_id"] = folder_id + # List contents of a specific folder (exclude shortcuts - we don't have access to them) + params["q"] = ( + f"'{folder_id}' in parents and trashed = false and mimeType != 'application/vnd.google-apps.shortcut'" + ) + else: + # List root-level items only (My Drive root), exclude shortcuts + params["q"] = ( + "'root' in parents and trashed = false and mimeType != 'application/vnd.google-apps.shortcut'" + ) if page_token: params["page_token"] = page_token @@ -397,9 +476,6 @@ class ComposioService: return [], None, result.get("error", "Unknown error") data = result.get("data", {}) - logger.info( - f"DEBUG: Drive data type: {type(data)}, keys: {data.keys() if isinstance(data, dict) else 'N/A'}" - ) # Handle nested response structure from Composio files = [] @@ -415,7 +491,6 @@ class ComposioService: elif isinstance(data, list): files = data - logger.info(f"DEBUG: Extracted {len(files)} drive files") return files, next_token, None except Exception as e: @@ -428,6 +503,10 @@ class ComposioService: """ Download file content from Google Drive via Composio. + Per Composio docs: When tools return files, they are automatically downloaded + to a local directory, and the local file path is provided in the response. + Response includes: file_path, file_name, size fields. + Args: connected_account_id: Composio connected account ID. entity_id: The entity/user ID that owns the connected account. @@ -436,27 +515,264 @@ class ComposioService: Returns: Tuple of (file content bytes, error message). 
""" + from pathlib import Path + try: result = await self.execute_tool( connected_account_id=connected_account_id, tool_name="GOOGLEDRIVE_DOWNLOAD_FILE", - params={"file_id": file_id}, # snake_case + params={"file_id": file_id}, entity_id=entity_id, ) if not result.get("success"): return None, result.get("error", "Unknown error") - content = result.get("data") - if isinstance(content, str): - content = content.encode("utf-8") + data = result.get("data") + if not data: + return None, "No data returned from Composio" - return content, None + # Per Composio docs, response includes file_path where file was downloaded + # Response structure: {data: {...}, error: ..., successful: ...} + # The actual file info is nested inside data["data"] + file_path = None + + if isinstance(data, dict): + # Handle nested response structure: data contains {data, error, successful} + # The actual file info is in data["data"] + inner_data = data + if "data" in data and isinstance(data["data"], dict): + inner_data = data["data"] + logger.debug( + f"Found nested data structure. Inner keys: {list(inner_data.keys())}" + ) + elif "successful" in data and "data" in data: + # Standard Composio response wrapper + inner_data = data["data"] if data["data"] else data + + # Try documented fields: file_path, downloaded_file_content, path, uri + file_path = ( + inner_data.get("file_path") + or inner_data.get("downloaded_file_content") + or inner_data.get("path") + or inner_data.get("uri") + ) + + # Handle nested dict case where downloaded_file_content contains the path + if isinstance(file_path, dict): + file_path = ( + file_path.get("file_path") + or file_path.get("downloaded_file_content") + or file_path.get("path") + or file_path.get("uri") + ) + + # If still no path, check if inner_data itself has the nested structure + if not file_path and isinstance(inner_data, dict): + for key in ["downloaded_file_content", "file_path", "path", "uri"]: + if key in inner_data: + val = inner_data[key] + if isinstance(val, str): + file_path = val + break + elif isinstance(val, dict): + # One more level of nesting + file_path = ( + val.get("file_path") + or val.get("downloaded_file_content") + or val.get("path") + or val.get("uri") + ) + if file_path: + break + + logger.debug( + f"Composio response keys: {list(data.keys())}, inner keys: {list(inner_data.keys()) if isinstance(inner_data, dict) else 'N/A'}, extracted path: {file_path}" + ) + elif isinstance(data, str): + # Direct string response (could be path or content) + file_path = data + elif isinstance(data, bytes): + # Direct bytes response + return data, None + + # Read file from the path + if file_path and isinstance(file_path, str): + path_obj = Path(file_path) + + # Check if it's a valid file path (absolute or in .composio directory) + if path_obj.is_absolute() or ".composio" in str(path_obj): + try: + if path_obj.exists(): + content = path_obj.read_bytes() + logger.info( + f"Successfully read {len(content)} bytes from Composio file: {file_path}" + ) + return content, None + else: + logger.warning( + f"File path from Composio does not exist: {file_path}" + ) + return None, f"File not found at path: {file_path}" + except Exception as e: + logger.error( + f"Failed to read file from Composio path {file_path}: {e!s}" + ) + return None, f"Failed to read file: {e!s}" + else: + # Not a file path - might be base64 encoded content + try: + import base64 + + content = base64.b64decode(file_path) + return content, None + except Exception: + # Not base64, return as UTF-8 bytes + return 
file_path.encode("utf-8"), None + + # If we got here, couldn't extract file path + if isinstance(data, dict): + # Log full structure for debugging + inner_data = data.get("data", {}) + logger.warning( + f"Could not extract file path from Composio response. " + f"Top keys: {list(data.keys())}, " + f"Inner data keys: {list(inner_data.keys()) if isinstance(inner_data, dict) else type(inner_data).__name__}, " + f"Full inner data: {inner_data}" + ) + return ( + None, + f"No file path in Composio response. Keys: {list(data.keys())}, inner: {list(inner_data.keys()) if isinstance(inner_data, dict) else 'N/A'}", + ) + + return None, f"Unexpected data type from Composio: {type(data).__name__}" except Exception as e: logger.error(f"Failed to get Drive file content: {e!s}") return None, str(e) + async def get_drive_start_page_token( + self, connected_account_id: str, entity_id: str + ) -> tuple[str | None, str | None]: + """ + Get the starting page token for Google Drive change tracking. + + This token represents the current state and is used for future delta syncs. + Per Composio docs: Use GOOGLEDRIVE_GET_CHANGES_START_PAGE_TOKEN to get initial token. + + Args: + connected_account_id: Composio connected account ID. + entity_id: The entity/user ID that owns the connected account. + + Returns: + Tuple of (start_page_token, error message). + """ + try: + result = await self.execute_tool( + connected_account_id=connected_account_id, + tool_name="GOOGLEDRIVE_GET_CHANGES_START_PAGE_TOKEN", + params={}, + entity_id=entity_id, + ) + + if not result.get("success"): + return None, result.get("error", "Unknown error") + + data = result.get("data", {}) + # Handle nested response: {data: {startPageToken: ...}, successful: ...} + if isinstance(data, dict): + inner_data = data.get("data", data) + token = ( + inner_data.get("startPageToken") + or inner_data.get("start_page_token") + or data.get("startPageToken") + or data.get("start_page_token") + ) + if token: + logger.info(f"Got Drive start page token: {token}") + return token, None + + logger.warning(f"Could not extract start page token from response: {data}") + return None, "No start page token in response" + + except Exception as e: + logger.error(f"Failed to get Drive start page token: {e!s}") + return None, str(e) + + async def list_drive_changes( + self, + connected_account_id: str, + entity_id: str, + page_token: str | None = None, + page_size: int = 100, + include_removed: bool = True, + ) -> tuple[list[dict[str, Any]], str | None, str | None]: + """ + List changes in Google Drive since the given page token. + + Per Composio docs: GOOGLEDRIVE_LIST_CHANGES tracks modifications to files/folders. + If pageToken is not provided, it auto-fetches the current start page token. + Response includes nextPageToken for pagination and newStartPageToken for future syncs. + + Args: + connected_account_id: Composio connected account ID. + entity_id: The entity/user ID that owns the connected account. + page_token: Page token from previous sync (optional - will auto-fetch if not provided). + page_size: Number of changes per page. + include_removed: Whether to include removed items in the response. + + Returns: + Tuple of (changes list, new_start_page_token, error message). 
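+
+        Persist a non-None token for the next delta sync; it may be a
+        nextPageToken when more pages remain in the current fetch.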
+ """ + try: + params = { + "pageSize": min(page_size, 100), + "includeRemoved": include_removed, + } + if page_token: + params["pageToken"] = page_token + + result = await self.execute_tool( + connected_account_id=connected_account_id, + tool_name="GOOGLEDRIVE_LIST_CHANGES", + params=params, + entity_id=entity_id, + ) + + if not result.get("success"): + return [], None, result.get("error", "Unknown error") + + data = result.get("data", {}) + + # Handle nested response structure + changes = [] + new_start_token = None + + if isinstance(data, dict): + inner_data = data.get("data", data) + changes = inner_data.get("changes", []) or data.get("changes", []) + + # Get the token for next sync + # newStartPageToken is returned when all changes have been fetched + # nextPageToken is for pagination within the current fetch + new_start_token = ( + inner_data.get("newStartPageToken") + or inner_data.get("new_start_page_token") + or inner_data.get("nextPageToken") + or inner_data.get("next_page_token") + or data.get("newStartPageToken") + or data.get("nextPageToken") + ) + + logger.info( + f"Got {len(changes)} Drive changes, new token: {new_start_token[:20] if new_start_token else 'None'}..." + ) + return changes, new_start_token, None + + except Exception as e: + logger.error(f"Failed to list Drive changes: {e!s}") + return [], None, str(e) + # ===== Gmail specific methods ===== async def get_gmail_messages( @@ -464,25 +780,30 @@ class ComposioService: connected_account_id: str, entity_id: str, query: str = "", - max_results: int = 100, - ) -> tuple[list[dict[str, Any]], str | None]: + max_results: int = 50, + page_token: str | None = None, + ) -> tuple[list[dict[str, Any]], str | None, int | None, str | None]: """ - List Gmail messages via Composio. + List Gmail messages via Composio with pagination support. Args: connected_account_id: Composio connected account ID. entity_id: The entity/user ID that owns the connected account. query: Gmail search query. - max_results: Maximum number of messages to return. + max_results: Maximum number of messages to return per page (default: 50 to avoid payload size issues). + page_token: Optional pagination token for next page. Returns: - Tuple of (messages list, error message). + Tuple of (messages list, next_page_token, result_size_estimate, error message). 
""" try: - # Composio uses snake_case for parameters, max is 500 - params = {"max_results": min(max_results, 500)} + # Use smaller batch size to avoid 413 payload too large errors + # Composio uses snake_case for parameters + params = {"max_results": min(max_results, 50)} # Reduced from 500 to 50 if query: params["query"] = query # Composio uses 'query' not 'q' + if page_token: + params["page_token"] = page_token result = await self.execute_tool( connected_account_id=connected_account_id, @@ -492,31 +813,42 @@ class ComposioService: ) if not result.get("success"): - return [], result.get("error", "Unknown error") + return [], None, result.get("error", "Unknown error") data = result.get("data", {}) - logger.info( - f"DEBUG: Gmail data type: {type(data)}, keys: {data.keys() if isinstance(data, dict) else 'N/A'}" - ) - logger.info(f"DEBUG: Gmail full data: {data}") # Try different possible response structures messages = [] + next_token = None + result_size_estimate = None if isinstance(data, dict): messages = ( data.get("messages", []) or data.get("data", {}).get("messages", []) or data.get("emails", []) ) + # Check for pagination token in various possible locations + next_token = ( + data.get("nextPageToken") + or data.get("next_page_token") + or data.get("data", {}).get("nextPageToken") + or data.get("data", {}).get("next_page_token") + ) + # Extract resultSizeEstimate if available (Gmail API provides this) + result_size_estimate = ( + data.get("resultSizeEstimate") + or data.get("result_size_estimate") + or data.get("data", {}).get("resultSizeEstimate") + or data.get("data", {}).get("result_size_estimate") + ) elif isinstance(data, list): messages = data - logger.info(f"DEBUG: Extracted {len(messages)} messages") - return messages, None + return messages, next_token, result_size_estimate, None except Exception as e: logger.error(f"Failed to list Gmail messages: {e!s}") - return [], str(e) + return [], None, str(e) async def get_gmail_message_detail( self, connected_account_id: str, entity_id: str, message_id: str @@ -595,10 +927,6 @@ class ComposioService: return [], result.get("error", "Unknown error") data = result.get("data", {}) - logger.info( - f"DEBUG: Calendar data type: {type(data)}, keys: {data.keys() if isinstance(data, dict) else 'N/A'}" - ) - logger.info(f"DEBUG: Calendar full data: {data}") # Try different possible response structures events = [] @@ -611,7 +939,6 @@ class ComposioService: elif isinstance(data, list): events = data - logger.info(f"DEBUG: Extracted {len(events)} calendar events") return events, None except Exception as e: diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py index dc43697e7..4c5599815 100644 --- a/surfsense_backend/app/services/connector_service.py +++ b/surfsense_backend/app/services/connector_service.py @@ -2871,3 +2871,350 @@ class ConnectorService: } return result_object, obsidian_docs + + # ========================================================================= + # Composio Connector Search Methods + # ========================================================================= + + async def search_composio_google_drive( + self, + user_query: str, + search_space_id: int, + top_k: int = 20, + start_date: datetime | None = None, + end_date: datetime | None = None, + ) -> tuple: + """ + Search for Composio Google Drive files and return both the source information + and langchain documents. + + Uses combined chunk-level and document-level hybrid search with RRF fusion. 
+ + Args: + user_query: The user's query + search_space_id: The search space ID to search in + top_k: Maximum number of results to return + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at + + Returns: + tuple: (sources_info, langchain_documents) + """ + composio_drive_docs = await self._combined_rrf_search( + query_text=user_query, + search_space_id=search_space_id, + document_type="COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + top_k=top_k, + start_date=start_date, + end_date=end_date, + ) + + # Early return if no results + if not composio_drive_docs: + return { + "id": 54, + "name": "Google Drive (Composio)", + "type": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "sources": [], + }, [] + + def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return ( + doc_info.get("title") + or metadata.get("title") + or metadata.get("file_name") + or "Untitled Document" + ) + + def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return metadata.get("url") or metadata.get("web_view_link") or "" + + def _description_fn( + chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> str: + description = self._chunk_preview(chunk.get("content", ""), limit=200) + info_parts = [] + mime_type = metadata.get("mime_type") + modified_time = metadata.get("modified_time") + if mime_type: + info_parts.append(f"Type: {mime_type}") + if modified_time: + info_parts.append(f"Modified: {modified_time}") + if info_parts: + description = (description + " | " + " | ".join(info_parts)).strip(" |") + return description + + def _extra_fields_fn( + _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> dict[str, Any]: + return { + "mime_type": metadata.get("mime_type", ""), + "file_id": metadata.get("file_id", ""), + "modified_time": metadata.get("modified_time", ""), + } + + sources_list = self._build_chunk_sources_from_documents( + composio_drive_docs, + title_fn=_title_fn, + url_fn=_url_fn, + description_fn=_description_fn, + extra_fields_fn=_extra_fields_fn, + ) + + # Create result object + result_object = { + "id": 54, + "name": "Google Drive (Composio)", + "type": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "sources": sources_list, + } + + return result_object, composio_drive_docs + + async def search_composio_gmail( + self, + user_query: str, + search_space_id: int, + top_k: int = 20, + start_date: datetime | None = None, + end_date: datetime | None = None, + ) -> tuple: + """ + Search for Composio Gmail messages and return both the source information + and langchain documents. + + Uses combined chunk-level and document-level hybrid search with RRF fusion. 
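+
+        The sources_info element of the returned tuple mirrors the other
+        connector search methods, e.g.
+        {"id": 55, "name": "Gmail (Composio)",
+        "type": "COMPOSIO_GMAIL_CONNECTOR", "sources": [...]}.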
+ + Args: + user_query: The user's query + search_space_id: The search space ID to search in + top_k: Maximum number of results to return + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at + + Returns: + tuple: (sources_info, langchain_documents) + """ + composio_gmail_docs = await self._combined_rrf_search( + query_text=user_query, + search_space_id=search_space_id, + document_type="COMPOSIO_GMAIL_CONNECTOR", + top_k=top_k, + start_date=start_date, + end_date=end_date, + ) + + # Early return if no results + if not composio_gmail_docs: + return { + "id": 55, + "name": "Gmail (Composio)", + "type": "COMPOSIO_GMAIL_CONNECTOR", + "sources": [], + }, [] + + def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return ( + doc_info.get("title") + or metadata.get("subject") + or metadata.get("title") + or "Untitled Email" + ) + + def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return metadata.get("url") or "" + + def _description_fn( + chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> str: + description = self._chunk_preview(chunk.get("content", ""), limit=200) + info_parts = [] + sender = metadata.get("from") or metadata.get("sender") + date = metadata.get("date") or metadata.get("received_at") + if sender: + info_parts.append(f"From: {sender}") + if date: + info_parts.append(f"Date: {date}") + if info_parts: + description = (description + " | " + " | ".join(info_parts)).strip(" |") + return description + + def _extra_fields_fn( + _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> dict[str, Any]: + return { + "message_id": metadata.get("message_id", ""), + "thread_id": metadata.get("thread_id", ""), + "from": metadata.get("from", ""), + "to": metadata.get("to", ""), + "date": metadata.get("date", ""), + } + + sources_list = self._build_chunk_sources_from_documents( + composio_gmail_docs, + title_fn=_title_fn, + url_fn=_url_fn, + description_fn=_description_fn, + extra_fields_fn=_extra_fields_fn, + ) + + # Create result object + result_object = { + "id": 55, + "name": "Gmail (Composio)", + "type": "COMPOSIO_GMAIL_CONNECTOR", + "sources": sources_list, + } + + return result_object, composio_gmail_docs + + async def search_composio_google_calendar( + self, + user_query: str, + search_space_id: int, + top_k: int = 20, + start_date: datetime | None = None, + end_date: datetime | None = None, + ) -> tuple: + """ + Search for Composio Google Calendar events and return both the source information + and langchain documents. + + Uses combined chunk-level and document-level hybrid search with RRF fusion. 
+ + Args: + user_query: The user's query + search_space_id: The search space ID to search in + top_k: Maximum number of results to return + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at + + Returns: + tuple: (sources_info, langchain_documents) + """ + composio_calendar_docs = await self._combined_rrf_search( + query_text=user_query, + search_space_id=search_space_id, + document_type="COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", + top_k=top_k, + start_date=start_date, + end_date=end_date, + ) + + # Early return if no results + if not composio_calendar_docs: + return { + "id": 56, + "name": "Google Calendar (Composio)", + "type": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", + "sources": [], + }, [] + + def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return ( + doc_info.get("title") + or metadata.get("summary") + or metadata.get("title") + or "Untitled Event" + ) + + def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return metadata.get("url") or metadata.get("html_link") or "" + + def _description_fn( + chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> str: + description = self._chunk_preview(chunk.get("content", ""), limit=200) + info_parts = [] + start_time = metadata.get("start_time") or metadata.get("start") + end_time = metadata.get("end_time") or metadata.get("end") + if start_time: + info_parts.append(f"Start: {start_time}") + if end_time: + info_parts.append(f"End: {end_time}") + if info_parts: + description = (description + " | " + " | ".join(info_parts)).strip(" |") + return description + + def _extra_fields_fn( + _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> dict[str, Any]: + return { + "event_id": metadata.get("event_id", ""), + "calendar_id": metadata.get("calendar_id", ""), + "start_time": metadata.get("start_time", ""), + "end_time": metadata.get("end_time", ""), + "location": metadata.get("location", ""), + } + + sources_list = self._build_chunk_sources_from_documents( + composio_calendar_docs, + title_fn=_title_fn, + url_fn=_url_fn, + description_fn=_description_fn, + extra_fields_fn=_extra_fields_fn, + ) + + # Create result object + result_object = { + "id": 56, + "name": "Google Calendar (Composio)", + "type": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", + "sources": sources_list, + } + + return result_object, composio_calendar_docs + + # ========================================================================= + # Utility Methods for Connector Discovery + # ========================================================================= + + async def get_available_connectors( + self, + search_space_id: int, + ) -> list[SearchSourceConnectorType]: + """ + Get all available (enabled) connector types for a search space. + + Args: + search_space_id: The search space ID + + Returns: + List of SearchSourceConnectorType enums for enabled connectors + """ + query = ( + select(SearchSourceConnector.connector_type) + .filter( + SearchSourceConnector.search_space_id == search_space_id, + ) + .distinct() + ) + + result = await self.session.execute(query) + connector_types = result.scalars().all() + return list(connector_types) + + async def get_available_document_types( + self, + search_space_id: int, + ) -> list[str]: + """ + Get all document types that have at least one document in the search space. 
+ + Args: + search_space_id: The search space ID + + Returns: + List of document type strings that have documents indexed + """ + from sqlalchemy import distinct + + from app.db import Document + + query = select(distinct(Document.document_type)).filter( + Document.search_space_id == search_space_id, + ) + + result = await self.session.execute(query) + doc_types = result.scalars().all() + return [str(dt) for dt in doc_types] diff --git a/surfsense_backend/app/services/notification_service.py b/surfsense_backend/app/services/notification_service.py index 836daeb9e..04f39d8ef 100644 --- a/surfsense_backend/app/services/notification_service.py +++ b/surfsense_backend/app/services/notification_service.py @@ -335,6 +335,7 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): notification: Notification, indexed_count: int, error_message: str | None = None, + is_warning: bool = False, ) -> Notification: """ Update notification when connector indexing completes. @@ -343,7 +344,8 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): session: Database session notification: Notification to update indexed_count: Total number of items indexed - error_message: Error message if indexing failed (optional) + error_message: Error message if indexing failed, or warning message (optional) + is_warning: If True, treat error_message as a warning (success case) rather than an error Returns: Updated notification @@ -352,10 +354,26 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): "connector_name", "Connector" ) + # If there's an error message but items were indexed, treat it as a warning (partial success) + # If is_warning is True, treat it as success even with 0 items (e.g., duplicates found) + # Otherwise, treat it as a failure if error_message: - title = f"Failed: {connector_name}" - message = f"Sync failed: {error_message}" - status = "failed" + if indexed_count > 0: + # Partial success with warnings (e.g., duplicate content from other connectors) + title = f"Ready: {connector_name}" + item_text = "item" if indexed_count == 1 else "items" + message = f"Now searchable! {indexed_count} {item_text} synced. Note: {error_message}" + status = "completed" + elif is_warning: + # Warning case (e.g., duplicates found) - treat as success + title = f"Ready: {connector_name}" + message = f"Sync completed. 
{error_message}" + status = "completed" + else: + # Complete failure + title = f"Failed: {connector_name}" + message = f"Sync failed: {error_message}" + status = "failed" else: title = f"Ready: {connector_name}" if indexed_count == 0: @@ -367,7 +385,9 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): metadata_updates = { "indexed_count": indexed_count, - "sync_stage": "completed" if not error_message else "failed", + "sync_stage": "completed" + if (not error_message or is_warning or indexed_count > 0) + else "failed", "error_message": error_message, } diff --git a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py index b90ff753f..d0710d246 100644 --- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py @@ -810,8 +810,8 @@ def index_composio_connector_task( connector_id: int, search_space_id: int, user_id: str, - start_date: str, - end_date: str, + start_date: str | None, + end_date: str | None, ): """Celery task to index Composio connector content (Google Drive, Gmail, Calendar via Composio).""" import asyncio @@ -833,14 +833,16 @@ async def _index_composio_connector( connector_id: int, search_space_id: int, user_id: str, - start_date: str, - end_date: str, + start_date: str | None, + end_date: str | None, ): - """Index Composio connector content with new session.""" - # Import from tasks folder (not connector_indexers) to avoid circular import - from app.tasks.composio_indexer import index_composio_connector + """Index Composio connector content with new session and real-time notifications.""" + # Import from routes to use the notification-wrapped version + from app.routes.search_source_connectors_routes import ( + run_composio_indexing, + ) async with get_celery_session_maker()() as session: - await index_composio_connector( + await run_composio_indexing( session, connector_id, search_space_id, user_id, start_date, end_date ) diff --git a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py index 21855f73f..bf80cbe78 100644 --- a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py +++ b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py @@ -66,6 +66,7 @@ async def _check_and_trigger_schedules(): from app.tasks.celery_tasks.connector_tasks import ( index_airtable_records_task, index_clickup_tasks_task, + index_composio_connector_task, index_confluence_pages_task, index_crawled_urls_task, index_discord_messages_task, @@ -98,6 +99,10 @@ async def _check_and_trigger_schedules(): SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task, SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task, SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR: index_google_drive_files_task, + # Composio connector types + SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: index_composio_connector_task, + SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR: index_composio_connector_task, + SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: index_composio_connector_task, } # Trigger indexing for each due connector diff --git a/surfsense_backend/app/tasks/chat/stream_new_chat.py b/surfsense_backend/app/tasks/chat/stream_new_chat.py index af09c4702..39d85f0c6 100644 --- a/surfsense_backend/app/tasks/chat/stream_new_chat.py +++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py 
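The hunk below replaces the ad-hoc mentioned-documents block with the XML layout used by knowledge_base.format_documents_for_context, so that [citation:CHUNK_ID] references resolve against real chunk IDs. A single mentioned document renders roughly like this (tag names follow the shared formatter and are an approximation; values are illustrative):

    <document>
      <metadata>
        <document_id>42</document_id>
        <document_type>FILE</document_type>
        <title><![CDATA[Quarterly Plan]]></title>
        <url>https://example.com/doc</url>
        <document_metadata>{"source": "upload"}</document_metadata>
      </metadata>
      <content>
        <chunks>
          <chunk id="123"><![CDATA[...chunk text...]]></chunk>
        </chunks>
      </content>
    </document>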
@@ -54,21 +54,68 @@ def format_attachments_as_context(attachments: list[ChatAttachment]) -> str:
 
 
 def format_mentioned_documents_as_context(documents: list[Document]) -> str:
-    """Format mentioned documents as context for the agent."""
+    """
+    Format mentioned documents as context for the agent.
+
+    Uses the same XML structure as knowledge_base.format_documents_for_context
+    to ensure citations work properly with chunk IDs.
+    """
     if not documents:
         return ""
 
     context_parts = ["<MENTIONED_DOCUMENTS>"]
     context_parts.append(
         "The user has explicitly mentioned the following documents from their knowledge base. "
-        "These documents are directly relevant to the query and should be prioritized as primary sources."
+        "These documents are directly relevant to the query and should be prioritized as primary sources. "
+        "Use [citation:CHUNK_ID] format for citations (e.g., [citation:123])."
     )
-    for i, doc in enumerate(documents, 1):
-        context_parts.append(
-            f"<document id={i} type={doc.document_type.value}>"
-        )
-        context_parts.append(f"<content>{doc.content}</content>")
-        context_parts.append("</document>")
+    context_parts.append("<documents>")
+
+    for doc in documents:
+        # Build metadata JSON
+        metadata = doc.document_metadata or {}
+        metadata_json = json.dumps(metadata, ensure_ascii=False)
+
+        # Get URL from metadata
+        url = (
+            metadata.get("url")
+            or metadata.get("source")
+            or metadata.get("page_url")
+            or ""
+        )
+
+        context_parts.append("<document>")
+        context_parts.append("<metadata>")
+        context_parts.append(f"  <document_id>{doc.id}</document_id>")
+        context_parts.append(
+            f"  <document_type>{doc.document_type.value}</document_type>"
+        )
+        context_parts.append(f"  <title><![CDATA[{doc.title}]]></title>")
+        context_parts.append(f"  <url>{url}</url>")
+        context_parts.append(
+            f"  <document_metadata>{metadata_json}</document_metadata>"
+        )
+        context_parts.append("</metadata>")
+        context_parts.append("<content>")
+        context_parts.append("<chunks>")
+
+        # Use chunks if available (preferred for proper citations)
+        if hasattr(doc, "chunks") and doc.chunks:
+            for chunk in doc.chunks:
+                context_parts.append(
+                    f'  <chunk id="{chunk.id}"><![CDATA[{chunk.content}]]></chunk>'
+                )
+        else:
+            # Fallback to document content if chunks not loaded
+            # Use document ID as chunk ID prefix for consistency
+            context_parts.append(
+                f'  <chunk id="doc_{doc.id}"><![CDATA[{doc.content}]]></chunk>'
+            )
+
+        context_parts.append("</chunks>")
+        context_parts.append("</content>")
+        context_parts.append("</document>")
+    context_parts.append("</documents>")
+    context_parts.append("</MENTIONED_DOCUMENTS>")
 
     return "\n".join(context_parts)
 
@@ -81,8 +128,6 @@ def format_mentioned_surfsense_docs_as_context(
     if not documents:
         return ""
 
-    import json
-
     context_parts = ["<MENTIONED_SURFSENSE_DOCS>"]
     context_parts.append(
         "The user has explicitly mentioned the following SurfSense documentation pages. "
@@ -263,11 +308,15 @@ async def stream_new_chat(
     # Build input with message history from frontend
     langchain_messages = []
 
-    # Fetch mentioned documents if any
+    # Fetch mentioned documents if any (with chunks for proper citations)
     mentioned_documents: list[Document] = []
     if mentioned_document_ids:
+        from sqlalchemy.orm import selectinload as doc_selectinload
+
         result = await session.execute(
-            select(Document).filter(
+            select(Document)
+            .options(doc_selectinload(Document.chunks))
+            .filter(
                 Document.id.in_(mentioned_document_ids),
                 Document.search_space_id == search_space_id,
             )
diff --git a/surfsense_backend/app/tasks/composio_indexer.py b/surfsense_backend/app/tasks/composio_indexer.py
index abb238924..f97652114 100644
--- a/surfsense_backend/app/tasks/composio_indexer.py
+++ b/surfsense_backend/app/tasks/composio_indexer.py
@@ -2,83 +2,76 @@
 Composio connector indexer.
 
 Routes indexing requests to toolkit-specific handlers (Google Drive, Gmail, Calendar).
+Uses a registry pattern for clean, extensible connector routing.
 
 Note: This module is intentionally placed in app/tasks/ (not in connector_indexers/)
 to avoid circular import issues with the connector_indexers package.
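+
+Each TOOLKIT_TO_INDEXER entry maps a toolkit ID to a
+(module_path, function_name, supports_date_filter) triple, e.g.
+(module path shown is an assumption for illustration):
+    "gmail" -> ("app.tasks.composio_gmail_indexer", "index_composio_gmail", True)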
""" import logging -from datetime import UTC, datetime +from importlib import import_module from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select -from sqlalchemy.orm import selectinload -from app.config import config -from app.connectors.composio_connector import ComposioConnector from app.db import ( - Document, - DocumentType, SearchSourceConnector, SearchSourceConnectorType, ) -from app.services.composio_service import INDEXABLE_TOOLKITS -from app.services.llm_service import get_user_long_context_llm +from app.services.composio_service import INDEXABLE_TOOLKITS, TOOLKIT_TO_INDEXER from app.services.task_logging_service import TaskLoggingService -from app.utils.document_converters import ( - create_document_chunks, - generate_content_hash, - generate_document_summary, - generate_unique_identifier_hash, -) # Set up logging logger = logging.getLogger(__name__) -# ============ Utility functions (copied from connector_indexers.base to avoid circular imports) ============ +# Valid Composio connector types +COMPOSIO_CONNECTOR_TYPES = { + SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, +} -def get_current_timestamp() -> datetime: - """Get the current timestamp with timezone for updated_at field.""" - return datetime.now(UTC) - - -async def check_document_by_unique_identifier( - session: AsyncSession, unique_identifier_hash: str -) -> Document | None: - """Check if a document with the given unique identifier hash already exists.""" - existing_doc_result = await session.execute( - select(Document) - .options(selectinload(Document.chunks)) - .where(Document.unique_identifier_hash == unique_identifier_hash) - ) - return existing_doc_result.scalars().first() +# ============ Utility functions ============ async def get_connector_by_id( - session: AsyncSession, connector_id: int, connector_type: SearchSourceConnectorType + session: AsyncSession, + connector_id: int, + connector_type: SearchSourceConnectorType | None, ) -> SearchSourceConnector | None: - """Get a connector by ID and type from the database.""" - result = await session.execute( - select(SearchSourceConnector).filter( - SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type == connector_type, - ) + """Get a connector by ID and optionally by type from the database.""" + query = select(SearchSourceConnector).filter( + SearchSourceConnector.id == connector_id ) + if connector_type is not None: + query = query.filter(SearchSourceConnector.connector_type == connector_type) + result = await session.execute(query) return result.scalars().first() -async def update_connector_last_indexed( - session: AsyncSession, - connector: SearchSourceConnector, - update_last_indexed: bool = True, -) -> None: - """Update the last_indexed_at timestamp for a connector.""" - if update_last_indexed: - connector.last_indexed_at = datetime.now() - logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") +def get_indexer_function(toolkit_id: str): + """ + Dynamically import and return the indexer function for a toolkit. 
+ + Args: + toolkit_id: The toolkit ID (e.g., "googledrive", "gmail") + + Returns: + Tuple of (indexer_function, supports_date_filter) + + Raises: + ValueError: If toolkit not found in registry + """ + if toolkit_id not in TOOLKIT_TO_INDEXER: + raise ValueError(f"No indexer registered for toolkit: {toolkit_id}") + + module_path, function_name, supports_date_filter = TOOLKIT_TO_INDEXER[toolkit_id] + module = import_module(module_path) + indexer_func = getattr(module, function_name) + return indexer_func, supports_date_filter # ============ Main indexer function ============ @@ -98,6 +91,7 @@ async def index_composio_connector( Index content from a Composio connector. Routes to toolkit-specific indexing based on the connector's toolkit_id. + Uses a registry pattern for clean, extensible connector routing. Args: session: Database session @@ -129,10 +123,16 @@ async def index_composio_connector( ) try: - # Get connector by id - connector = await get_connector_by_id( - session, connector_id, SearchSourceConnectorType.COMPOSIO_CONNECTOR - ) + # Get connector by id - accept any Composio connector type + connector = await get_connector_by_id(session, connector_id, None) + + # Validate it's a Composio connector + if connector and connector.connector_type not in COMPOSIO_CONNECTOR_TYPES: + error_msg = f"Connector {connector_id} is not a Composio connector" + await task_logger.log_task_failure( + log_entry, error_msg, {"error_type": "InvalidConnectorType"} + ) + return 0, error_msg if not connector: error_msg = f"Composio connector with ID {connector_id} not found" @@ -160,53 +160,35 @@ async def index_composio_connector( ) return 0, error_msg - # Route to toolkit-specific indexer - if toolkit_id == "googledrive": - return await _index_composio_google_drive( - session=session, - connector=connector, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - task_logger=task_logger, - log_entry=log_entry, - update_last_indexed=update_last_indexed, - max_items=max_items, - ) - elif toolkit_id == "gmail": - return await _index_composio_gmail( - session=session, - connector=connector, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - start_date=start_date, - end_date=end_date, - task_logger=task_logger, - log_entry=log_entry, - update_last_indexed=update_last_indexed, - max_items=max_items, - ) - elif toolkit_id == "googlecalendar": - return await _index_composio_google_calendar( - session=session, - connector=connector, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - start_date=start_date, - end_date=end_date, - task_logger=task_logger, - log_entry=log_entry, - update_last_indexed=update_last_indexed, - max_items=max_items, - ) - else: - error_msg = f"No indexer implemented for toolkit: {toolkit_id}" + # Get indexer function from registry + try: + indexer_func, supports_date_filter = get_indexer_function(toolkit_id) + except ValueError as e: await task_logger.log_task_failure( - log_entry, error_msg, {"error_type": "NoIndexerImplemented"} + log_entry, str(e), {"error_type": "NoIndexerImplemented"} ) - return 0, error_msg + return 0, str(e) + + # Build kwargs for the indexer function + kwargs = { + "session": session, + "connector": connector, + "connector_id": connector_id, + "search_space_id": search_space_id, + "user_id": user_id, + "task_logger": task_logger, + "log_entry": log_entry, + "update_last_indexed": update_last_indexed, + "max_items": max_items, + } + + # Add date params for toolkits that 
support them + if supports_date_filter: + kwargs["start_date"] = start_date + kwargs["end_date"] = end_date + + # Call the toolkit-specific indexer + return await indexer_func(**kwargs) except SQLAlchemyError as db_error: await session.rollback() @@ -228,714 +210,3 @@ async def index_composio_connector( ) logger.error(f"Failed to index Composio connector: {e!s}", exc_info=True) return 0, f"Failed to index Composio connector: {e!s}" - - -async def _index_composio_google_drive( - session: AsyncSession, - connector, - connector_id: int, - search_space_id: int, - user_id: str, - task_logger: TaskLoggingService, - log_entry, - update_last_indexed: bool = True, - max_items: int = 1000, -) -> tuple[int, str]: - """Index Google Drive files via Composio.""" - try: - composio_connector = ComposioConnector(session, connector_id) - - await task_logger.log_task_progress( - log_entry, - f"Fetching Google Drive files via Composio for connector {connector_id}", - {"stage": "fetching_files"}, - ) - - # Fetch files - all_files = [] - page_token = None - - while len(all_files) < max_items: - files, next_token, error = await composio_connector.list_drive_files( - page_token=page_token, - page_size=min(100, max_items - len(all_files)), - ) - - if error: - await task_logger.log_task_failure( - log_entry, f"Failed to fetch Drive files: {error}", {} - ) - return 0, f"Failed to fetch Drive files: {error}" - - all_files.extend(files) - - if not next_token: - break - page_token = next_token - - if not all_files: - success_msg = "No Google Drive files found" - await task_logger.log_task_success( - log_entry, success_msg, {"files_count": 0} - ) - return 0, success_msg - - logger.info(f"Found {len(all_files)} Google Drive files to index via Composio") - - documents_indexed = 0 - documents_skipped = 0 - - for file_info in all_files: - try: - # Handle both standard Google API and potential Composio variations - file_id = file_info.get("id", "") or file_info.get("fileId", "") - file_name = ( - file_info.get("name", "") - or file_info.get("fileName", "") - or "Untitled" - ) - mime_type = file_info.get("mimeType", "") or file_info.get( - "mime_type", "" - ) - - if not file_id: - documents_skipped += 1 - continue - - # Skip folders - if mime_type == "application/vnd.google-apps.folder": - continue - - # Generate unique identifier hash - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.COMPOSIO_CONNECTOR, f"drive_{file_id}", search_space_id - ) - - # Check if document exists - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - # Get file content - ( - content, - content_error, - ) = await composio_connector.get_drive_file_content(file_id) - - if content_error or not content: - logger.warning( - f"Could not get content for file {file_name}: {content_error}" - ) - # Use metadata as content fallback - markdown_content = f"# {file_name}\n\n" - markdown_content += f"**File ID:** {file_id}\n" - markdown_content += f"**Type:** {mime_type}\n" - else: - try: - markdown_content = content.decode("utf-8") - except UnicodeDecodeError: - markdown_content = f"# {file_name}\n\n[Binary file content]\n" - - content_hash = generate_content_hash(markdown_content, search_space_id) - - if existing_document: - if existing_document.content_hash == content_hash: - documents_skipped += 1 - continue - - # Update existing document - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "file_id": file_id, 
- "file_name": file_name, - "mime_type": mime_type, - "document_type": "Google Drive File (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Google Drive File: {file_name}\n\nType: {mime_type}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - existing_document.title = f"Drive: {file_name}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "file_id": file_id, - "file_name": file_name, - "mime_type": mime_type, - "connector_id": connector_id, - "source": "composio", - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - continue - - # Create new document - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "file_id": file_id, - "file_name": file_name, - "mime_type": mime_type, - "document_type": "Google Drive File (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Google Drive File: {file_name}\n\nType: {mime_type}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - document = Document( - search_space_id=search_space_id, - title=f"Drive: {file_name}", - document_type=DocumentType.COMPOSIO_CONNECTOR, - document_metadata={ - "file_id": file_id, - "file_name": file_name, - "mime_type": mime_type, - "connector_id": connector_id, - "toolkit_id": "googledrive", - "source": "composio", - }, - content=summary_content, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - ) - session.add(document) - documents_indexed += 1 - - if documents_indexed % 10 == 0: - await session.commit() - - except Exception as e: - logger.error(f"Error processing Drive file: {e!s}", exc_info=True) - documents_skipped += 1 - continue - - if documents_indexed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) - - await session.commit() - - await task_logger.log_task_success( - log_entry, - f"Successfully completed Google Drive indexing via Composio for connector {connector_id}", - { - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, - }, - ) - - return documents_indexed, None - - except Exception as e: - logger.error(f"Failed to index Google Drive via Composio: {e!s}", exc_info=True) - return 0, f"Failed to index Google Drive via Composio: {e!s}" - - -async def _index_composio_gmail( - session: AsyncSession, - connector, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str | None, - end_date: str | None, - task_logger: TaskLoggingService, - log_entry, - update_last_indexed: bool = True, - max_items: int = 1000, -) -> tuple[int, str]: - """Index Gmail messages via Composio.""" - try: - composio_connector = ComposioConnector(session, connector_id) - - await task_logger.log_task_progress( - log_entry, - f"Fetching Gmail messages via Composio for connector {connector_id}", - {"stage": 
"fetching_messages"}, - ) - - # Build query with date range - query_parts = [] - if start_date: - query_parts.append(f"after:{start_date.replace('-', '/')}") - if end_date: - query_parts.append(f"before:{end_date.replace('-', '/')}") - query = " ".join(query_parts) - - messages, error = await composio_connector.list_gmail_messages( - query=query, - max_results=max_items, - ) - - if error: - await task_logger.log_task_failure( - log_entry, f"Failed to fetch Gmail messages: {error}", {} - ) - return 0, f"Failed to fetch Gmail messages: {error}" - - if not messages: - success_msg = "No Gmail messages found in the specified date range" - await task_logger.log_task_success( - log_entry, success_msg, {"messages_count": 0} - ) - return 0, success_msg - - logger.info(f"Found {len(messages)} Gmail messages to index via Composio") - - documents_indexed = 0 - documents_skipped = 0 - - for message in messages: - try: - # Composio uses 'messageId' (camelCase), not 'id' - message_id = message.get("messageId", "") or message.get("id", "") - if not message_id: - documents_skipped += 1 - continue - - # Composio's GMAIL_FETCH_EMAILS already returns full message content - # No need for a separate detail API call - - # Extract message info from Composio response - # Composio structure: messageId, messageText, messageTimestamp, payload.headers, labelIds - payload = message.get("payload", {}) - headers = payload.get("headers", []) - - subject = "No Subject" - sender = "Unknown Sender" - date_str = message.get("messageTimestamp", "Unknown Date") - - for header in headers: - name = header.get("name", "").lower() - value = header.get("value", "") - if name == "subject": - subject = value - elif name == "from": - sender = value - elif name == "date": - date_str = value - - # Format to markdown using the full message data - markdown_content = composio_connector.format_gmail_message_to_markdown( - message - ) - - # Generate unique identifier - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.COMPOSIO_CONNECTOR, - f"gmail_{message_id}", - search_space_id, - ) - - content_hash = generate_content_hash(markdown_content, search_space_id) - - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - # Get label IDs from Composio response - label_ids = message.get("labelIds", []) - - if existing_document: - if existing_document.content_hash == content_hash: - documents_skipped += 1 - continue - - # Update existing - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "message_id": message_id, - "subject": subject, - "sender": sender, - "document_type": "Gmail Message (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - existing_document.title = f"Gmail: {subject}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "message_id": message_id, - "subject": subject, - "sender": sender, - "date": date_str, - "labels": label_ids, - "connector_id": connector_id, - "source": "composio", - } - existing_document.chunks = 
chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - continue - - # Create new document - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "message_id": message_id, - "subject": subject, - "sender": sender, - "document_type": "Gmail Message (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - document = Document( - search_space_id=search_space_id, - title=f"Gmail: {subject}", - document_type=DocumentType.COMPOSIO_CONNECTOR, - document_metadata={ - "message_id": message_id, - "subject": subject, - "sender": sender, - "date": date_str, - "labels": label_ids, - "connector_id": connector_id, - "toolkit_id": "gmail", - "source": "composio", - }, - content=summary_content, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - ) - session.add(document) - documents_indexed += 1 - - if documents_indexed % 10 == 0: - await session.commit() - - except Exception as e: - logger.error(f"Error processing Gmail message: {e!s}", exc_info=True) - documents_skipped += 1 - continue - - if documents_indexed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) - - await session.commit() - - await task_logger.log_task_success( - log_entry, - f"Successfully completed Gmail indexing via Composio for connector {connector_id}", - { - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, - }, - ) - - return documents_indexed, None - - except Exception as e: - logger.error(f"Failed to index Gmail via Composio: {e!s}", exc_info=True) - return 0, f"Failed to index Gmail via Composio: {e!s}" - - -async def _index_composio_google_calendar( - session: AsyncSession, - connector, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str | None, - end_date: str | None, - task_logger: TaskLoggingService, - log_entry, - update_last_indexed: bool = True, - max_items: int = 2500, -) -> tuple[int, str]: - """Index Google Calendar events via Composio.""" - from datetime import datetime, timedelta - - try: - composio_connector = ComposioConnector(session, connector_id) - - await task_logger.log_task_progress( - log_entry, - f"Fetching Google Calendar events via Composio for connector {connector_id}", - {"stage": "fetching_events"}, - ) - - # Build time range - if start_date: - time_min = f"{start_date}T00:00:00Z" - else: - # Default to 365 days ago - default_start = datetime.now() - timedelta(days=365) - time_min = default_start.strftime("%Y-%m-%dT00:00:00Z") - - if end_date: - time_max = f"{end_date}T23:59:59Z" - else: - time_max = datetime.now().strftime("%Y-%m-%dT23:59:59Z") - - events, error = await composio_connector.list_calendar_events( - time_min=time_min, - time_max=time_max, - max_results=max_items, - ) - - if error: - await task_logger.log_task_failure( - log_entry, f"Failed to fetch Calendar events: {error}", {} - ) - return 0, f"Failed to fetch Calendar events: {error}" - - if not events: - success_msg = "No Google Calendar events found in the specified date range" - await 
task_logger.log_task_success( - log_entry, success_msg, {"events_count": 0} - ) - return 0, success_msg - - logger.info(f"Found {len(events)} Google Calendar events to index via Composio") - - documents_indexed = 0 - documents_skipped = 0 - - for event in events: - try: - # Handle both standard Google API and potential Composio variations - event_id = event.get("id", "") or event.get("eventId", "") - summary = ( - event.get("summary", "") or event.get("title", "") or "No Title" - ) - - if not event_id: - documents_skipped += 1 - continue - - # Format to markdown - markdown_content = composio_connector.format_calendar_event_to_markdown( - event - ) - - # Generate unique identifier - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.COMPOSIO_CONNECTOR, - f"calendar_{event_id}", - search_space_id, - ) - - content_hash = generate_content_hash(markdown_content, search_space_id) - - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - # Extract event times - start = event.get("start", {}) - end = event.get("end", {}) - start_time = start.get("dateTime") or start.get("date", "") - end_time = end.get("dateTime") or end.get("date", "") - location = event.get("location", "") - - if existing_document: - if existing_document.content_hash == content_hash: - documents_skipped += 1 - continue - - # Update existing - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "event_id": event_id, - "summary": summary, - "start_time": start_time, - "document_type": "Google Calendar Event (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}" - if location: - summary_content += f"\nLocation: {location}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - existing_document.title = f"Calendar: {summary}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "event_id": event_id, - "summary": summary, - "start_time": start_time, - "end_time": end_time, - "location": location, - "connector_id": connector_id, - "source": "composio", - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - continue - - # Create new document - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "event_id": event_id, - "summary": summary, - "start_time": start_time, - "document_type": "Google Calendar Event (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}" - ) - if location: - summary_content += f"\nLocation: {location}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - document = Document( - search_space_id=search_space_id, - title=f"Calendar: {summary}", - document_type=DocumentType.COMPOSIO_CONNECTOR, - document_metadata={ - "event_id": event_id, - 
"summary": summary, - "start_time": start_time, - "end_time": end_time, - "location": location, - "connector_id": connector_id, - "toolkit_id": "googlecalendar", - "source": "composio", - }, - content=summary_content, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - ) - session.add(document) - documents_indexed += 1 - - if documents_indexed % 10 == 0: - await session.commit() - - except Exception as e: - logger.error(f"Error processing Calendar event: {e!s}", exc_info=True) - documents_skipped += 1 - continue - - if documents_indexed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) - - await session.commit() - - await task_logger.log_task_success( - log_entry, - f"Successfully completed Google Calendar indexing via Composio for connector {connector_id}", - { - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, - }, - ) - - return documents_indexed, None - - except Exception as e: - logger.error( - f"Failed to index Google Calendar via Composio: {e!s}", exc_info=True - ) - return 0, f"Failed to index Google Calendar via Composio: {e!s}" diff --git a/surfsense_backend/app/tasks/connector_indexers/base.py b/surfsense_backend/app/tasks/connector_indexers/base.py index b9a99808e..b390937f0 100644 --- a/surfsense_backend/app/tasks/connector_indexers/base.py +++ b/surfsense_backend/app/tasks/connector_indexers/base.py @@ -112,6 +112,13 @@ def calculate_date_range( Returns: Tuple of (start_date_str, end_date_str) """ + # Normalize "undefined" strings to None (from frontend) + # This prevents parsing errors and ensures consistent behavior across all indexers + if start_date == "undefined" or start_date == "": + start_date = None + if end_date == "undefined" or end_date == "": + end_date = None + if start_date is not None and end_date is not None: return start_date, end_date diff --git a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py index 2793f78db..a1067255d 100644 --- a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py @@ -136,10 +136,9 @@ async def index_bookstack_pages( ) if error: - logger.error(f"Failed to get BookStack pages: {error}") - # Don't treat "No pages found" as an error that should stop indexing if "No pages found" in error: + logger.info(f"No BookStack pages found: {error}") logger.info( "No pages found is not a critical error, continuing with update" ) @@ -159,6 +158,7 @@ async def index_bookstack_pages( ) return 0, None else: + logger.error(f"Failed to get BookStack pages: {error}") await task_logger.log_task_failure( log_entry, f"Failed to get BookStack pages: {error}", diff --git a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py index 7289b0ccd..ddbefafb9 100644 --- a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py @@ -120,10 +120,9 @@ async def index_confluence_pages( ) if error: - logger.error(f"Failed to get Confluence pages: {error}") - # Don't treat "No pages found" as an error that should stop indexing if "No pages found" in error: + logger.info(f"No Confluence pages found: {error}") logger.info( "No pages found is not a 
critical error, continuing with update" ) @@ -147,6 +146,7 @@ async def index_confluence_pages( await confluence_client.close() return 0, None else: + logger.error(f"Failed to get Confluence pages: {error}") await task_logger.log_task_failure( log_entry, f"Failed to get Confluence pages: {error}", diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py index b8c0e564d..2365ff984 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py @@ -4,6 +4,8 @@ Google Calendar connector indexer. from datetime import datetime, timedelta +import pytz +from dateutil.parser import isoparse from google.oauth2.credentials import Credentials from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession @@ -21,6 +23,7 @@ from app.utils.document_converters import ( from .base import ( check_document_by_unique_identifier, + check_duplicate_document_by_hash, get_connector_by_id, get_current_timestamp, logger, @@ -206,6 +209,23 @@ async def index_google_calendar_events( start_date_str = start_date end_date_str = end_date + # If start_date and end_date are the same, adjust end_date to be one day later + # to ensure valid date range (start_date must be strictly before end_date) + if start_date_str == end_date_str: + # Parse the date and add one day to ensure valid range + dt = isoparse(end_date_str) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=pytz.UTC) + else: + dt = dt.astimezone(pytz.UTC) + # Add one day to end_date to make it strictly after start_date + dt_end = dt + timedelta(days=1) + end_date_str = dt_end.strftime("%Y-%m-%d") + logger.info( + f"Adjusted end_date from {end_date} to {end_date_str} " + f"to ensure valid date range (start_date must be strictly before end_date)" + ) + await task_logger.log_task_progress( log_entry, f"Fetching Google Calendar events from {start_date_str} to {end_date_str}", @@ -223,10 +243,9 @@ async def index_google_calendar_events( ) if error: - logger.error(f"Failed to get Google Calendar events: {error}") - # Don't treat "No events found" as an error that should stop indexing if "No events found" in error: + logger.info(f"No Google Calendar events found: {error}") logger.info( "No events found is not a critical error, continuing with update" ) @@ -246,13 +265,25 @@ async def index_google_calendar_events( ) return 0, None else: + logger.error(f"Failed to get Google Calendar events: {error}") + # Check if this is an authentication error that requires re-authentication + error_message = error + error_type = "APIError" + if ( + "re-authenticate" in error.lower() + or "expired or been revoked" in error.lower() + or "authentication failed" in error.lower() + ): + error_message = "Google Calendar authentication failed. Please re-authenticate." 
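+                # Tag this as an auth error so callers can prompt for
+                # re-authentication; the raw provider error is still
+                # passed to the task log below.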
+ error_type = "AuthenticationError" + await task_logger.log_task_failure( log_entry, - f"Failed to get Google Calendar events: {error}", - "API Error", - {"error_type": "APIError"}, + error_message, + error, + {"error_type": error_type}, ) - return 0, f"Failed to get Google Calendar events: {error}" + return 0, error_message logger.info(f"Retrieved {len(events)} events from Google Calendar API") @@ -263,6 +294,9 @@ async def index_google_calendar_events( documents_indexed = 0 documents_skipped = 0 skipped_events = [] + duplicate_content_count = ( + 0 # Track events skipped due to duplicate content_hash + ) for event in events: try: @@ -383,6 +417,27 @@ async def index_google_calendar_events( ) continue + # Document doesn't exist by unique_identifier_hash + # Check if a document with the same content_hash exists (from another connector) + with session.no_autoflush: + duplicate_by_content = await check_duplicate_document_by_hash( + session, content_hash + ) + + if duplicate_by_content: + # A document with the same content already exists (likely from Composio connector) + logger.info( + f"Event {event_summary} already indexed by another connector " + f"(existing document ID: {duplicate_by_content.id}, " + f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content." + ) + duplicate_content_count += 1 + documents_skipped += 1 + skipped_events.append( + f"{event_summary} (already indexed by another connector)" + ) + continue + # Document doesn't exist - create new one # Generate summary with metadata user_llm = await get_user_long_context_llm( @@ -475,7 +530,28 @@ async def index_google_calendar_events( logger.info( f"Final commit: Total {documents_indexed} Google Calendar events processed" ) - await session.commit() + try: + await session.commit() + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same event was indexed by multiple connectors. " + f"Rolling back and continuing. 
Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if duplicates were found + warning_message = None + if duplicate_content_count > 0: + warning_message = f"{duplicate_content_count} skipped (duplicate)" await task_logger.log_task_success( log_entry, @@ -484,14 +560,16 @@ async def index_google_calendar_events( "events_processed": total_processed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "duplicate_content_count": duplicate_content_count, "skipped_events_count": len(skipped_events), }, ) logger.info( - f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped" + f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped " + f"({duplicate_content_count} due to duplicate content from other connectors)" ) - return total_processed, None + return total_processed, warning_message except SQLAlchemyError as db_error: await session.rollback() diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index 48282a1af..f50e149d3 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -578,7 +578,7 @@ async def _check_rename_only_update( - (True, message): Only filename changed, document was updated - (False, None): Content changed or new file, needs full processing """ - from sqlalchemy import select + from sqlalchemy import String, cast, select from sqlalchemy.orm.attributes import flag_modified from app.db import Document @@ -603,7 +603,8 @@ async def _check_rename_only_update( select(Document).where( Document.search_space_id == search_space_id, Document.document_type == DocumentType.GOOGLE_DRIVE_FILE, - Document.document_metadata["google_drive_file_id"].astext == file_id, + cast(Document.document_metadata["google_drive_file_id"], String) + == file_id, ) ) existing_document = result.scalar_one_or_none() @@ -755,7 +756,7 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id: Handles both new (file_id-based) and legacy (filename-based) hash schemes. 
""" - from sqlalchemy import select + from sqlalchemy import String, cast, select from app.db import Document @@ -774,7 +775,8 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id: select(Document).where( Document.search_space_id == search_space_id, Document.document_type == DocumentType.GOOGLE_DRIVE_FILE, - Document.document_metadata["google_drive_file_id"].astext == file_id, + cast(Document.document_metadata["google_drive_file_id"], String) + == file_id, ) ) existing_document = result.scalar_one_or_none() diff --git a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py index e10297057..08d2904d6 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py @@ -170,10 +170,21 @@ async def index_google_gmail_messages( ) if error: + # Check if this is an authentication error that requires re-authentication + error_message = error + error_type = "APIError" + if ( + "re-authenticate" in error.lower() + or "expired or been revoked" in error.lower() + or "authentication failed" in error.lower() + ): + error_message = "Gmail authentication failed. Please re-authenticate." + error_type = "AuthenticationError" + await task_logger.log_task_failure( - log_entry, f"Failed to fetch messages: {error}", {} + log_entry, error_message, error, {"error_type": error_type} ) - return 0, f"Failed to fetch Gmail messages: {error}" + return 0, error_message if not messages: success_msg = "No Google gmail messages found in the specified date range" diff --git a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py index fdbeb93b0..4851a6466 100644 --- a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py @@ -126,10 +126,9 @@ async def index_jira_issues( ) if error: - logger.error(f"Failed to get Jira issues: {error}") - # Don't treat "No issues found" as an error that should stop indexing if "No issues found" in error: + logger.info(f"No Jira issues found: {error}") logger.info( "No issues found is not a critical error, continuing with update" ) @@ -149,6 +148,7 @@ async def index_jira_issues( ) return 0, None else: + logger.error(f"Failed to get Jira issues: {error}") await task_logger.log_task_failure( log_entry, f"Failed to get Jira issues: {error}", diff --git a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py index f1bfd42e8..7d8e0c30e 100644 --- a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py @@ -145,10 +145,9 @@ async def index_linear_issues( ) if error: - logger.error(f"Failed to get Linear issues: {error}") - # Don't treat "No issues found" as an error that should stop indexing if "No issues found" in error: + logger.info(f"No Linear issues found: {error}") logger.info( "No issues found is not a critical error, continuing with update" ) @@ -162,6 +161,7 @@ async def index_linear_issues( ) return 0, None else: + logger.error(f"Failed to get Linear issues: {error}") return 0, f"Failed to get Linear issues: {error}" logger.info(f"Retrieved {len(issues)} issues from Linear API") diff --git a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py 
b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py index 91f81ac20..ead259a44 100644 --- a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py @@ -116,6 +116,13 @@ async def index_luma_events( luma_client = LumaConnector(api_key=api_key) + # Handle 'undefined' string from frontend (treat as None) + # This prevents "time data 'undefined' does not match format" errors + if start_date == "undefined" or start_date == "": + start_date = None + if end_date == "undefined" or end_date == "": + end_date = None + # Calculate date range # For calendar connectors, allow future dates to index upcoming events if start_date is None or end_date is None: @@ -172,10 +179,9 @@ async def index_luma_events( ) if error: - logger.error(f"Failed to get Luma events: {error}") - # Don't treat "No events found" as an error that should stop indexing if "No events found" in error or "no events" in error.lower(): + logger.info(f"No Luma events found: {error}") logger.info( "No events found is not a critical error, continuing with update" ) @@ -195,6 +201,7 @@ async def index_luma_events( ) return 0, None else: + logger.error(f"Failed to get Luma events: {error}") await task_logger.log_task_failure( log_entry, f"Failed to get Luma events: {error}", diff --git a/surfsense_backend/app/utils/connector_naming.py b/surfsense_backend/app/utils/connector_naming.py index a2b748a3a..7d3efc001 100644 --- a/surfsense_backend/app/utils/connector_naming.py +++ b/surfsense_backend/app/utils/connector_naming.py @@ -28,6 +28,9 @@ BASE_NAME_FOR_TYPE = { SearchSourceConnectorType.CONFLUENCE_CONNECTOR: "Confluence", SearchSourceConnectorType.AIRTABLE_CONNECTOR: "Airtable", SearchSourceConnectorType.MCP_CONNECTOR: "Model Context Protocol (MCP)", + SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR: "Gmail", + SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: "Google Drive", + SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: "Google Calendar", } diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml index ffe9e5232..57dbdc7b5 100644 --- a/surfsense_backend/pyproject.toml +++ b/surfsense_backend/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "surf-new-backend" -version = "0.0.11" +version = "0.0.12" description = "SurfSense Backend" requires-python = ">=3.12" dependencies = [ diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock index 18f04288e..16b77a7b2 100644 --- a/surfsense_backend/uv.lock +++ b/surfsense_backend/uv.lock @@ -6545,7 +6545,7 @@ wheels = [ [[package]] name = "surf-new-backend" -version = "0.0.11" +version = "0.0.12" source = { editable = "." 
} dependencies = [ { name = "alembic" }, diff --git a/surfsense_browser_extension/package.json b/surfsense_browser_extension/package.json index b225bc206..bf926d09f 100644 --- a/surfsense_browser_extension/package.json +++ b/surfsense_browser_extension/package.json @@ -1,7 +1,7 @@ { "name": "surfsense_browser_extension", "displayName": "Surfsense Browser Extension", - "version": "0.0.11", + "version": "0.0.12", "description": "Extension to collect Browsing History for SurfSense.", "author": "https://github.com/MODSetter", "engines": { diff --git a/surfsense_web/app/(home)/layout.tsx b/surfsense_web/app/(home)/layout.tsx index 9488ee875..f1ceffac0 100644 --- a/surfsense_web/app/(home)/layout.tsx +++ b/surfsense_web/app/(home)/layout.tsx @@ -1,14 +1,18 @@ "use client"; +import { usePathname } from "next/navigation"; import { FooterNew } from "@/components/homepage/footer-new"; import { Navbar } from "@/components/homepage/navbar"; export default function HomePageLayout({ children }: { children: React.ReactNode }) { + const pathname = usePathname(); + const isAuthPage = pathname === "/login" || pathname === "/register"; + return (
			{children}
-			<FooterNew />
+			{!isAuthPage && <FooterNew />}
); } diff --git a/surfsense_web/app/(home)/login/LocalLoginForm.tsx b/surfsense_web/app/(home)/login/LocalLoginForm.tsx index 62d2a2a66..5b2edae71 100644 --- a/surfsense_web/app/(home)/login/LocalLoginForm.tsx +++ b/surfsense_web/app/(home)/login/LocalLoginForm.tsx @@ -8,6 +8,7 @@ import { useTranslations } from "next-intl"; import { useEffect, useState } from "react"; import { toast } from "sonner"; import { loginMutationAtom } from "@/atoms/auth/auth-mutation.atoms"; +import { Spinner } from "@/components/ui/spinner"; import { getAuthErrorDetails, isNetworkError, shouldRetry } from "@/lib/auth-errors"; import { AUTH_TYPE } from "@/lib/env-config"; import { ValidationError } from "@/lib/error"; @@ -42,9 +43,6 @@ export function LocalLoginForm() { // Track login attempt trackLoginAttempt("local"); - // Show loading toast - const loadingToast = toast.loading(tCommon("loading")); - try { const data = await login({ username, @@ -62,8 +60,7 @@ export function LocalLoginForm() { // Success toast toast.success(t("login_success"), { - id: loadingToast, - description: "Redirecting to dashboard...", + description: "Redirecting to dashboard", duration: 2000, }); @@ -76,7 +73,6 @@ export function LocalLoginForm() { trackLoginFailure("local", err.message); setError({ title: err.name, message: err.message }); toast.error(err.name, { - id: loadingToast, description: err.message, duration: 6000, }); @@ -106,7 +102,6 @@ export function LocalLoginForm() { // Show error toast with conditional retry action const toastOptions: any = { - id: loadingToast, description: errorDetails.description, duration: 6000, }; @@ -244,9 +239,16 @@ export function LocalLoginForm() { diff --git a/surfsense_web/app/(home)/login/page.tsx b/surfsense_web/app/(home)/login/page.tsx index a2dadd70c..cd40351fa 100644 --- a/surfsense_web/app/(home)/login/page.tsx +++ b/surfsense_web/app/(home)/login/page.tsx @@ -1,12 +1,12 @@ "use client"; -import { Loader2 } from "lucide-react"; import { AnimatePresence, motion } from "motion/react"; import { useSearchParams } from "next/navigation"; import { useTranslations } from "next-intl"; import { Suspense, useEffect, useState } from "react"; import { toast } from "sonner"; import { Logo } from "@/components/Logo"; +import { useGlobalLoadingEffect } from "@/hooks/use-global-loading"; import { getAuthErrorDetails, shouldRetry } from "@/lib/auth-errors"; import { AUTH_TYPE } from "@/lib/env-config"; import { AmbientBackground } from "./AmbientBackground"; @@ -66,7 +66,11 @@ function LoginContent() { }); // Show toast with conditional retry action - const toastOptions: any = { + const toastOptions: { + description: string; + duration: number; + action?: { label: string; onClick: () => void }; + } = { description: errorDescription, duration: 6000, }; @@ -95,20 +99,12 @@ function LoginContent() { setIsLoading(false); }, [searchParams, t, tCommon]); - // Show loading state while determining auth type + // Use global loading screen for auth type determination - spinner animation won't reset + useGlobalLoadingEffect(isLoading, tCommon("loading"), "login"); + + // Show nothing while loading - the GlobalLoadingProvider handles the loading UI if (isLoading) { - return ( -
-		<div …>
-			<div …>
-				<Loader2 … />
-				<span …>{tCommon("loading")}</span>
-			</div>
-		</div>
-		);
+		return null;
 	}
 
 	if (authType === "GOOGLE") {
@@ -189,23 +185,10 @@
 	);
 }
 
-// Loading fallback for Suspense
-const LoadingFallback = () => (
-	<div …>
-		<div …>
-			<Loader2 … />
-			<span …>Loading...</span>
-		</div>
-	</div>
-); - export default function LoginPage() { + // Suspense fallback returns null - the GlobalLoadingProvider handles the loading UI return ( - }> + ); diff --git a/surfsense_web/app/(home)/register/page.tsx b/surfsense_web/app/(home)/register/page.tsx index 243ad4c60..60c3ba1be 100644 --- a/surfsense_web/app/(home)/register/page.tsx +++ b/surfsense_web/app/(home)/register/page.tsx @@ -9,6 +9,7 @@ import { useEffect, useState } from "react"; import { toast } from "sonner"; import { registerMutationAtom } from "@/atoms/auth/auth-mutation.atoms"; import { Logo } from "@/components/Logo"; +import { Spinner } from "@/components/ui/spinner"; import { getAuthErrorDetails, isNetworkError, shouldRetry } from "@/lib/auth-errors"; import { AUTH_TYPE } from "@/lib/env-config"; import { AppError, ValidationError } from "@/lib/error"; @@ -60,9 +61,6 @@ export default function RegisterPage() { // Track registration attempt trackRegistrationAttempt(); - // Show loading toast - const loadingToast = toast.loading(t("creating_account")); - try { await register({ email, @@ -77,7 +75,6 @@ export default function RegisterPage() { // Success toast toast.success(t("register_success"), { - id: loadingToast, description: t("redirecting_login"), duration: 2000, }); @@ -95,7 +92,6 @@ export default function RegisterPage() { trackRegistrationFailure("Registration disabled"); setError({ title: "Registration is disabled", message: friendlyMessage }); toast.error("Registration is disabled", { - id: loadingToast, description: friendlyMessage, duration: 6000, }); @@ -109,7 +105,6 @@ export default function RegisterPage() { trackRegistrationFailure(err.message); setError({ title: err.name, message: err.message }); toast.error(err.name, { - id: loadingToast, description: err.message, duration: 6000, }); @@ -137,7 +132,6 @@ export default function RegisterPage() { // Show error toast with conditional retry action const toastOptions: any = { - id: loadingToast, description: errorDetails.description, duration: 6000, }; @@ -295,9 +289,16 @@ export default function RegisterPage() { diff --git a/surfsense_web/app/auth/callback/loading.tsx b/surfsense_web/app/auth/callback/loading.tsx new file mode 100644 index 000000000..0c94e1ee0 --- /dev/null +++ b/surfsense_web/app/auth/callback/loading.tsx @@ -0,0 +1,14 @@ +"use client"; + +import { useTranslations } from "next-intl"; +import { useGlobalLoadingEffect } from "@/hooks/use-global-loading"; + +export default function AuthCallbackLoading() { + const t = useTranslations("auth"); + + // Use global loading - spinner animation won't reset when page transitions + useGlobalLoadingEffect(true, t("processing_authentication"), "default"); + + // Return null - the GlobalLoadingProvider handles the loading UI + return null; +} diff --git a/surfsense_web/app/auth/callback/page.tsx b/surfsense_web/app/auth/callback/page.tsx index da868c316..4050eefb6 100644 --- a/surfsense_web/app/auth/callback/page.tsx +++ b/surfsense_web/app/auth/callback/page.tsx @@ -1,23 +1,18 @@ +"use client"; + import { Suspense } from "react"; import TokenHandler from "@/components/TokenHandler"; export default function AuthCallbackPage() { + // Suspense fallback returns null - the GlobalLoadingProvider handles the loading UI + // TokenHandler uses useGlobalLoadingEffect to show the loading screen return ( -
-		<Suspense
-			fallback={
-				<div …>
-					<div …>Authentication Callback</div>
-				</div>
-			}
-		>
-			<TokenHandler />
-		</Suspense>
+		<Suspense fallback={null}>
+			<TokenHandler />
+		</Suspense>
 	);
 }
diff --git a/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx b/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx
index bbafa9703..e6730d8d1 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx
@@ -1,7 +1,6 @@
 "use client";
 
 import { useAtomValue, useSetAtom } from "jotai";
-import { Loader2 } from "lucide-react";
 import { useParams, usePathname, useRouter } from "next/navigation";
 import { useTranslations } from "next-intl";
 import type React from "react";
@@ -19,6 +18,7 @@
 import { DashboardBreadcrumb } from "@/components/dashboard-breadcrumb";
 import { LayoutDataProvider } from "@/components/layout";
 import { OnboardingTour } from "@/components/onboarding-tour";
 import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card";
+import { useGlobalLoadingEffect } from "@/hooks/use-global-loading";
 
 export function DashboardClientLayout({
 	children,
@@ -146,31 +146,22 @@
 		setActiveSearchSpaceIdState(activeSeacrhSpaceId);
 	}, [search_space_id, setActiveSearchSpaceIdState]);
 
-	if (
+	// Determine if we should show loading
+	const shouldShowLoading =
 		(!hasCheckedOnboarding && (loading || accessLoading || globalConfigsLoading) && !isOnboardingPage) ||
-		isAutoConfiguring
-	) {
-		return (
-			<Card …>
-				<CardHeader …>
-					<CardTitle …>
-						{isAutoConfiguring ? "Setting up AI..." : t("loading_config")}
-					</CardTitle>
-					<CardDescription …>
-						{isAutoConfiguring
-							? "Auto-configuring with available settings"
-							: t("checking_llm_prefs")}
-					</CardDescription>
-				</CardHeader>
-				<CardContent …>
-					<Loader2 … />
-				</CardContent>
-			</Card>
- ); + isAutoConfiguring; + + // Use global loading screen - spinner animation won't reset + useGlobalLoadingEffect( + shouldShowLoading, + isAutoConfiguring ? t("setting_up_ai") : t("checking_llm_prefs"), + "default" + ); + + if (shouldShowLoading) { + return null; } if (error && !hasCheckedOnboarding && !isOnboardingPage) { diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx index 38d61a6ce..d9908f46c 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx @@ -1,6 +1,6 @@ "use client"; -import { ChevronDown, ChevronUp, FileX, Loader2, Plus } from "lucide-react"; +import { ChevronDown, ChevronUp, FileX, Plus } from "lucide-react"; import { motion } from "motion/react"; import { useParams } from "next/navigation"; import { useTranslations } from "next-intl"; @@ -9,6 +9,7 @@ import { useDocumentUploadDialog } from "@/components/assistant-ui/document-uplo import { DocumentViewer } from "@/components/document-viewer"; import { Button } from "@/components/ui/button"; import { Checkbox } from "@/components/ui/checkbox"; +import { Spinner } from "@/components/ui/spinner"; import { Table, TableBody, @@ -114,7 +115,7 @@ export function DocumentsTableShell({ {loading ? (
-							<Loader2 … />
+							<Spinner … />
 							<p …>{t("loading")}</p>
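Across this diff, page-local `Loader2` icons are replaced with a shared `Spinner` from `@/components/ui/spinner`. That component's source is not part of this diff; purely as an illustration, a minimal wrapper along these lines would be consistent with the call sites (the file name exists in the imports above, but the props and defaults here are assumptions):

```tsx
// Hypothetical sketch of @/components/ui/spinner -- not the actual implementation.
// Centralizing the spinner keeps size and animation consistent across loading states.
import type * as React from "react";
import { Loader2 } from "lucide-react";
import { cn } from "@/lib/utils";

export function Spinner({ className, ...props }: React.ComponentProps<typeof Loader2>) {
	// Callers can override the size, e.g. <Spinner className="h-8 w-8" />
	return <Loader2 className={cn("animate-spin", className)} {...props} />;
}
```

The payoff is visible in the hunks below: every loading state swaps to the same one-line `<Spinner />`, so a styling change later touches one file instead of a dozen.
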
diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx index d277a84ee..d9a894e5a 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx @@ -209,7 +209,7 @@ export function RowActions({ disabled={isDeleting} className="bg-destructive text-destructive-foreground hover:bg-destructive/90" > - {isDeleting ? "Deleting..." : "Delete"} + {isDeleting ? "Deleting" : "Delete"} diff --git a/surfsense_web/app/dashboard/[search_space_id]/editor/[documentId]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/editor/[documentId]/page.tsx index 2320b3b9a..74104f450 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/editor/[documentId]/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/editor/[documentId]/page.tsx @@ -1,8 +1,7 @@ "use client"; -import { useQueryClient } from "@tanstack/react-query"; import { useAtom } from "jotai"; -import { AlertCircle, ArrowLeft, FileText, Loader2, Save } from "lucide-react"; +import { AlertCircle, ArrowLeft, FileText, Save } from "lucide-react"; import { motion } from "motion/react"; import { useParams, useRouter } from "next/navigation"; import { useEffect, useMemo, useState } from "react"; @@ -21,6 +20,7 @@ import { } from "@/components/ui/alert-dialog"; import { Button } from "@/components/ui/button"; import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"; +import { Spinner } from "@/components/ui/spinner"; import { notesApiService } from "@/lib/apis/notes-api.service"; import { authenticatedFetch, getBearerToken, redirectToLogin } from "@/lib/auth-utils"; @@ -78,7 +78,6 @@ function extractTitleFromBlockNote(blocknoteDocument: BlockNoteDocument): string export default function EditorPage() { const params = useParams(); const router = useRouter(); - const queryClient = useQueryClient(); const documentId = params.documentId as string; const searchSpaceId = Number(params.search_space_id); const isNewNote = documentId === "new"; @@ -349,8 +348,8 @@ export default function EditorPage() {
-					<Loader2 … />
-					<p …>Loading editor...</p>
+					<Spinner … />
+					<p …>Loading editor</p>
@@ -437,7 +436,7 @@ export default function EditorPage() { > {saving ? ( <> - + {isNewNote ? "Creating" : "Saving"} ) : ( diff --git a/surfsense_web/app/dashboard/[search_space_id]/more-pages/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/more-pages/page.tsx new file mode 100644 index 000000000..7bb15b78b --- /dev/null +++ b/surfsense_web/app/dashboard/[search_space_id]/more-pages/page.tsx @@ -0,0 +1,210 @@ +"use client"; + +import { IconCalendar, IconMailFilled } from "@tabler/icons-react"; +import { useMutation, useQuery, useQueryClient } from "@tanstack/react-query"; +import { Check, ExternalLink, Gift, Loader2, Mail, Star } from "lucide-react"; +import { motion } from "motion/react"; +import Link from "next/link"; +import { useEffect } from "react"; +import { toast } from "sonner"; +import { Button } from "@/components/ui/button"; +import { Card, CardContent } from "@/components/ui/card"; +import { + Dialog, + DialogContent, + DialogDescription, + DialogHeader, + DialogTitle, + DialogTrigger, +} from "@/components/ui/dialog"; +import { Separator } from "@/components/ui/separator"; +import { Skeleton } from "@/components/ui/skeleton"; +import type { IncentiveTaskInfo } from "@/contracts/types/incentive-tasks.types"; +import { incentiveTasksApiService } from "@/lib/apis/incentive-tasks-api.service"; +import { + trackIncentiveContactOpened, + trackIncentivePageViewed, + trackIncentiveTaskClicked, + trackIncentiveTaskCompleted, +} from "@/lib/posthog/events"; +import { cn } from "@/lib/utils"; + +export default function MorePagesPage() { + const queryClient = useQueryClient(); + + // Track page view on mount + useEffect(() => { + trackIncentivePageViewed(); + }, []); + + // Fetch tasks from API + const { data, isLoading } = useQuery({ + queryKey: ["incentive-tasks"], + queryFn: () => incentiveTasksApiService.getTasks(), + }); + + // Mutation to complete a task + const completeMutation = useMutation({ + mutationFn: incentiveTasksApiService.completeTask, + onSuccess: (response, taskType) => { + if (response.success) { + toast.success(response.message); + // Track task completion + const task = data?.tasks.find((t) => t.task_type === taskType); + if (task) { + trackIncentiveTaskCompleted(taskType, task.pages_reward); + } + // Invalidate queries to refresh data + queryClient.invalidateQueries({ queryKey: ["incentive-tasks"] }); + queryClient.invalidateQueries({ queryKey: ["user"] }); + } + }, + onError: () => { + toast.error("Failed to complete task. Please try again."); + }, + }); + + const handleTaskClick = (task: IncentiveTaskInfo) => { + if (!task.completed) { + trackIncentiveTaskClicked(task.task_type); + completeMutation.mutate(task.task_type); + } + }; + + const allCompleted = data?.tasks.every((t) => t.completed) ?? false; + + return ( +
+ + {/* Header */} +
+ +

Get More Pages

+

Complete tasks to earn additional pages

+
+ + {/* Tasks */} + {isLoading ? ( + + + +
+ + +
+ +
+
+ ) : ( +
+ {data?.tasks.map((task) => ( + + +
+ {task.completed ? : } +
+
+

+ {task.title} +

+

+{task.pages_reward} pages

+
+ +
+
+ ))} +
+ )} + + {/* Contact */} + +
+

+ {allCompleted ? "Thanks! Need even more pages?" : "Need more pages?"} +

+ open && trackIncentiveContactOpened()}> + + + + + + Contact Us + Schedule a meeting or send us an email. + +
+ + + Schedule a Meeting + +
+ + or + +
+ + + eric@surfsense.com + +
+
+
+
+
+
+ ); +} diff --git a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx index a56cd84ce..b5e63ca80 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx @@ -9,6 +9,7 @@ import { import { useQueryClient } from "@tanstack/react-query"; import { useAtomValue, useSetAtom } from "jotai"; import { useParams, useSearchParams } from "next/navigation"; +import { useTranslations } from "next-intl"; import { useCallback, useEffect, useMemo, useRef, useState } from "react"; import { toast } from "sonner"; import { z } from "zod"; @@ -34,6 +35,7 @@ import { GeneratePodcastToolUI } from "@/components/tool-ui/generate-podcast"; import { LinkPreviewToolUI } from "@/components/tool-ui/link-preview"; import { ScrapeWebpageToolUI } from "@/components/tool-ui/scrape-webpage"; import { RecallMemoryToolUI, SaveMemoryToolUI } from "@/components/tool-ui/user-memory"; +import { Spinner } from "@/components/ui/spinner"; import { useChatSessionStateSync } from "@/hooks/use-chat-session-state"; import { useMessagesElectric } from "@/hooks/use-messages-electric"; // import { WriteTodosToolUI } from "@/components/tool-ui/write-todos"; @@ -132,6 +134,7 @@ interface ThinkingStepData { } export default function NewChatPage() { + const t = useTranslations("dashboard"); const params = useParams(); const queryClient = useQueryClient(); const [isInitializing, setIsInitializing] = useState(true); @@ -1379,8 +1382,9 @@ export default function NewChatPage() { // Show loading state only when loading an existing thread if (isInitializing) { return ( -
-			<div …>Loading chat...</div>
-		</div>
+		<div …>
+			<Spinner … />
+			<div …>{t("loading_chat")}</div>
+		</div>
); } diff --git a/surfsense_web/app/dashboard/[search_space_id]/onboard/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/onboard/page.tsx index 25f189203..1b7fa297f 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/onboard/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/onboard/page.tsx @@ -1,7 +1,6 @@ "use client"; import { useAtomValue } from "jotai"; -import { Loader2 } from "lucide-react"; import { motion } from "motion/react"; import { useParams, useRouter } from "next/navigation"; import { useEffect, useRef, useState } from "react"; @@ -17,6 +16,7 @@ import { import { Logo } from "@/components/Logo"; import { LLMConfigForm, type LLMConfigFormData } from "@/components/shared/llm-config-form"; import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; +import { Spinner } from "@/components/ui/spinner"; import { getBearerToken, redirectToLogin } from "@/lib/auth-utils"; export default function OnboardPage() { @@ -156,7 +156,7 @@ export default function OnboardPage() {
-							<Loader2 … />
+							<Spinner … />
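The same refactor routes full-page loading states through `useGlobalLoadingEffect(active, message, variant)`, so a single `GlobalLoadingProvider` owns the overlay and the spinner animation does not reset across page transitions (pages simply `return null` while loading). The hook's implementation is not shown in this diff; the sketch below is inferred only from the call sites, and the jotai atom and provider wiring are assumptions:

```tsx
// Hypothetical sketch of @/hooks/use-global-loading -- inferred from call sites
// such as useGlobalLoadingEffect(isLoading, tCommon("loading"), "login").
// A GlobalLoadingProvider elsewhere is assumed to read this atom and render
// one persistent overlay at the app root.
import { atom, useSetAtom } from "jotai";
import { useEffect } from "react";

type LoadingVariant = "default" | "login";

// Assumed shared atom; null means no page is currently loading.
export const globalLoadingAtom = atom<{ message: string; variant: LoadingVariant } | null>(null);

export function useGlobalLoadingEffect(
	active: boolean,
	message: string,
	variant: LoadingVariant = "default"
) {
	const setLoading = useSetAtom(globalLoadingAtom);

	useEffect(() => {
		// Register this page's loading state while active; clear it when done or on unmount.
		setLoading(active ? { message, variant } : null);
		return () => setLoading(null);
	}, [active, message, variant, setLoading]);
}
```

Because the overlay lives above the route tree, navigating from the auth callback into the dashboard keeps one continuously animating spinner instead of mounting a fresh one per page.
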
diff --git a/surfsense_web/app/dashboard/[search_space_id]/settings/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/settings/page.tsx index fb2f49317..8c8bdb2e9 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/settings/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/settings/page.tsx @@ -5,6 +5,7 @@ import { Bot, Brain, ChevronRight, + FileText, type LucideIcon, Menu, MessageSquare, @@ -15,6 +16,7 @@ import { AnimatePresence, motion } from "motion/react"; import { useParams, useRouter } from "next/navigation"; import { useTranslations } from "next-intl"; import { useCallback, useEffect, useState } from "react"; +import { GeneralSettingsManager } from "@/components/settings/general-settings-manager"; import { LLMRoleManager } from "@/components/settings/llm-role-manager"; import { ModelConfigManager } from "@/components/settings/model-config-manager"; import { PromptConfigManager } from "@/components/settings/prompt-config-manager"; @@ -30,6 +32,12 @@ interface SettingsNavItem { } const settingsNavItems: SettingsNavItem[] = [ + { + id: "general", + labelKey: "nav_general", + descriptionKey: "nav_general_desc", + icon: FileText, + }, { id: "models", labelKey: "nav_agent_configs", @@ -262,6 +270,9 @@ function SettingsContent({ ease: [0.4, 0, 0.2, 1], }} > + {activeSection === "general" && ( + + )} {activeSection === "models" && } {activeSection === "roles" && } {activeSection === "prompts" && } @@ -277,7 +288,7 @@ export default function SettingsPage() { const router = useRouter(); const params = useParams(); const searchSpaceId = Number(params.search_space_id); - const [activeSection, setActiveSection] = useState("models"); + const [activeSection, setActiveSection] = useState("general"); const [isSidebarOpen, setIsSidebarOpen] = useState(false); // Track settings section view diff --git a/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx index b661e9222..298871cf7 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx @@ -14,7 +14,6 @@ import { Hash, Link2, LinkIcon, - Loader2, Logs, type LucideIcon, MessageCircle, @@ -96,6 +95,7 @@ import { SelectTrigger, SelectValue, } from "@/components/ui/select"; +import { Spinner } from "@/components/ui/spinner"; import { Table, TableBody, @@ -105,7 +105,6 @@ import { TableRow, } from "@/components/ui/table"; import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs"; -import { Textarea } from "@/components/ui/textarea"; import type { CreateInviteRequest, DeleteInviteRequest, @@ -122,6 +121,7 @@ import type { Role, UpdateRoleRequest, } from "@/contracts/types/roles.types"; +import type { PermissionInfo } from "@/contracts/types/permissions.types"; import { invitesApiService } from "@/lib/apis/invites-api.service"; import { rolesApiService } from "@/lib/apis/roles-api.service"; import { trackSearchSpaceInviteSent, trackSearchSpaceUsersViewed } from "@/lib/posthog/events"; @@ -321,7 +321,7 @@ export default function TeamManagementPage() { animate={{ opacity: 1, scale: 1 }} className="flex flex-col items-center gap-4" > - +

Loading team data...

@@ -471,13 +471,6 @@ export default function TeamManagementPage() { className="w-full md:w-auto" /> )} - {activeTab === "roles" && hasPermission("roles:create") && ( - - )}
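The hunk above removes the page-level "Create Role" button; the hunk below passes `onCreateRole`/`canCreate` into `RolesTab` instead, so each tab derives its own affordances from capability props computed once at the page level. A sketch of that pattern (the helper and types below are illustrative, not code from this PR):

```tsx
// Illustrative only: compute permission booleans once, pass them down as props,
// and let child tabs decide what to render. Avoids scattering hasPermission()
// checks through deeply nested JSX.
interface RolesTabCapabilities {
	canCreate: boolean;
	canUpdate: boolean;
	canDelete: boolean;
}

function deriveRolesCapabilities(hasPermission: (p: string) => boolean): RolesTabCapabilities {
	return {
		canCreate: hasPermission("roles:create"),
		canUpdate: hasPermission("roles:update"),
		canDelete: hasPermission("roles:delete"),
	};
}

// Usage: <RolesTab {...deriveRolesCapabilities(hasPermission)} ... />
```
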
@@ -499,8 +492,10 @@ export default function TeamManagementPage() { loading={rolesLoading} onUpdateRole={handleUpdateRole} onDeleteRole={handleDeleteRole} + onCreateRole={handleCreateRole} canUpdate={hasPermission("roles:update")} canDelete={hasPermission("roles:delete")} + canCreate={hasPermission("roles:create")} /> @@ -571,7 +566,7 @@ function MembersTab({ if (loading) { return (
-			<Loader2 … />
+			<Spinner … />
); } @@ -767,17 +762,71 @@ function MembersTab({ // ============ Role Permissions Display ============ -const CATEGORY_CONFIG: Record = { - documents: { label: "Documents", icon: FileText, order: 1 }, - chats: { label: "Chats", icon: MessageSquare, order: 2 }, - comments: { label: "Comments", icon: MessageCircle, order: 3 }, - llm_configs: { label: "LLM Configs", icon: Bot, order: 4 }, - podcasts: { label: "Podcasts", icon: Mic, order: 5 }, - connectors: { label: "Connectors", icon: Plug, order: 6 }, - logs: { label: "Logs", icon: Logs, order: 7 }, - members: { label: "Members", icon: Users, order: 8 }, - roles: { label: "Roles", icon: Shield, order: 9 }, - settings: { label: "Settings", icon: Settings, order: 10 }, +// Unified category configuration used across all role-related components +const CATEGORY_CONFIG: Record< + string, + { label: string; icon: LucideIcon; description: string; order: number } +> = { + documents: { + label: "Documents", + icon: FileText, + description: "Manage files, notes, and content", + order: 1, + }, + chats: { + label: "AI Chats", + icon: MessageSquare, + description: "Create and manage AI conversations", + order: 2, + }, + comments: { + label: "Comments", + icon: MessageCircle, + description: "Add annotations to documents", + order: 3, + }, + llm_configs: { + label: "AI Models", + icon: Bot, + description: "Configure AI model settings", + order: 4, + }, + podcasts: { + label: "Podcasts", + icon: Mic, + description: "Generate AI podcasts from content", + order: 5, + }, + connectors: { + label: "Integrations", + icon: Plug, + description: "Connect external data sources", + order: 6, + }, + logs: { + label: "Activity Logs", + icon: Logs, + description: "View and manage audit trail", + order: 7, + }, + members: { + label: "Team Members", + icon: Users, + description: "Manage team membership", + order: 8, + }, + roles: { + label: "Roles", + icon: Shield, + description: "Configure role permissions", + order: 9, + }, + settings: { + label: "Settings", + icon: Settings, + description: "Manage search space settings", + order: 10, + }, }; const ACTION_LABELS: Record = { @@ -893,25 +942,31 @@ function RolePermissionsDisplay({ permissions }: { permissions: string[] }) { function RolesTab({ roles, - groupedPermissions: _groupedPermissions, + groupedPermissions, loading, onUpdateRole: _onUpdateRole, onDeleteRole, + onCreateRole, canUpdate, canDelete, + canCreate, }: { roles: Role[]; - groupedPermissions: Record; + groupedPermissions: Record; loading: boolean; onUpdateRole: (roleId: number, data: { permissions?: string[] }) => Promise; onDeleteRole: (roleId: number) => Promise; + onCreateRole: (data: CreateRoleRequest["data"]) => Promise; canUpdate: boolean; canDelete: boolean; + canCreate: boolean; }) { + const [showCreateRole, setShowCreateRole] = useState(false); + if (loading) { return (
-			<Loader2 … />
+			<Spinner … />
); } @@ -921,123 +976,149 @@ function RolesTab({ initial={{ opacity: 0, y: 10 }} animate={{ opacity: 1, y: 0 }} exit={{ opacity: 0, y: -10 }} - className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4" + className="space-y-6" > - {roles.map((role, index) => ( + {/* Create Role Button / Section */} + {canCreate && !showCreateRole && ( - - {role.is_system_role && ( -
- System Role -
- )} - -
-
-
- -
-
- {role.name} - {role.is_default && ( - - Default - - )} -
-
- {!role.is_system_role && ( - - - - - - {canUpdate && ( - { - // TODO: Implement edit role dialog/modal - }} - > - - Edit Role - - )} - {canDelete && ( - <> - - - - e.preventDefault()} - > - - Delete Role - - - - - Delete role? - - This will permanently delete the "{role.name}" role. Members with - this role will lose their permissions. - - - - Cancel - onDeleteRole(role.id)} - className="bg-destructive text-destructive-foreground hover:bg-destructive/90" - > - Delete - - - - - - )} - - - )} -
- {role.description && ( - {role.description} - )} -
- - - -
+
- ))} + )} + + {/* Create Role Form */} + {showCreateRole && ( + setShowCreateRole(false)} + /> + )} + + {/* Roles Grid */} +
+ {roles.map((role, index) => ( + + + {role.is_system_role && ( +
+ System Role +
+ )} + +
+
+
+ +
+
+ {role.name} + {role.is_default && ( + + Default + + )} +
+
+ {!role.is_system_role && ( + + + + + + {canUpdate && ( + { + // TODO: Implement edit role dialog/modal + }} + > + + Edit Role + + )} + {canDelete && ( + <> + + + + e.preventDefault()} + > + + Delete Role + + + + + Delete role? + + This will permanently delete the "{role.name}" role. Members + with this role will lose their permissions. + + + + Cancel + onDeleteRole(role.id)} + className="bg-destructive text-destructive-foreground hover:bg-destructive/90" + > + Delete + + + + + + )} + + + )} +
+ {role.description && ( + {role.description} + )} +
+ + + +
+
+ ))} +
); } @@ -1068,7 +1149,7 @@ function InvitesTab({ if (loading) { return (
-			<Loader2 … />
+			<Spinner … />
); } @@ -1446,7 +1527,7 @@ function CreateInviteDialog({ - - - - Create Custom Role - - Define a new role with specific permissions for this search space. - - -
-
+ + + +
+
+
+ +
+
+ Create Custom Role + + Define permissions for a new role in this search space + +
+
+ +
+
+ + {/* Quick Start with Presets */} +
+ +
+ {Object.entries(ROLE_PRESETS).map(([key, preset]) => ( + + ))} +
+
+ + {/* Role Details */} +
setName(e.target.value)} />
-
+
+ + {/* Default Role Checkbox */} +
+ setIsDefault(checked === true)} + /> +
+

- New invites without a role will use this + New members without a specific role will be assigned this role

-
- -