From be5715cfebb1070458c702b4aee943a80da47b95 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 22 Jan 2026 22:33:28 +0530 Subject: [PATCH 01/28] feat: add Composio connector types and enhance integration - Introduced new enum values for Composio connectors: COMPOSIO_GOOGLE_DRIVE_CONNECTOR, COMPOSIO_GMAIL_CONNECTOR, and COMPOSIO_GOOGLE_CALENDAR_CONNECTOR. - Updated database migration to add these new enum values to the relevant types. - Refactored Composio integration logic to handle specific connector types, improving the management of connected accounts and indexing processes. - Enhanced frontend components to support the new Composio connector types, including updated UI elements and connector configuration handling. - Improved backend services to manage Composio connected accounts more effectively, including deletion and indexing tasks. --- .../74_add_composio_connector_enums.py | 93 +++++++++------- surfsense_backend/app/db.py | 8 +- .../app/routes/composio_routes.py | 80 ++++++++++---- .../routes/search_source_connectors_routes.py | 88 ++++++++++++++- .../app/services/composio_service.py | 34 ++++++ .../app/tasks/celery_tasks/connector_tasks.py | 10 +- .../app/tasks/composio_indexer.py | 54 +++++---- .../assistant-ui/connector-popup.tsx | 14 ++- .../components/composio-config.tsx | 103 ------------------ .../connector-configs/index.tsx | 4 +- .../constants/connector-constants.ts | 26 ++++- .../constants/connector-popup.schemas.ts | 2 +- .../hooks/use-connector-dialog.ts | 64 ++++++----- .../tabs/all-connectors-tab.tsx | 94 +++++++++------- .../utils/connector-document-mapping.ts | 5 +- surfsense_web/contracts/enums/connector.ts | 4 +- .../contracts/enums/connectorIcons.tsx | 16 ++- .../contracts/types/connector.types.ts | 11 +- .../contracts/types/document.types.ts | 4 +- 19 files changed, 437 insertions(+), 277 deletions(-) diff --git a/surfsense_backend/alembic/versions/74_add_composio_connector_enums.py b/surfsense_backend/alembic/versions/74_add_composio_connector_enums.py index 454b60754..cadf70cb6 100644 --- a/surfsense_backend/alembic/versions/74_add_composio_connector_enums.py +++ b/surfsense_backend/alembic/versions/74_add_composio_connector_enums.py @@ -1,16 +1,21 @@ -"""Add COMPOSIO_CONNECTOR to SearchSourceConnectorType and DocumentType enums +"""Add Composio connector types to SearchSourceConnectorType and DocumentType enums Revision ID: 74 Revises: 73 Create Date: 2026-01-21 -This migration adds the COMPOSIO_CONNECTOR enum value to both: +This migration adds the Composio connector enum values to both: - searchsourceconnectortype (for connector type tracking) - documenttype (for document type tracking) Composio is a managed OAuth integration service that allows connecting to various third-party services (Google Drive, Gmail, Calendar, etc.) without requiring separate OAuth app verification. 
+ +This migration adds three specific connector types: +- COMPOSIO_GOOGLE_DRIVE_CONNECTOR +- COMPOSIO_GMAIL_CONNECTOR +- COMPOSIO_GOOGLE_CALENDAR_CONNECTOR """ from collections.abc import Sequence @@ -23,55 +28,65 @@ down_revision: str | None = "73" branch_labels: str | Sequence[str] | None = None depends_on: str | Sequence[str] | None = None -# Define the ENUM type names and the new value +# Define the ENUM type names and the new values CONNECTOR_ENUM = "searchsourceconnectortype" -CONNECTOR_NEW_VALUE = "COMPOSIO_CONNECTOR" +CONNECTOR_NEW_VALUES = [ + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "COMPOSIO_GMAIL_CONNECTOR", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", +] DOCUMENT_ENUM = "documenttype" -DOCUMENT_NEW_VALUE = "COMPOSIO_CONNECTOR" +DOCUMENT_NEW_VALUES = [ + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "COMPOSIO_GMAIL_CONNECTOR", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", +] def upgrade() -> None: - """Upgrade schema - add COMPOSIO_CONNECTOR to connector and document enums safely.""" - # Add COMPOSIO_CONNECTOR to searchsourceconnectortype only if not exists - op.execute( - f""" - DO $$ - BEGIN - IF NOT EXISTS ( - SELECT 1 FROM pg_enum - WHERE enumlabel = '{CONNECTOR_NEW_VALUE}' - AND enumtypid = (SELECT oid FROM pg_type WHERE typname = '{CONNECTOR_ENUM}') - ) THEN - ALTER TYPE {CONNECTOR_ENUM} ADD VALUE '{CONNECTOR_NEW_VALUE}'; - END IF; - END$$; - """ - ) + """Upgrade schema - add Composio connector types to connector and document enums safely.""" + # Add each Composio connector type to searchsourceconnectortype only if not exists + for value in CONNECTOR_NEW_VALUES: + op.execute( + f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum e + JOIN pg_type t ON e.enumtypid = t.oid + WHERE t.typname = '{CONNECTOR_ENUM}' AND e.enumlabel = '{value}' + ) THEN + ALTER TYPE {CONNECTOR_ENUM} ADD VALUE '{value}'; + END IF; + END$$; + """ + ) - # Add COMPOSIO_CONNECTOR to documenttype only if not exists - op.execute( - f""" - DO $$ - BEGIN - IF NOT EXISTS ( - SELECT 1 FROM pg_enum - WHERE enumlabel = '{DOCUMENT_NEW_VALUE}' - AND enumtypid = (SELECT oid FROM pg_type WHERE typname = '{DOCUMENT_ENUM}') - ) THEN - ALTER TYPE {DOCUMENT_ENUM} ADD VALUE '{DOCUMENT_NEW_VALUE}'; - END IF; - END$$; - """ - ) + # Add each Composio connector type to documenttype only if not exists + for value in DOCUMENT_NEW_VALUES: + op.execute( + f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum e + JOIN pg_type t ON e.enumtypid = t.oid + WHERE t.typname = '{DOCUMENT_ENUM}' AND e.enumlabel = '{value}' + ) THEN + ALTER TYPE {DOCUMENT_ENUM} ADD VALUE '{value}'; + END IF; + END$$; + """ + ) def downgrade() -> None: - """Downgrade schema - remove COMPOSIO_CONNECTOR from connector and document enums. + """Downgrade schema - remove Composio connector types from connector and document enums. Note: PostgreSQL does not support removing enum values directly. To properly downgrade, you would need to: - 1. Delete any rows using the COMPOSIO_CONNECTOR value - 2. Create new enums without COMPOSIO_CONNECTOR + 1. Delete any rows using the Composio connector type values + 2. Create new enums without the Composio connector types 3. Alter the columns to use the new enums 4. 
Drop the old enums diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index b56f37373..705e89ea7 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -54,7 +54,9 @@ class DocumentType(str, Enum): BOOKSTACK_CONNECTOR = "BOOKSTACK_CONNECTOR" CIRCLEBACK = "CIRCLEBACK" NOTE = "NOTE" - COMPOSIO_CONNECTOR = "COMPOSIO_CONNECTOR" # Generic Composio integration + COMPOSIO_GOOGLE_DRIVE_CONNECTOR = "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" + COMPOSIO_GMAIL_CONNECTOR = "COMPOSIO_GMAIL_CONNECTOR" + COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR" class SearchSourceConnectorType(str, Enum): @@ -82,7 +84,9 @@ class SearchSourceConnectorType(str, Enum): BOOKSTACK_CONNECTOR = "BOOKSTACK_CONNECTOR" CIRCLEBACK_CONNECTOR = "CIRCLEBACK_CONNECTOR" MCP_CONNECTOR = "MCP_CONNECTOR" # Model Context Protocol - User-defined API tools - COMPOSIO_CONNECTOR = "COMPOSIO_CONNECTOR" # Generic Composio integration (Google, Slack, etc.) + COMPOSIO_GOOGLE_DRIVE_CONNECTOR = "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" + COMPOSIO_GMAIL_CONNECTOR = "COMPOSIO_GMAIL_CONNECTOR" + COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR" class LiteLLMProvider(str, Enum): diff --git a/surfsense_backend/app/routes/composio_routes.py b/surfsense_backend/app/routes/composio_routes.py index b6f418aa2..77891fc88 100644 --- a/surfsense_backend/app/routes/composio_routes.py +++ b/surfsense_backend/app/routes/composio_routes.py @@ -19,6 +19,7 @@ from fastapi.responses import RedirectResponse from pydantic import ValidationError from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select from app.config import config from app.db import ( @@ -30,15 +31,17 @@ from app.db import ( from app.services.composio_service import ( COMPOSIO_TOOLKIT_NAMES, INDEXABLE_TOOLKITS, + TOOLKIT_TO_CONNECTOR_TYPE, ComposioService, ) from app.users import current_active_user -from app.utils.connector_naming import ( - check_duplicate_connector, - generate_unique_connector_name, -) +from app.utils.connector_naming import generate_unique_connector_name from app.utils.oauth_security import OAuthStateManager +# Note: We no longer use check_duplicate_connector for Composio connectors because +# Composio generates a new connected_account_id each time, even for the same Google account. +# Instead, we check for existing connectors by type/space/user and update them. + logger = logging.getLogger(__name__) router = APIRouter() @@ -260,30 +263,65 @@ async def composio_callback( "is_indexable": toolkit_id in INDEXABLE_TOOLKITS, } - # Check for duplicate connector - # For Composio, we use toolkit_id + connected_account_id as unique identifier - identifier = final_connected_account_id or f"{toolkit_id}_{user_id}" - - is_duplicate = await check_duplicate_connector( - session, - SearchSourceConnectorType.COMPOSIO_CONNECTOR, - space_id, - user_id, - identifier, - ) - if is_duplicate: - logger.warning( - f"Duplicate Composio connector detected for user {user_id} with toolkit {toolkit_id}" + # Get the specific connector type for this toolkit + connector_type_str = TOOLKIT_TO_CONNECTOR_TYPE.get(toolkit_id) + if not connector_type_str: + raise HTTPException( + status_code=400, + detail=f"Unknown toolkit: {toolkit_id}. 
Available: {list(TOOLKIT_TO_CONNECTOR_TYPE.keys())}", ) + connector_type = SearchSourceConnectorType(connector_type_str) + + # Check for existing connector of the same type for this user/space + # When reconnecting, Composio gives a new connected_account_id, so we need to + # check by connector_type, user_id, and search_space_id instead of connected_account_id + existing_connector_result = await session.execute( + select(SearchSourceConnector).where( + SearchSourceConnector.connector_type == connector_type, + SearchSourceConnector.search_space_id == space_id, + SearchSourceConnector.user_id == user_id, + ) + ) + existing_connector = existing_connector_result.scalars().first() + + if existing_connector: + # Delete the old Composio connected account before updating + old_connected_account_id = existing_connector.config.get("composio_connected_account_id") + if old_connected_account_id and old_connected_account_id != final_connected_account_id: + try: + deleted = await service.delete_connected_account(old_connected_account_id) + if deleted: + logger.info( + f"Deleted old Composio connected account {old_connected_account_id} " + f"before updating connector {existing_connector.id}" + ) + else: + logger.warning( + f"Failed to delete old Composio connected account {old_connected_account_id}" + ) + except Exception as delete_error: + # Log but don't fail - the old account may already be deleted + logger.warning( + f"Error deleting old Composio connected account {old_connected_account_id}: {delete_error!s}" + ) + + # Update existing connector with new connected_account_id + logger.info( + f"Updating existing Composio connector {existing_connector.id} with new connected_account_id {final_connected_account_id}" + ) + existing_connector.config = connector_config + await session.commit() + await session.refresh(existing_connector) + return RedirectResponse( - url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&error=duplicate_account&connector=composio-connector" + url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&success=true&connector=composio-connector&connectorId={existing_connector.id}" ) try: # Generate a unique, user-friendly connector name connector_name = await generate_unique_connector_name( session, - SearchSourceConnectorType.COMPOSIO_CONNECTOR, + connector_type, space_id, user_id, f"{toolkit_name} (Composio)", @@ -291,7 +329,7 @@ async def composio_callback( db_connector = SearchSourceConnector( name=connector_name, - connector_type=SearchSourceConnectorType.COMPOSIO_CONNECTOR, + connector_type=connector_type, config=connector_config, search_space_id=space_id, user_id=user_id, diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index d60d08d57..9ad03fba8 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -37,6 +37,7 @@ from app.db import ( async_session_maker, get_async_session, ) +from app.services.composio_service import ComposioService from app.schemas import ( GoogleDriveIndexRequest, MCPConnectorCreate, @@ -529,6 +530,34 @@ async def delete_search_source_connector( f"Failed to delete periodic schedule for connector {connector_id}" ) + # For Composio connectors, also delete the connected account in Composio + composio_connector_types = [ + SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, + 
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, + ] + if db_connector.connector_type in composio_connector_types: + composio_connected_account_id = db_connector.config.get("composio_connected_account_id") + if composio_connected_account_id and ComposioService.is_enabled(): + try: + service = ComposioService() + deleted = await service.delete_connected_account(composio_connected_account_id) + if deleted: + logger.info( + f"Successfully deleted Composio connected account {composio_connected_account_id} " + f"for connector {connector_id}" + ) + else: + logger.warning( + f"Failed to delete Composio connected account {composio_connected_account_id} " + f"for connector {connector_id}" + ) + except Exception as composio_error: + # Log but don't fail the deletion - Composio account may already be deleted + logger.warning( + f"Error deleting Composio connected account {composio_connected_account_id}: {composio_error!s}" + ) + await session.delete(db_connector) await session.commit() return {"message": "Search source connector deleted successfully"} @@ -868,7 +897,11 @@ async def index_connector_content( ) response_message = "Web page indexing started in the background." - elif connector.connector_type == SearchSourceConnectorType.COMPOSIO_CONNECTOR: + elif connector.connector_type in [ + SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, + ]: from app.tasks.celery_tasks.connector_tasks import ( index_composio_connector_task, ) @@ -2086,6 +2119,59 @@ async def run_bookstack_indexing( ) +async def run_composio_indexing_with_new_session( + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str, + end_date: str, +): + """ + Create a new session and run the Composio indexing task. + This prevents session leaks by creating a dedicated session for the background task. + """ + async with async_session_maker() as session: + await run_composio_indexing( + session, connector_id, search_space_id, user_id, start_date, end_date + ) + + +async def run_composio_indexing( + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str, + end_date: str, +): + """ + Run Composio connector indexing with real-time notifications. + + This wraps the Composio indexer with the notification system so that + Electric SQL can sync indexing progress to the frontend in real-time. 
+ + Args: + session: Database session + connector_id: ID of the Composio connector + search_space_id: ID of the search space + user_id: ID of the user + start_date: Start date for indexing + end_date: End date for indexing + """ + from app.tasks.composio_indexer import index_composio_connector + + await _run_indexing_with_notifications( + session=session, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + start_date=start_date, + end_date=end_date, + indexing_function=index_composio_connector, + update_timestamp_func=_update_connector_timestamp_by_id, + ) + + # ============================================================================= # MCP Connector Routes # ============================================================================= diff --git a/surfsense_backend/app/services/composio_service.py b/surfsense_backend/app/services/composio_service.py index 4b6a32b03..17fbd64e0 100644 --- a/surfsense_backend/app/services/composio_service.py +++ b/surfsense_backend/app/services/composio_service.py @@ -39,6 +39,20 @@ COMPOSIO_TOOLKIT_NAMES = { # Toolkits that support indexing (Phase 1: Google services only) INDEXABLE_TOOLKITS = {"googledrive", "gmail", "googlecalendar"} +# Mapping of toolkit IDs to connector types +TOOLKIT_TO_CONNECTOR_TYPE = { + "googledrive": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "gmail": "COMPOSIO_GMAIL_CONNECTOR", + "googlecalendar": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", +} + +# Mapping of toolkit IDs to document types +TOOLKIT_TO_DOCUMENT_TYPE = { + "googledrive": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "gmail": "COMPOSIO_GMAIL_CONNECTOR", + "googlecalendar": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", +} + class ComposioService: """Service for interacting with Composio API.""" @@ -298,6 +312,26 @@ class ComposioService: logger.error(f"Failed to list connections for user {user_id}: {e!s}") return [] + async def delete_connected_account(self, connected_account_id: str) -> bool: + """ + Delete a connected account from Composio. + + This permanently removes the connected account and revokes access tokens. + + Args: + connected_account_id: The Composio connected account ID to delete. + + Returns: + True if deletion was successful, False otherwise. 
+ """ + try: + self.client.connected_accounts.delete(connected_account_id) + logger.info(f"Successfully deleted Composio connected account: {connected_account_id}") + return True + except Exception as e: + logger.error(f"Failed to delete Composio connected account {connected_account_id}: {e!s}") + return False + async def execute_tool( self, connected_account_id: str, diff --git a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py index 72cedb40f..307b5a551 100644 --- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py @@ -793,11 +793,13 @@ async def _index_composio_connector( start_date: str, end_date: str, ): - """Index Composio connector content with new session.""" - # Import from tasks folder (not connector_indexers) to avoid circular import - from app.tasks.composio_indexer import index_composio_connector + """Index Composio connector content with new session and real-time notifications.""" + # Import from routes to use the notification-wrapped version + from app.routes.search_source_connectors_routes import ( + run_composio_indexing, + ) async with get_celery_session_maker()() as session: - await index_composio_connector( + await run_composio_indexing( session, connector_id, search_space_id, user_id, start_date, end_date ) diff --git a/surfsense_backend/app/tasks/composio_indexer.py b/surfsense_backend/app/tasks/composio_indexer.py index 01d2cfce4..8762561ee 100644 --- a/surfsense_backend/app/tasks/composio_indexer.py +++ b/surfsense_backend/app/tasks/composio_indexer.py @@ -23,7 +23,7 @@ from app.db import ( SearchSourceConnector, SearchSourceConnectorType, ) -from app.services.composio_service import INDEXABLE_TOOLKITS +from app.services.composio_service import INDEXABLE_TOOLKITS, TOOLKIT_TO_DOCUMENT_TYPE from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -58,15 +58,13 @@ async def check_document_by_unique_identifier( async def get_connector_by_id( - session: AsyncSession, connector_id: int, connector_type: SearchSourceConnectorType + session: AsyncSession, connector_id: int, connector_type: SearchSourceConnectorType | None ) -> SearchSourceConnector | None: - """Get a connector by ID and type from the database.""" - result = await session.execute( - select(SearchSourceConnector).filter( - SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type == connector_type, - ) - ) + """Get a connector by ID and optionally by type from the database.""" + query = select(SearchSourceConnector).filter(SearchSourceConnector.id == connector_id) + if connector_type is not None: + query = query.filter(SearchSourceConnector.connector_type == connector_type) + result = await session.execute(query) return result.scalars().first() @@ -129,10 +127,23 @@ async def index_composio_connector( ) try: - # Get connector by id + # Get connector by id - accept any Composio connector type + # We'll check the actual type after loading connector = await get_connector_by_id( - session, connector_id, SearchSourceConnectorType.COMPOSIO_CONNECTOR + session, connector_id, None # Don't filter by type, we'll validate after ) + + # Validate it's a Composio connector + if connector and connector.connector_type not in [ + SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR, + 
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, + ]: + error_msg = f"Connector {connector_id} is not a Composio connector" + await task_logger.log_task_failure( + log_entry, error_msg, {"error_type": "InvalidConnectorType"} + ) + return 0, error_msg if not connector: error_msg = f"Composio connector with ID {connector_id} not found" @@ -276,7 +287,7 @@ async def _index_composio_google_drive( await task_logger.log_task_success( log_entry, success_msg, {"files_count": 0} ) - return 0, success_msg + return 0, None # Return None (not error) when no items found - this is success with 0 items logger.info(f"Found {len(all_files)} Google Drive files to index via Composio") @@ -299,8 +310,9 @@ async def _index_composio_google_drive( continue # Generate unique identifier hash + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]) unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.COMPOSIO_CONNECTOR, f"drive_{file_id}", search_space_id + document_type, f"drive_{file_id}", search_space_id ) # Check if document exists @@ -394,7 +406,7 @@ async def _index_composio_google_drive( document = Document( search_space_id=search_space_id, title=f"Drive: {file_name}", - document_type=DocumentType.COMPOSIO_CONNECTOR, + document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]), document_metadata={ "file_id": file_id, "file_name": file_name, @@ -489,7 +501,7 @@ async def _index_composio_gmail( await task_logger.log_task_success( log_entry, success_msg, {"messages_count": 0} ) - return 0, success_msg + return 0, None # Return None (not error) when no items found - this is success with 0 items logger.info(f"Found {len(messages)} Gmail messages to index via Composio") @@ -530,8 +542,9 @@ async def _index_composio_gmail( markdown_content = composio_connector.format_gmail_message_to_markdown(message) # Generate unique identifier + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]) unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.COMPOSIO_CONNECTOR, f"gmail_{message_id}", search_space_id + document_type, f"gmail_{message_id}", search_space_id ) content_hash = generate_content_hash(markdown_content, search_space_id) @@ -612,7 +625,7 @@ async def _index_composio_gmail( document = Document( search_space_id=search_space_id, title=f"Gmail: {subject}", - document_type=DocumentType.COMPOSIO_CONNECTOR, + document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]), document_metadata={ "message_id": message_id, "subject": subject, @@ -717,7 +730,7 @@ async def _index_composio_google_calendar( await task_logger.log_task_success( log_entry, success_msg, {"events_count": 0} ) - return 0, success_msg + return 0, None # Return None (not error) when no items found - this is success with 0 items logger.info(f"Found {len(events)} Google Calendar events to index via Composio") @@ -738,8 +751,9 @@ async def _index_composio_google_calendar( markdown_content = composio_connector.format_calendar_event_to_markdown(event) # Generate unique identifier + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"]) unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.COMPOSIO_CONNECTOR, f"calendar_{event_id}", search_space_id + document_type, f"calendar_{event_id}", search_space_id ) content_hash = generate_content_hash(markdown_content, search_space_id) @@ -828,7 +842,7 @@ async def _index_composio_google_calendar( document = Document( search_space_id=search_space_id, title=f"Calendar: {summary}", - 
document_type=DocumentType.COMPOSIO_CONNECTOR, + document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"]), document_metadata={ "event_id": event_id, "summary": summary, diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index 1f4341d07..228b12836 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -188,8 +188,18 @@ export const ConnectorIndicator: FC = () => { searchSpaceId={searchSpaceId} connectedToolkits={ (connectors || []) - .filter((c: SearchSourceConnector) => c.connector_type === "COMPOSIO_CONNECTOR") - .map((c: SearchSourceConnector) => c.config?.toolkit_id as string) + .filter((c: SearchSourceConnector) => + c.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" || + c.connector_type === "COMPOSIO_GMAIL_CONNECTOR" || + c.connector_type === "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR" + ) + .map((c: SearchSourceConnector) => { + // Map connector type back to toolkit_id + if (c.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR") return "googledrive"; + if (c.connector_type === "COMPOSIO_GMAIL_CONNECTOR") return "gmail"; + if (c.connector_type === "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR") return "googlecalendar"; + return c.config?.toolkit_id as string; + }) .filter(Boolean) } onBack={handleBackFromComposio} diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx index 6fe37e1e5..a96f906fe 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx @@ -1,7 +1,5 @@ "use client"; -import { ExternalLink, Info, Zap } from "lucide-react"; -import Image from "next/image"; import type { FC } from "react"; import { Badge } from "@/components/ui/badge"; import type { SearchSourceConnector } from "@/contracts/types/connector.types"; @@ -13,92 +11,13 @@ interface ComposioConfigProps { onNameChange?: (name: string) => void; } -// Get toolkit display info -const getToolkitInfo = (toolkitId: string): { name: string; icon: string; description: string } => { - switch (toolkitId) { - case "googledrive": - return { - name: "Google Drive", - icon: "/connectors/google-drive.svg", - description: "Files and documents from Google Drive", - }; - case "gmail": - return { - name: "Gmail", - icon: "/connectors/google-gmail.svg", - description: "Emails from Gmail", - }; - case "googlecalendar": - return { - name: "Google Calendar", - icon: "/connectors/google-calendar.svg", - description: "Events from Google Calendar", - }; - case "slack": - return { - name: "Slack", - icon: "/connectors/slack.svg", - description: "Messages from Slack", - }; - case "notion": - return { - name: "Notion", - icon: "/connectors/notion.svg", - description: "Pages from Notion", - }; - case "github": - return { - name: "GitHub", - icon: "/connectors/github.svg", - description: "Repositories from GitHub", - }; - default: - return { - name: toolkitId, - icon: "/connectors/composio.svg", - description: "Connected via Composio", - }; - } -}; - export const ComposioConfig: FC = ({ connector }) => { const toolkitId = connector.config?.toolkit_id as string; - const toolkitName = connector.config?.toolkit_name as string; const isIndexable = 
connector.config?.is_indexable as boolean; const composioAccountId = connector.config?.composio_connected_account_id as string; - const toolkitInfo = getToolkitInfo(toolkitId); - return (
-      {/* Toolkit Info Card */}
-      {/* …card markup lost in extraction; it rendered the toolkit icon, {toolkitName || toolkitInfo.name}, a "Composio" badge, and {toolkitInfo.description} */}
      {/* Connection Details */}
@@ -133,28 +52,6 @@ export const ComposioConfig: FC = ({ connector }) => {
        )}
-      {/* Info Banner */}
-      {/* …banner markup lost in extraction; it read "This connection uses Composio's managed OAuth, which means you don't need to wait for app verification. Your data is securely accessed through Composio." and linked to "Learn more about Composio" */}
); }; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx index a7a92597c..160185b1e 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx @@ -74,7 +74,9 @@ export function getConnectorConfigComponent( return CirclebackConfig; case "MCP_CONNECTOR": return MCPConfig; - case "COMPOSIO_CONNECTOR": + case "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": + case "COMPOSIO_GMAIL_CONNECTOR": + case "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": return ComposioConfig; // OAuth connectors (Gmail, Calendar, Airtable, Notion) and others don't need special config UI default: diff --git a/surfsense_web/components/assistant-ui/connector-popup/constants/connector-constants.ts b/surfsense_web/components/assistant-ui/connector-popup/constants/connector-constants.ts index 7646d7a9b..11066f28a 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/constants/connector-constants.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/constants/connector-constants.ts @@ -168,14 +168,28 @@ export const OTHER_CONNECTORS = [ }, ] as const; -// Composio Connector (Single entry that opens toolkit selector) +// Composio Connectors - Individual entries for each supported toolkit export const COMPOSIO_CONNECTORS = [ { - id: "composio-connector", - title: "Composio", - description: "Connect 100+ apps via Composio (Google, Slack, Notion, etc.)", - connectorType: EnumConnectorName.COMPOSIO_CONNECTOR, - // No authEndpoint - handled via toolkit selector view + id: "composio-googledrive", + title: "Google Drive", + description: "Search your Drive files via Composio", + connectorType: EnumConnectorName.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, + authEndpoint: "/api/v1/auth/composio/connector/add/?toolkit_id=googledrive", + }, + { + id: "composio-gmail", + title: "Gmail", + description: "Search through your emails via Composio", + connectorType: EnumConnectorName.COMPOSIO_GMAIL_CONNECTOR, + authEndpoint: "/api/v1/auth/composio/connector/add/?toolkit_id=gmail", + }, + { + id: "composio-googlecalendar", + title: "Google Calendar", + description: "Search through your events via Composio", + connectorType: EnumConnectorName.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, + authEndpoint: "/api/v1/auth/composio/connector/add/?toolkit_id=googlecalendar", }, ] as const; diff --git a/surfsense_web/components/assistant-ui/connector-popup/constants/connector-popup.schemas.ts b/surfsense_web/components/assistant-ui/connector-popup/constants/connector-popup.schemas.ts index d74d66203..c7e77f666 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/constants/connector-popup.schemas.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/constants/connector-popup.schemas.ts @@ -7,7 +7,7 @@ import { searchSourceConnectorTypeEnum } from "@/contracts/types/connector.types export const connectorPopupQueryParamsSchema = z.object({ modal: z.enum(["connectors"]).optional(), tab: z.enum(["all", "active"]).optional(), - view: z.enum(["configure", "edit", "connect", "youtube", "accounts", "mcp-list"]).optional(), + view: z.enum(["configure", "edit", "connect", "youtube", "accounts", "mcp-list", "composio"]).optional(), connector: z.string().optional(), connectorId: z.string().optional(), connectorType: z.string().optional(), diff --git 
a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index c6ef1a927..4a177ac36 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -26,7 +26,7 @@ import { import { cacheKeys } from "@/lib/query-client/cache-keys"; import { queryClient } from "@/lib/query-client/client"; import type { IndexingConfigState } from "../constants/connector-constants"; -import { OAUTH_CONNECTORS, OTHER_CONNECTORS } from "../constants/connector-constants"; +import { COMPOSIO_CONNECTORS, OAUTH_CONNECTORS, OTHER_CONNECTORS } from "../constants/connector-constants"; import { dateRangeSchema, frequencyMinutesSchema, @@ -176,15 +176,24 @@ export const useConnectorDialog = () => { } // Handle accounts view - if (params.view === "accounts" && params.connectorType && !viewingAccountsType) { - const oauthConnector = OAUTH_CONNECTORS.find( - (c) => c.connectorType === params.connectorType - ); - if (oauthConnector) { - setViewingAccountsType({ - connectorType: oauthConnector.connectorType, - connectorTitle: oauthConnector.title, - }); + if (params.view === "accounts" && params.connectorType) { + // Update state if not set, or if connectorType has changed + const needsUpdate = !viewingAccountsType || + viewingAccountsType.connectorType !== params.connectorType; + + if (needsUpdate) { + // Check both OAUTH_CONNECTORS and COMPOSIO_CONNECTORS + const oauthConnector = OAUTH_CONNECTORS.find( + (c) => c.connectorType === params.connectorType + ) || COMPOSIO_CONNECTORS.find( + (c) => c.connectorType === params.connectorType + ); + if (oauthConnector) { + setViewingAccountsType({ + connectorType: oauthConnector.connectorType, + connectorTitle: oauthConnector.title, + }); + } } } @@ -293,6 +302,8 @@ export const useConnectorDialog = () => { indexingConfig, connectingConnectorType, viewingAccountsType, + viewingMCPList, + viewingComposio, ]); // Detect OAuth success / Failure and transition to config view @@ -389,15 +400,19 @@ export const useConnectorDialog = () => { // Handle OAuth connection const handleConnectOAuth = useCallback( - async (connector: (typeof OAUTH_CONNECTORS)[number]) => { + async (connector: (typeof OAUTH_CONNECTORS)[number] | (typeof COMPOSIO_CONNECTORS)[number]) => { if (!searchSpaceId || !connector.authEndpoint) return; // Set connecting state immediately to disable button and show spinner setConnectingId(connector.id); try { + // Check if authEndpoint already has query parameters + const separator = connector.authEndpoint.includes("?") ? 
"&" : "?"; + const url = `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}${connector.authEndpoint}${separator}space_id=${searchSpaceId}`; + const response = await authenticatedFetch( - `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}${connector.authEndpoint}?space_id=${searchSpaceId}`, + url, { method: "GET" } ); @@ -799,23 +814,19 @@ export const useConnectorDialog = () => { // Handle viewing accounts list for OAuth connector type const handleViewAccountsList = useCallback( - (connectorType: string, connectorTitle: string) => { + (connectorType: string, _connectorTitle?: string) => { if (!searchSpaceId) return; - setViewingAccountsType({ - connectorType, - connectorTitle, - }); - // Update URL to show accounts view, preserving current tab + // The useEffect will handle setting viewingAccountsType based on URL params const url = new URL(window.location.href); url.searchParams.set("modal", "connectors"); url.searchParams.set("view", "accounts"); url.searchParams.set("connectorType", connectorType); // Keep the current tab in URL so we can go back to it - window.history.pushState({ modal: true }, "", url.toString()); + router.replace(url.pathname + url.search, { scroll: false }); }, - [searchSpaceId] + [searchSpaceId, router] ); // Handle going back from accounts list view @@ -839,8 +850,8 @@ export const useConnectorDialog = () => { const url = new URL(window.location.href); url.searchParams.set("modal", "connectors"); url.searchParams.set("view", "mcp-list"); - window.history.pushState({ modal: true }, "", url.toString()); - }, [searchSpaceId]); + router.replace(url.pathname + url.search, { scroll: false }); + }, [searchSpaceId, router]); // Handle going back from MCP list view const handleBackFromMCPList = useCallback(() => { @@ -871,8 +882,8 @@ export const useConnectorDialog = () => { const url = new URL(window.location.href); url.searchParams.set("modal", "connectors"); url.searchParams.set("view", "composio"); - window.history.pushState({ modal: true }, "", url.toString()); - }, [searchSpaceId]); + router.replace(url.pathname + url.search, { scroll: false }); + }, [searchSpaceId, router]); // Handle going back from Composio view const handleBackFromComposio = useCallback(() => { @@ -1423,7 +1434,7 @@ export const useConnectorDialog = () => { setIsDisconnecting(false); } }, - [editingConnector, searchSpaceId, deleteConnector, router] + [editingConnector, searchSpaceId, deleteConnector, router, cameFromMCPList] ); // Handle quick index (index without date picker, uses backend defaults) @@ -1579,6 +1590,7 @@ export const useConnectorDialog = () => { viewingAccountsType, viewingMCPList, viewingComposio, + connectingComposioToolkit, // Setters setSearchQuery, @@ -1616,8 +1628,6 @@ export const useConnectorDialog = () => { setIndexingConnectorConfig, // Composio - viewingComposio, - connectingComposioToolkit, handleOpenComposio, handleBackFromComposio, handleConnectComposioToolkit, diff --git a/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx b/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx index 1b36b3b81..4a0680200 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx @@ -4,7 +4,6 @@ import type { FC } from "react"; import { EnumConnectorName } from "@/contracts/enums/connector"; import type { SearchSourceConnector } from "@/contracts/types/connector.types"; import { ConnectorCard } from 
"../components/connector-card"; -import { ComposioConnectorCard } from "../components/composio-connector-card"; import { CRAWLERS, OAUTH_CONNECTORS, OTHER_CONNECTORS, COMPOSIO_CONNECTORS } from "../constants/connector-constants"; import { getDocumentCountForConnector } from "../utils/connector-document-mapping"; @@ -29,13 +28,12 @@ interface AllConnectorsTabProps { allConnectors: SearchSourceConnector[] | undefined; documentTypeCounts?: Record; indexingConnectorIds?: Set; - onConnectOAuth: (connector: (typeof OAUTH_CONNECTORS)[number]) => void; + onConnectOAuth: (connector: (typeof OAUTH_CONNECTORS)[number] | (typeof COMPOSIO_CONNECTORS)[number]) => void; onConnectNonOAuth?: (connectorType: string) => void; onCreateWebcrawler?: () => void; onCreateYouTubeCrawler?: () => void; onManage?: (connector: SearchSourceConnector) => void; onViewAccountsList?: (connectorType: string, connectorTitle: string) => void; - onOpenComposio?: () => void; } export const AllConnectorsTab: FC = ({ @@ -51,7 +49,6 @@ export const AllConnectorsTab: FC = ({ onCreateYouTubeCrawler, onManage, onViewAccountsList, - onOpenComposio, }) => { // Filter connectors based on search const filteredOAuth = OAUTH_CONNECTORS.filter( @@ -79,23 +76,16 @@ export const AllConnectorsTab: FC = ({ c.description.toLowerCase().includes(searchQuery.toLowerCase()) ); - // Count Composio connectors - const composioConnectorCount = allConnectors - ? allConnectors.filter( - (c: SearchSourceConnector) => c.connector_type === EnumConnectorName.COMPOSIO_CONNECTOR - ).length - : 0; - return (
-      {/* Quick Connect */}
-      {filteredOAuth.length > 0 && (
        {/* …section wrapper and heading markup lost in extraction; heading changed: */}
-          Quick Connect
+          Managed OAuth
- {filteredOAuth.map((connector) => { + {filteredComposio.map((connector) => { const isConnected = connectedTypes.has(connector.connectorType); const isConnecting = connectingId === connector.id; @@ -109,17 +99,6 @@ export const AllConnectorsTab: FC = ({ const accountCount = typeConnectors.length; - // Get the most recent last_indexed_at across all accounts - const mostRecentLastIndexed = typeConnectors.reduce( - (latest, c) => { - if (!c.last_indexed_at) return latest; - if (!latest) return c.last_indexed_at; - return new Date(c.last_indexed_at) > new Date(latest) - ? c.last_indexed_at - : latest; - }, - undefined - ); const documentCount = getDocumentCountForConnector( connector.connectorType, @@ -154,26 +133,57 @@ export const AllConnectorsTab: FC = ({
)} - {/* Composio Integrations */} - {filteredComposio.length > 0 && onOpenComposio && ( + {/* Quick Connect */} + {filteredOAuth.length > 0 && (
        {/* …section wrapper, heading, and badge markup lost in extraction; heading and badge changed: */}
-          Managed OAuth
-          No verification needed
+          Quick Connect
- {filteredComposio.map((connector) => ( - - ))} + {filteredOAuth.map((connector) => { + const isConnected = connectedTypes.has(connector.connectorType); + const isConnecting = connectingId === connector.id; + + // Find all connectors of this type + const typeConnectors = + isConnected && allConnectors + ? allConnectors.filter( + (c: SearchSourceConnector) => c.connector_type === connector.connectorType + ) + : []; + + const accountCount = typeConnectors.length; + + + const documentCount = getDocumentCountForConnector( + connector.connectorType, + documentTypeCounts + ); + + // Check if any account is currently indexing + const isIndexing = typeConnectors.some((c) => indexingConnectorIds?.has(c.id)); + + return ( + onConnectOAuth(connector)} + onManage={ + isConnected && onViewAccountsList + ? () => onViewAccountsList(connector.connectorType, connector.title) + : undefined + } + /> + ); + })}
)} diff --git a/surfsense_web/components/assistant-ui/connector-popup/utils/connector-document-mapping.ts b/surfsense_web/components/assistant-ui/connector-popup/utils/connector-document-mapping.ts index 522e1763c..ded3bdcca 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/utils/connector-document-mapping.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/utils/connector-document-mapping.ts @@ -30,7 +30,10 @@ export const CONNECTOR_TO_DOCUMENT_TYPE: Record = { // Special mappings (connector type differs from document type) GOOGLE_DRIVE_CONNECTOR: "GOOGLE_DRIVE_FILE", WEBCRAWLER_CONNECTOR: "CRAWLED_URL", - COMPOSIO_CONNECTOR: "COMPOSIO_CONNECTOR", + // Composio connectors map to their own document types + COMPOSIO_GOOGLE_DRIVE_CONNECTOR: "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + COMPOSIO_GMAIL_CONNECTOR: "COMPOSIO_GMAIL_CONNECTOR", + COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", }; /** diff --git a/surfsense_web/contracts/enums/connector.ts b/surfsense_web/contracts/enums/connector.ts index e1fb1e3f2..20d6093b6 100644 --- a/surfsense_web/contracts/enums/connector.ts +++ b/surfsense_web/contracts/enums/connector.ts @@ -24,5 +24,7 @@ export enum EnumConnectorName { YOUTUBE_CONNECTOR = "YOUTUBE_CONNECTOR", CIRCLEBACK_CONNECTOR = "CIRCLEBACK_CONNECTOR", MCP_CONNECTOR = "MCP_CONNECTOR", - COMPOSIO_CONNECTOR = "COMPOSIO_CONNECTOR", + COMPOSIO_GOOGLE_DRIVE_CONNECTOR = "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + COMPOSIO_GMAIL_CONNECTOR = "COMPOSIO_GMAIL_CONNECTOR", + COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", } diff --git a/surfsense_web/contracts/enums/connectorIcons.tsx b/surfsense_web/contracts/enums/connectorIcons.tsx index 947c886d5..a1e6c9040 100644 --- a/surfsense_web/contracts/enums/connectorIcons.tsx +++ b/surfsense_web/contracts/enums/connectorIcons.tsx @@ -66,8 +66,12 @@ export const getConnectorIcon = (connectorType: EnumConnectorName | string, clas return ; case EnumConnectorName.MCP_CONNECTOR: return MCP; - case EnumConnectorName.COMPOSIO_CONNECTOR: - return Composio; + case EnumConnectorName.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: + return Google Drive; + case EnumConnectorName.COMPOSIO_GMAIL_CONNECTOR: + return Gmail; + case EnumConnectorName.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: + return Google Calendar; // Additional cases for non-enum connector types case "YOUTUBE_CONNECTOR": return YouTube; @@ -87,8 +91,12 @@ export const getConnectorIcon = (connectorType: EnumConnectorName | string, clas return ; case "GOOGLE_DRIVE_FILE": return ; - case "COMPOSIO_CONNECTOR": - return Composio; + case "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": + return Google Drive; + case "COMPOSIO_GMAIL_CONNECTOR": + return Gmail; + case "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": + return Google Calendar; case "NOTE": return ; case "EXTENSION": diff --git a/surfsense_web/contracts/types/connector.types.ts b/surfsense_web/contracts/types/connector.types.ts index 861bf1758..d52469ce9 100644 --- a/surfsense_web/contracts/types/connector.types.ts +++ b/surfsense_web/contracts/types/connector.types.ts @@ -27,7 +27,9 @@ export const searchSourceConnectorTypeEnum = z.enum([ "BOOKSTACK_CONNECTOR", "CIRCLEBACK_CONNECTOR", "MCP_CONNECTOR", - "COMPOSIO_CONNECTOR", + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "COMPOSIO_GMAIL_CONNECTOR", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", ]); export const searchSourceConnector = z.object({ @@ -149,6 +151,13 @@ export const googleDriveIndexBody = z.object({ name: z.string(), }) ), + indexing_options: z + .object({ + 
max_files_per_folder: z.number().int().min(1).max(1000), + incremental_sync: z.boolean(), + include_subfolders: z.boolean(), + }) + .optional(), }); /** diff --git a/surfsense_web/contracts/types/document.types.ts b/surfsense_web/contracts/types/document.types.ts index a8f3a3b38..01a58173e 100644 --- a/surfsense_web/contracts/types/document.types.ts +++ b/surfsense_web/contracts/types/document.types.ts @@ -25,7 +25,9 @@ export const documentTypeEnum = z.enum([ "CIRCLEBACK", "SURFSENSE_DOCS", "NOTE", - "COMPOSIO_CONNECTOR", + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "COMPOSIO_GMAIL_CONNECTOR", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", ]); export const document = z.object({ From 6139b07a66f859adc80d93310e420209eeb3f2e0 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 23 Jan 2026 03:57:02 +0530 Subject: [PATCH 02/28] fix: remove toolkit view --- .../assistant-ui/connector-popup.tsx | 34 +- .../hooks/use-connector-dialog.ts | 79 ----- .../tabs/all-connectors-tab.tsx | 2 +- .../views/composio-toolkit-view.tsx | 301 ------------------ 4 files changed, 2 insertions(+), 414 deletions(-) delete mode 100644 surfsense_web/components/assistant-ui/connector-popup/views/composio-toolkit-view.tsx diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index 228b12836..a1108f7c8 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -21,7 +21,6 @@ import { useConnectorDialog } from "./connector-popup/hooks/use-connector-dialog import { useIndexingConnectors } from "./connector-popup/hooks/use-indexing-connectors"; import { ActiveConnectorsTab } from "./connector-popup/tabs/active-connectors-tab"; import { AllConnectorsTab } from "./connector-popup/tabs/all-connectors-tab"; -import { ComposioToolkitView } from "./connector-popup/views/composio-toolkit-view"; import { ConnectorAccountsListView } from "./connector-popup/views/connector-accounts-list-view"; import { YouTubeCrawlerView } from "./connector-popup/views/youtube-crawler-view"; @@ -88,12 +87,6 @@ export const ConnectorIndicator: FC = () => { setConnectorConfig, setIndexingConnectorConfig, setConnectorName, - // Composio - viewingComposio, - connectingComposioToolkit, - handleOpenComposio, - handleBackFromComposio, - handleConnectComposioToolkit, } = useConnectorDialog(); // Fetch connectors using Electric SQL + PGlite for real-time updates @@ -142,7 +135,7 @@ export const ConnectorIndicator: FC = () => { // Check which connectors are already connected // Using Electric SQL + PGlite for real-time connector updates - const connectedTypes = new Set( + const connectedTypes = new Set( (connectors || []).map((c: SearchSourceConnector) => c.connector_type) ); @@ -183,30 +176,6 @@ export const ConnectorIndicator: FC = () => { {/* YouTube Crawler View - shown when adding YouTube videos */} {isYouTubeView && searchSpaceId ? ( - ) : viewingComposio && searchSpaceId ? 
( - - c.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" || - c.connector_type === "COMPOSIO_GMAIL_CONNECTOR" || - c.connector_type === "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR" - ) - .map((c: SearchSourceConnector) => { - // Map connector type back to toolkit_id - if (c.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR") return "googledrive"; - if (c.connector_type === "COMPOSIO_GMAIL_CONNECTOR") return "gmail"; - if (c.connector_type === "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR") return "googlecalendar"; - return c.config?.toolkit_id as string; - }) - .filter(Boolean) - } - onBack={handleBackFromComposio} - onConnectToolkit={handleConnectComposioToolkit} - isConnecting={connectingComposioToolkit !== null} - connectingToolkitId={connectingComposioToolkit} - /> ) : viewingMCPList ? ( { onCreateYouTubeCrawler={handleCreateYouTubeCrawler} onManage={handleStartEdit} onViewAccountsList={handleViewAccountsList} - onOpenComposio={handleOpenComposio} /> diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index 4a177ac36..3ea1aab48 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -83,9 +83,6 @@ export const useConnectorDialog = () => { // MCP list view state (for managing multiple MCP connectors) const [viewingMCPList, setViewingMCPList] = useState(false); - // Composio toolkit view state - const [viewingComposio, setViewingComposio] = useState(false); - const [connectingComposioToolkit, setConnectingComposioToolkit] = useState(null); // Track if we came from accounts list when entering edit mode const [cameFromAccountsList, setCameFromAccountsList] = useState<{ @@ -159,17 +156,6 @@ export const useConnectorDialog = () => { setViewingMCPList(true); } - // Clear Composio view if view is not "composio" anymore - if (params.view !== "composio" && viewingComposio) { - setViewingComposio(false); - setConnectingComposioToolkit(null); - } - - // Handle Composio view - if (params.view === "composio" && !viewingComposio) { - setViewingComposio(true); - } - // Handle connect view if (params.view === "connect" && params.connectorType && !connectingConnectorType) { setConnectingConnectorType(params.connectorType); @@ -303,7 +289,6 @@ export const useConnectorDialog = () => { connectingConnectorType, viewingAccountsType, viewingMCPList, - viewingComposio, ]); // Detect OAuth success / Failure and transition to config view @@ -872,63 +857,6 @@ export const useConnectorDialog = () => { router.replace(url.pathname + url.search, { scroll: false }); }, [router]); - // Handle opening Composio toolkit view - const handleOpenComposio = useCallback(() => { - if (!searchSpaceId) return; - - setViewingComposio(true); - - // Update URL to show Composio view - const url = new URL(window.location.href); - url.searchParams.set("modal", "connectors"); - url.searchParams.set("view", "composio"); - router.replace(url.pathname + url.search, { scroll: false }); - }, [searchSpaceId, router]); - - // Handle going back from Composio view - const handleBackFromComposio = useCallback(() => { - setViewingComposio(false); - setConnectingComposioToolkit(null); - const url = new URL(window.location.href); - url.searchParams.set("modal", "connectors"); - url.searchParams.delete("view"); - router.replace(url.pathname + url.search, { scroll: false }); - }, [router]); - - // 
Handle connecting a Composio toolkit - const handleConnectComposioToolkit = useCallback( - async (toolkitId: string) => { - if (!searchSpaceId) return; - - setConnectingComposioToolkit(toolkitId); - - try { - const response = await authenticatedFetch( - `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/auth/composio/connector/add?space_id=${searchSpaceId}&toolkit_id=${toolkitId}`, - { method: "GET" } - ); - - if (!response.ok) { - throw new Error(`Failed to initiate Composio OAuth for ${toolkitId}`); - } - - const data = await response.json(); - - if (data.auth_url) { - // Redirect to Composio OAuth - window.location.href = data.auth_url; - } else { - throw new Error("No authorization URL received from Composio"); - } - } catch (error) { - console.error("Error connecting Composio toolkit:", error); - toast.error(`Failed to connect ${toolkitId}. Please try again.`); - setConnectingComposioToolkit(null); - } - }, - [searchSpaceId] - ); - // Handle starting indexing const handleStartIndexing = useCallback( async (refreshConnectors: () => void) => { @@ -1589,8 +1517,6 @@ export const useConnectorDialog = () => { allConnectors, viewingAccountsType, viewingMCPList, - viewingComposio, - connectingComposioToolkit, // Setters setSearchQuery, @@ -1626,10 +1552,5 @@ export const useConnectorDialog = () => { connectorConfig, setConnectorConfig, setIndexingConnectorConfig, - - // Composio - handleOpenComposio, - handleBackFromComposio, - handleConnectComposioToolkit, }; }; diff --git a/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx b/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx index 4a0680200..ffe879d5d 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx @@ -82,7 +82,7 @@ export const AllConnectorsTab: FC = ({ {filteredComposio.length > 0 && (
        {/* …heading markup lost in extraction; heading changed: */}
-          Managed OAuth
+          Managed OAuth (Composio)
{filteredComposio.map((connector) => { diff --git a/surfsense_web/components/assistant-ui/connector-popup/views/composio-toolkit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/views/composio-toolkit-view.tsx deleted file mode 100644 index 456835597..000000000 --- a/surfsense_web/components/assistant-ui/connector-popup/views/composio-toolkit-view.tsx +++ /dev/null @@ -1,301 +0,0 @@ -"use client"; - -import { - ArrowLeft, - Calendar, - Check, - ExternalLink, - Github, - Loader2, - Mail, - HardDrive, - MessageSquare, - FileText, - Zap, -} from "lucide-react"; -import Image from "next/image"; -import type { FC } from "react"; -import { useState } from "react"; -import { Badge } from "@/components/ui/badge"; -import { Button } from "@/components/ui/button"; -import { cn } from "@/lib/utils"; - -interface ComposioToolkit { - id: string; - name: string; - description: string; - isIndexable: boolean; -} - -interface ComposioToolkitViewProps { - searchSpaceId: string; - connectedToolkits: string[]; - onBack: () => void; - onConnectToolkit: (toolkitId: string) => void; - isConnecting: boolean; - connectingToolkitId: string | null; -} - -// Available Composio toolkits -const COMPOSIO_TOOLKITS: ComposioToolkit[] = [ - { - id: "googledrive", - name: "Google Drive", - description: "Search your Drive files and documents", - isIndexable: true, - }, - { - id: "gmail", - name: "Gmail", - description: "Search through your emails", - isIndexable: true, - }, - { - id: "googlecalendar", - name: "Google Calendar", - description: "Search through your events", - isIndexable: true, - }, - { - id: "slack", - name: "Slack", - description: "Search Slack messages", - isIndexable: false, - }, - { - id: "notion", - name: "Notion", - description: "Search Notion pages", - isIndexable: false, - }, - { - id: "github", - name: "GitHub", - description: "Search repositories and code", - isIndexable: false, - }, -]; - -// Get icon for toolkit -const getToolkitIcon = (toolkitId: string, className?: string) => { - const iconClass = className || "size-5"; - - switch (toolkitId) { - case "googledrive": - return Google Drive; - case "gmail": - return Gmail; - case "googlecalendar": - return Google Calendar; - case "slack": - return Slack; - case "notion": - return Notion; - case "github": - return GitHub; - default: - return ; - } -}; - -export const ComposioToolkitView: FC = ({ - searchSpaceId, - connectedToolkits, - onBack, - onConnectToolkit, - isConnecting, - connectingToolkitId, -}) => { - const [hoveredToolkit, setHoveredToolkit] = useState(null); - - // Separate indexable and non-indexable toolkits - const indexableToolkits = COMPOSIO_TOOLKITS.filter((t) => t.isIndexable); - const nonIndexableToolkits = COMPOSIO_TOOLKITS.filter((t) => !t.isIndexable); - - return ( -
-		{/* Toolkit view markup, summarized:
-		    Header — back button, Composio logo, title "Composio", subtitle
-		    "Connect 100+ apps with managed OAuth - no verification needed",
-		    and a "Powered by Composio" badge.
-		    "Google Services" section (badge "Indexable") — "Connect Google services
-		    via Composio's verified OAuth app. Your data will be indexed and
-		    searchable." Maps indexableToolkits to cards showing icon, name, and
-		    description, with a "Connected" badge for linked accounts and a connect
-		    button wired to onConnectToolkit.
-		    "More Integrations" section (badge "Coming Soon") — "Connect these
-		    services for future indexing support. Currently available for connection
-		    only." Maps nonIndexableToolkits to cards marked "Soon".
-		    Footer "Why use Composio?" — "Composio provides pre-verified OAuth apps,
-		    so you don't need to wait for Google app verification. Your data is
-		    securely processed through Composio's managed authentication." */}
- ); -}; From 4cbf80d73a74170a532cf1b531d7a9d670cc4663 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 23 Jan 2026 04:44:37 +0530 Subject: [PATCH 03/28] feat: enhance Composio integration with pagination and improved error handling - Updated the list_gmail_messages method to support pagination with page tokens, allowing for more efficient message retrieval. - Modified the return structure to include next_page_token and result_size_estimate for better client-side handling. - Improved error handling and logging throughout the Gmail indexing process, ensuring better visibility into failures. - Implemented batch processing for Gmail messages, committing changes incrementally to prevent data loss. - Ensured consistent timestamp updates for connectors, even when no documents are indexed, to maintain accurate UI states. - Refactored the indexing logic to streamline message processing and enhance overall performance. --- .../app/connectors/composio_connector.py | 15 +- .../routes/search_source_connectors_routes.py | 16 +- .../app/services/composio_service.py | 54 +- .../app/tasks/composio_indexer.py | 579 ++++++++++++------ 4 files changed, 451 insertions(+), 213 deletions(-) diff --git a/surfsense_backend/app/connectors/composio_connector.py b/surfsense_backend/app/connectors/composio_connector.py index 18fd9564c..21e339d12 100644 --- a/surfsense_backend/app/connectors/composio_connector.py +++ b/surfsense_backend/app/connectors/composio_connector.py @@ -151,21 +151,23 @@ class ComposioConnector: async def list_gmail_messages( self, query: str = "", - max_results: int = 100, - ) -> tuple[list[dict[str, Any]], str | None]: + max_results: int = 50, + page_token: str | None = None, + ) -> tuple[list[dict[str, Any]], str | None, int | None, str | None]: """ - List Gmail messages via Composio. + List Gmail messages via Composio with pagination support. Args: query: Gmail search query. - max_results: Maximum number of messages. + max_results: Maximum number of messages per page (default: 50). + page_token: Optional pagination token for next page. Returns: - Tuple of (messages list, error message). + Tuple of (messages list, next_page_token, result_size_estimate, error message). 
""" connected_account_id = await self.get_connected_account_id() if not connected_account_id: - return [], "No connected account ID found" + return [], None, None, "No connected account ID found" entity_id = await self.get_entity_id() service = await self._get_service() @@ -174,6 +176,7 @@ class ComposioConnector: entity_id=entity_id, query=query, max_results=max_results, + page_token=page_token, ) async def get_gmail_message_detail( diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 9ad03fba8..1578ad0d5 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -957,7 +957,7 @@ async def _update_connector_timestamp_by_id(session: AsyncSession, connector_id: connector = result.scalars().first() if connector: - connector.last_indexed_at = datetime.now() + connector.last_indexed_at = datetime.now(UTC) # Use UTC for timezone consistency await session.commit() logger.info(f"Updated last_indexed_at for connector {connector_id}") except Exception as e: @@ -1097,18 +1097,22 @@ async def _run_indexing_with_notifications( ) await update_timestamp_func(session, connector_id) + await session.commit() # Commit timestamp update logger.info( f"Indexing completed successfully: {documents_processed} documents processed" ) # Update notification on success if notification: + # Refresh notification to ensure it's not stale after timestamp update commit + await session.refresh(notification) await NotificationService.connector_indexing.notify_indexing_completed( session=session, notification=notification, indexed_count=documents_processed, error_message=None, ) + await session.commit() # Commit to ensure Electric SQL syncs the notification update elif documents_processed > 0: # Update notification to storing stage if notification: @@ -1124,24 +1128,30 @@ async def _run_indexing_with_notifications( f"Indexing completed successfully: {documents_processed} documents processed" ) if notification: + # Refresh notification to ensure it's not stale after indexing function commits + await session.refresh(notification) await NotificationService.connector_indexing.notify_indexing_completed( session=session, notification=notification, indexed_count=documents_processed, error_message=None, ) + await session.commit() # Commit to ensure Electric SQL syncs the notification update else: # No new documents processed - check if this is an error or just no changes if error_or_warning: # Actual failure logger.error(f"Indexing failed: {error_or_warning}") if notification: + # Refresh notification to ensure it's not stale after indexing function commits + await session.refresh(notification) await NotificationService.connector_indexing.notify_indexing_completed( session=session, notification=notification, indexed_count=0, error_message=error_or_warning, ) + await session.commit() # Commit to ensure Electric SQL syncs the notification update else: # Success - just no new documents to index (all skipped/unchanged) logger.info( @@ -1150,13 +1160,17 @@ async def _run_indexing_with_notifications( # Still update timestamp so ElectricSQL syncs and clears "Syncing" UI if update_timestamp_func: await update_timestamp_func(session, connector_id) + await session.commit() # Commit timestamp update if notification: + # Refresh notification to ensure it's not stale after timestamp update commit + await session.refresh(notification) await 
NotificationService.connector_indexing.notify_indexing_completed( session=session, notification=notification, indexed_count=0, error_message=None, # No error - sync succeeded ) + await session.commit() # Commit to ensure Electric SQL syncs the notification update except Exception as e: logger.error(f"Error in indexing task: {e!s}", exc_info=True) diff --git a/surfsense_backend/app/services/composio_service.py b/surfsense_backend/app/services/composio_service.py index 17fbd64e0..e32cbf8a0 100644 --- a/surfsense_backend/app/services/composio_service.py +++ b/surfsense_backend/app/services/composio_service.py @@ -256,7 +256,6 @@ class ComposioService: "user_id": getattr(acc, "user_id", None), }) - logger.info(f"DEBUG: Found {len(result)} TOTAL connections in Composio") return result except Exception as e: logger.error(f"Failed to list all connections: {e!s}") @@ -273,7 +272,6 @@ class ComposioService: List of connected account details. """ try: - logger.info(f"DEBUG: Calling connected_accounts.list(user_id='{user_id}')") accounts_response = self.client.connected_accounts.list(user_id=user_id) # Handle paginated response (may have .items attribute) or direct list @@ -358,7 +356,6 @@ class ComposioService: # - connected_account_id: for authentication # - user_id: user identifier (SDK uses user_id, not entity_id) # - dangerously_skip_version_check: skip version check for manual execution - logger.info(f"DEBUG: Executing tool {tool_name} with params: {params}") result = self.client.tools.execute( slug=tool_name, connected_account_id=connected_account_id, @@ -366,8 +363,6 @@ class ComposioService: arguments=params or {}, dangerously_skip_version_check=True, ) - logger.info(f"DEBUG: Tool {tool_name} raw result type: {type(result)}") - logger.info(f"DEBUG: Tool {tool_name} raw result: {result}") return {"success": True, "data": result} except Exception as e: logger.error(f"Failed to execute tool {tool_name}: {e!s}") @@ -417,7 +412,6 @@ class ComposioService: return [], None, result.get("error", "Unknown error") data = result.get("data", {}) - logger.info(f"DEBUG: Drive data type: {type(data)}, keys: {data.keys() if isinstance(data, dict) else 'N/A'}") # Handle nested response structure from Composio files = [] @@ -429,7 +423,6 @@ class ComposioService: elif isinstance(data, list): files = data - logger.info(f"DEBUG: Extracted {len(files)} drive files") return files, next_token, None except Exception as e: @@ -478,25 +471,30 @@ class ComposioService: connected_account_id: str, entity_id: str, query: str = "", - max_results: int = 100, - ) -> tuple[list[dict[str, Any]], str | None]: + max_results: int = 50, + page_token: str | None = None, + ) -> tuple[list[dict[str, Any]], str | None, int | None, str | None]: """ - List Gmail messages via Composio. + List Gmail messages via Composio with pagination support. Args: connected_account_id: Composio connected account ID. entity_id: The entity/user ID that owns the connected account. query: Gmail search query. - max_results: Maximum number of messages to return. + max_results: Maximum number of messages to return per page (default: 50 to avoid payload size issues). + page_token: Optional pagination token for next page. Returns: - Tuple of (messages list, error message). + Tuple of (messages list, next_page_token, result_size_estimate, error message). 
""" try: - # Composio uses snake_case for parameters, max is 500 - params = {"max_results": min(max_results, 500)} + # Use smaller batch size to avoid 413 payload too large errors + # Composio uses snake_case for parameters + params = {"max_results": min(max_results, 50)} # Reduced from 500 to 50 if query: params["query"] = query # Composio uses 'query' not 'q' + if page_token: + params["page_token"] = page_token result = await self.execute_tool( connected_account_id=connected_account_id, @@ -506,25 +504,38 @@ class ComposioService: ) if not result.get("success"): - return [], result.get("error", "Unknown error") + return [], None, result.get("error", "Unknown error") data = result.get("data", {}) - logger.info(f"DEBUG: Gmail data type: {type(data)}, keys: {data.keys() if isinstance(data, dict) else 'N/A'}") - logger.info(f"DEBUG: Gmail full data: {data}") # Try different possible response structures messages = [] + next_token = None + result_size_estimate = None if isinstance(data, dict): messages = data.get("messages", []) or data.get("data", {}).get("messages", []) or data.get("emails", []) + # Check for pagination token in various possible locations + next_token = ( + data.get("nextPageToken") + or data.get("next_page_token") + or data.get("data", {}).get("nextPageToken") + or data.get("data", {}).get("next_page_token") + ) + # Extract resultSizeEstimate if available (Gmail API provides this) + result_size_estimate = ( + data.get("resultSizeEstimate") + or data.get("result_size_estimate") + or data.get("data", {}).get("resultSizeEstimate") + or data.get("data", {}).get("result_size_estimate") + ) elif isinstance(data, list): messages = data - logger.info(f"DEBUG: Extracted {len(messages)} messages") - return messages, None + return messages, next_token, result_size_estimate, None except Exception as e: logger.error(f"Failed to list Gmail messages: {e!s}") - return [], str(e) + return [], None, str(e) async def get_gmail_message_detail( self, connected_account_id: str, entity_id: str, message_id: str @@ -603,8 +614,6 @@ class ComposioService: return [], result.get("error", "Unknown error") data = result.get("data", {}) - logger.info(f"DEBUG: Calendar data type: {type(data)}, keys: {data.keys() if isinstance(data, dict) else 'N/A'}") - logger.info(f"DEBUG: Calendar full data: {data}") # Try different possible response structures events = [] @@ -613,7 +622,6 @@ class ComposioService: elif isinstance(data, list): events = data - logger.info(f"DEBUG: Extracted {len(events)} calendar events") return events, None except Exception as e: diff --git a/surfsense_backend/app/tasks/composio_indexer.py b/surfsense_backend/app/tasks/composio_indexer.py index 8762561ee..c9cd74234 100644 --- a/surfsense_backend/app/tasks/composio_indexer.py +++ b/surfsense_backend/app/tasks/composio_indexer.py @@ -9,6 +9,7 @@ to avoid circular import issues with the connector_indexers package. 
import logging from datetime import UTC, datetime +from typing import Any from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession @@ -26,6 +27,7 @@ from app.db import ( from app.services.composio_service import INDEXABLE_TOOLKITS, TOOLKIT_TO_DOCUMENT_TYPE from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService +from app.tasks.connector_indexers.base import calculate_date_range from app.utils.document_converters import ( create_document_chunks, generate_content_hash, @@ -75,7 +77,7 @@ async def update_connector_last_indexed( ) -> None: """Update the last_indexed_at timestamp for a connector.""" if update_last_indexed: - connector.last_indexed_at = datetime.now() + connector.last_indexed_at = datetime.now(UTC) # Use UTC for timezone consistency logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") @@ -287,6 +289,9 @@ async def _index_composio_google_drive( await task_logger.log_task_success( log_entry, success_msg, {"files_count": 0} ) + # CRITICAL: Update timestamp even when no files found so Electric SQL syncs and UI shows indexed status + await update_connector_last_indexed(session, connector, update_last_indexed) + await session.commit() return 0, None # Return None (not error) when no items found - this is success with 0 items logger.info(f"Found {len(all_files)} Google Drive files to index via Composio") @@ -380,6 +385,13 @@ async def _index_composio_google_drive( existing_document.updated_at = get_current_timestamp() documents_indexed += 1 + + # Batch commit every 10 documents + if documents_indexed % 10 == 0: + logger.info( + f"Committing batch: {documents_indexed} Google Drive files processed so far" + ) + await session.commit() continue # Create new document @@ -425,7 +437,11 @@ async def _index_composio_google_drive( session.add(document) documents_indexed += 1 + # Batch commit every 10 documents if documents_indexed % 10 == 0: + logger.info( + f"Committing batch: {documents_indexed} Google Drive files processed so far" + ) await session.commit() except Exception as e: @@ -433,10 +449,19 @@ async def _index_composio_google_drive( documents_skipped += 1 continue - if documents_indexed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) + # Final commit to ensure all documents are persisted (safety net) + # This matches the pattern used in non-Composio Gmail indexer + logger.info( + f"Final commit: Total {documents_indexed} Google Drive files processed" + ) await session.commit() + logger.info( + "Successfully committed all Composio Google Drive document changes to database" + ) await task_logger.log_task_success( log_entry, @@ -454,154 +479,89 @@ async def _index_composio_google_drive( return 0, f"Failed to index Google Drive via Composio: {e!s}" -async def _index_composio_gmail( +async def _process_gmail_message_batch( session: AsyncSession, - connector, + messages: list[dict[str, Any]], + composio_connector: ComposioConnector, connector_id: int, search_space_id: int, user_id: str, - start_date: str | None, - end_date: str | None, - task_logger: TaskLoggingService, - log_entry, - update_last_indexed: bool = True, - max_items: int = 1000, -) -> tuple[int, str]: - """Index Gmail messages via 
Composio.""" - try: - composio_connector = ComposioConnector(session, connector_id) + total_documents_indexed: int = 0, +) -> tuple[int, int]: + """ + Process a batch of Gmail messages and index them. + + Args: + total_documents_indexed: Running total of documents indexed so far (for batch commits). + + Returns: + Tuple of (documents_indexed, documents_skipped) + """ + documents_indexed = 0 + documents_skipped = 0 - await task_logger.log_task_progress( - log_entry, - f"Fetching Gmail messages via Composio for connector {connector_id}", - {"stage": "fetching_messages"}, - ) + for message in messages: + try: + # Composio uses 'messageId' (camelCase), not 'id' + message_id = message.get("messageId", "") or message.get("id", "") + if not message_id: + documents_skipped += 1 + continue - # Build query with date range - query_parts = [] - if start_date: - query_parts.append(f"after:{start_date.replace('-', '/')}") - if end_date: - query_parts.append(f"before:{end_date.replace('-', '/')}") - query = " ".join(query_parts) + # Composio's GMAIL_FETCH_EMAILS already returns full message content + # No need for a separate detail API call - messages, error = await composio_connector.list_gmail_messages( - query=query, - max_results=max_items, - ) + # Extract message info from Composio response + # Composio structure: messageId, messageText, messageTimestamp, payload.headers, labelIds + payload = message.get("payload", {}) + headers = payload.get("headers", []) - if error: - await task_logger.log_task_failure( - log_entry, f"Failed to fetch Gmail messages: {error}", {} + subject = "No Subject" + sender = "Unknown Sender" + date_str = message.get("messageTimestamp", "Unknown Date") + + for header in headers: + name = header.get("name", "").lower() + value = header.get("value", "") + if name == "subject": + subject = value + elif name == "from": + sender = value + elif name == "date": + date_str = value + + # Format to markdown using the full message data + markdown_content = composio_connector.format_gmail_message_to_markdown(message) + + # Check for empty content (defensive parsing per Composio best practices) + if not markdown_content.strip(): + logger.warning(f"Skipping Gmail message with no content: {subject}") + documents_skipped += 1 + continue + + # Generate unique identifier + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]) + unique_identifier_hash = generate_unique_identifier_hash( + document_type, f"gmail_{message_id}", search_space_id ) - return 0, f"Failed to fetch Gmail messages: {error}" - if not messages: - success_msg = "No Gmail messages found in the specified date range" - await task_logger.log_task_success( - log_entry, success_msg, {"messages_count": 0} + content_hash = generate_content_hash(markdown_content, search_space_id) + + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash ) - return 0, None # Return None (not error) when no items found - this is success with 0 items - logger.info(f"Found {len(messages)} Gmail messages to index via Composio") + # Get label IDs from Composio response + label_ids = message.get("labelIds", []) + # Extract thread_id if available (for consistency with non-Composio implementation) + thread_id = message.get("threadId", "") or message.get("thread_id", "") - documents_indexed = 0 - documents_skipped = 0 - - for message in messages: - try: - # Composio uses 'messageId' (camelCase), not 'id' - message_id = message.get("messageId", "") or message.get("id", "") - if not message_id: + if 
existing_document: + if existing_document.content_hash == content_hash: documents_skipped += 1 continue - # Composio's GMAIL_FETCH_EMAILS already returns full message content - # No need for a separate detail API call - - # Extract message info from Composio response - # Composio structure: messageId, messageText, messageTimestamp, payload.headers, labelIds - payload = message.get("payload", {}) - headers = payload.get("headers", []) - - subject = "No Subject" - sender = "Unknown Sender" - date_str = message.get("messageTimestamp", "Unknown Date") - - for header in headers: - name = header.get("name", "").lower() - value = header.get("value", "") - if name == "subject": - subject = value - elif name == "from": - sender = value - elif name == "date": - date_str = value - - # Format to markdown using the full message data - markdown_content = composio_connector.format_gmail_message_to_markdown(message) - - # Generate unique identifier - document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]) - unique_identifier_hash = generate_unique_identifier_hash( - document_type, f"gmail_{message_id}", search_space_id - ) - - content_hash = generate_content_hash(markdown_content, search_space_id) - - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - # Get label IDs from Composio response - label_ids = message.get("labelIds", []) - - if existing_document: - if existing_document.content_hash == content_hash: - documents_skipped += 1 - continue - - # Update existing - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "message_id": message_id, - "subject": subject, - "sender": sender, - "document_type": "Gmail Message (Composio)", - } - summary_content, summary_embedding = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" - summary_embedding = config.embedding_model_instance.embed(summary_content) - - chunks = await create_document_chunks(markdown_content) - - existing_document.title = f"Gmail: {subject}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "message_id": message_id, - "subject": subject, - "sender": sender, - "date": date_str, - "labels": label_ids, - "connector_id": connector_id, - "source": "composio", - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - continue - - # Create new document + # Update existing user_llm = await get_user_long_context_llm( session, user_id, search_space_id ) @@ -609,6 +569,7 @@ async def _index_composio_gmail( if user_llm: document_metadata = { "message_id": message_id, + "thread_id": thread_id, "subject": subject, "sender": sender, "document_type": "Gmail Message (Composio)", @@ -622,53 +583,276 @@ async def _index_composio_gmail( chunks = await create_document_chunks(markdown_content) - document = Document( - search_space_id=search_space_id, - title=f"Gmail: {subject}", - document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]), - document_metadata={ - "message_id": message_id, - "subject": subject, - "sender": sender, - "date": date_str, - "labels": label_ids, - "connector_id": connector_id, - "toolkit_id": "gmail", - "source": "composio", - }, - content=summary_content, - 
content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - ) - session.add(document) + existing_document.title = f"Gmail: {subject}" + existing_document.content = summary_content + existing_document.content_hash = content_hash + existing_document.embedding = summary_embedding + existing_document.document_metadata = { + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "date": date_str, + "labels": label_ids, + "connector_id": connector_id, + "source": "composio", + } + existing_document.chunks = chunks + existing_document.updated_at = get_current_timestamp() + documents_indexed += 1 - - if documents_indexed % 10 == 0: + + # Batch commit every 10 documents + current_total = total_documents_indexed + documents_indexed + if current_total % 10 == 0: + logger.info( + f"Committing batch: {current_total} Gmail messages processed so far" + ) await session.commit() - - except Exception as e: - logger.error(f"Error processing Gmail message: {e!s}", exc_info=True) - documents_skipped += 1 continue - if documents_indexed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) + # Create new document + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + if user_llm: + document_metadata = { + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "document_type": "Gmail Message (Composio)", + } + summary_content, summary_embedding = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" + summary_embedding = config.embedding_model_instance.embed(summary_content) + + chunks = await create_document_chunks(markdown_content) + + document = Document( + search_space_id=search_space_id, + title=f"Gmail: {subject}", + document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]), + document_metadata={ + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "date": date_str, + "labels": label_ids, + "connector_id": connector_id, + "toolkit_id": "gmail", + "source": "composio", + }, + content=summary_content, + content_hash=content_hash, + unique_identifier_hash=unique_identifier_hash, + embedding=summary_embedding, + chunks=chunks, + updated_at=get_current_timestamp(), + ) + session.add(document) + documents_indexed += 1 + + # Batch commit every 10 documents + current_total = total_documents_indexed + documents_indexed + if current_total % 10 == 0: + logger.info( + f"Committing batch: {current_total} Gmail messages processed so far" + ) + await session.commit() + + except Exception as e: + logger.error(f"Error processing Gmail message: {e!s}", exc_info=True) + documents_skipped += 1 + # Rollback on error to avoid partial state (per Composio best practices) + try: + await session.rollback() + except Exception as rollback_error: + logger.error(f"Error during rollback: {rollback_error!s}", exc_info=True) + continue + + return documents_indexed, documents_skipped + + +async def _index_composio_gmail( + session: AsyncSession, + connector, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str | None, + end_date: str | None, + task_logger: TaskLoggingService, + log_entry, + update_last_indexed: bool = True, + max_items: int = 1000, +) -> tuple[int, str]: + """Index Gmail messages via Composio 
with pagination and incremental processing.""" + try: + composio_connector = ComposioConnector(session, connector_id) + + # Normalize date values - handle "undefined" strings from frontend + if start_date == "undefined" or start_date == "": + start_date = None + if end_date == "undefined" or end_date == "": + end_date = None + + # Calculate date range with defaults (uses last_indexed_at or 365 days back) + # This ensures indexing works even when user doesn't specify dates + start_date_str, end_date_str = calculate_date_range( + connector, start_date, end_date, default_days_back=365 + ) + + # Build query with date range + query_parts = [] + if start_date_str: + query_parts.append(f"after:{start_date_str.replace('-', '/')}") + if end_date_str: + query_parts.append(f"before:{end_date_str.replace('-', '/')}") + query = " ".join(query_parts) if query_parts else "" + + logger.info( + f"Gmail query for connector {connector_id}: '{query}' " + f"(start_date={start_date_str}, end_date={end_date_str})" + ) + + # Use smaller batch size to avoid 413 payload too large errors + batch_size = 50 + page_token = None + total_documents_indexed = 0 + total_documents_skipped = 0 + total_messages_fetched = 0 + result_size_estimate = None # Will be set from first API response + + while total_messages_fetched < max_items: + # Calculate how many messages to fetch in this batch + remaining = max_items - total_messages_fetched + current_batch_size = min(batch_size, remaining) + + # Use result_size_estimate if available, otherwise fall back to max_items + estimated_total = result_size_estimate if result_size_estimate is not None else max_items + # Cap estimated_total at max_items to avoid showing misleading progress + estimated_total = min(estimated_total, max_items) + + await task_logger.log_task_progress( + log_entry, + f"Fetching Gmail messages batch via Composio for connector {connector_id} " + f"({total_messages_fetched}/{estimated_total} fetched, {total_documents_indexed} indexed)", + { + "stage": "fetching_messages", + "batch_size": current_batch_size, + "total_fetched": total_messages_fetched, + "total_indexed": total_documents_indexed, + "estimated_total": estimated_total, + }, + ) + + # Fetch batch of messages + messages, next_token, result_size_estimate_batch, error = await composio_connector.list_gmail_messages( + query=query, + max_results=current_batch_size, + page_token=page_token, + ) + + if error: + await task_logger.log_task_failure( + log_entry, f"Failed to fetch Gmail messages: {error}", {} + ) + return 0, f"Failed to fetch Gmail messages: {error}" + + if not messages: + # No more messages available + break + + # Update result_size_estimate from first response (Gmail provides this estimate) + if result_size_estimate is None and result_size_estimate_batch is not None: + result_size_estimate = result_size_estimate_batch + logger.info(f"Gmail API estimated {result_size_estimate} total messages for query: '{query}'") + + total_messages_fetched += len(messages) + # Recalculate estimated_total after potentially updating result_size_estimate + estimated_total = result_size_estimate if result_size_estimate is not None else max_items + estimated_total = min(estimated_total, max_items) + + logger.info( + f"Fetched batch of {len(messages)} Gmail messages " + f"(total: {total_messages_fetched}/{estimated_total})" + ) + + # Process batch incrementally + batch_indexed, batch_skipped = await _process_gmail_message_batch( + session=session, + messages=messages, + composio_connector=composio_connector, + 
connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + total_documents_indexed=total_documents_indexed, + ) + + total_documents_indexed += batch_indexed + total_documents_skipped += batch_skipped + + logger.info( + f"Processed batch: {batch_indexed} indexed, {batch_skipped} skipped " + f"(total: {total_documents_indexed} indexed, {total_documents_skipped} skipped)" + ) + + # Batch commits happen in _process_gmail_message_batch every 10 documents + # This ensures progress is saved incrementally, preventing data loss on crashes + + # Check if we should continue + if not next_token: + # No more pages available + break + + if len(messages) < current_batch_size: + # Last page had fewer items than requested, we're done + break + + # Continue with next page + page_token = next_token + + if total_messages_fetched == 0: + success_msg = "No Gmail messages found in the specified date range" + await task_logger.log_task_success( + log_entry, success_msg, {"messages_count": 0} + ) + # CRITICAL: Update timestamp even when no messages found so Electric SQL syncs and UI shows indexed status + await update_connector_last_indexed(session, connector, update_last_indexed) + await session.commit() + return 0, None # Return None (not error) when no items found + + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) + + # Final commit to ensure all documents are persisted (safety net) + # This matches the pattern used in non-Composio Gmail indexer + logger.info( + f"Final commit: Total {total_documents_indexed} Gmail messages processed" + ) await session.commit() + logger.info( + "Successfully committed all Composio Gmail document changes to database" + ) await task_logger.log_task_success( log_entry, f"Successfully completed Gmail indexing via Composio for connector {connector_id}", { - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, + "documents_indexed": total_documents_indexed, + "documents_skipped": total_documents_skipped, + "messages_fetched": total_messages_fetched, }, ) - return documents_indexed, None + return total_documents_indexed, None except Exception as e: logger.error(f"Failed to index Gmail via Composio: {e!s}", exc_info=True) @@ -689,8 +873,6 @@ async def _index_composio_google_calendar( max_items: int = 2500, ) -> tuple[int, str]: """Index Google Calendar events via Composio.""" - from datetime import datetime, timedelta - try: composio_connector = ComposioConnector(session, connector_id) @@ -700,18 +882,26 @@ async def _index_composio_google_calendar( {"stage": "fetching_events"}, ) - # Build time range - if start_date: - time_min = f"{start_date}T00:00:00Z" - else: - # Default to 365 days ago - default_start = datetime.now() - timedelta(days=365) - time_min = default_start.strftime("%Y-%m-%dT00:00:00Z") + # Normalize date values - handle "undefined" strings from frontend + if start_date == "undefined" or start_date == "": + start_date = None + if end_date == "undefined" or end_date == "": + end_date = None - if end_date: - time_max = f"{end_date}T23:59:59Z" - else: - time_max = datetime.now().strftime("%Y-%m-%dT23:59:59Z") + # Calculate date range with defaults (uses last_indexed_at or 365 days back) + # This ensures indexing works even when user doesn't specify dates + start_date_str, end_date_str = calculate_date_range( + connector, start_date, 
end_date, default_days_back=365 + ) + + # Build time range for API call + time_min = f"{start_date_str}T00:00:00Z" + time_max = f"{end_date_str}T23:59:59Z" + + logger.info( + f"Google Calendar query for connector {connector_id}: " + f"(start_date={start_date_str}, end_date={end_date_str})" + ) events, error = await composio_connector.list_calendar_events( time_min=time_min, @@ -730,6 +920,9 @@ async def _index_composio_google_calendar( await task_logger.log_task_success( log_entry, success_msg, {"events_count": 0} ) + # CRITICAL: Update timestamp even when no events found so Electric SQL syncs and UI shows indexed status + await update_connector_last_indexed(session, connector, update_last_indexed) + await session.commit() return 0, None # Return None (not error) when no items found - this is success with 0 items logger.info(f"Found {len(events)} Google Calendar events to index via Composio") @@ -814,6 +1007,13 @@ async def _index_composio_google_calendar( existing_document.updated_at = get_current_timestamp() documents_indexed += 1 + + # Batch commit every 10 documents + if documents_indexed % 10 == 0: + logger.info( + f"Committing batch: {documents_indexed} Google Calendar events processed so far" + ) + await session.commit() continue # Create new document @@ -863,7 +1063,11 @@ async def _index_composio_google_calendar( session.add(document) documents_indexed += 1 + # Batch commit every 10 documents if documents_indexed % 10 == 0: + logger.info( + f"Committing batch: {documents_indexed} Google Calendar events processed so far" + ) await session.commit() except Exception as e: @@ -871,10 +1075,19 @@ async def _index_composio_google_calendar( documents_skipped += 1 continue - if documents_indexed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) + # Final commit to ensure all documents are persisted (safety net) + # This matches the pattern used in non-Composio Gmail indexer + logger.info( + f"Final commit: Total {documents_indexed} Google Calendar events processed" + ) await session.commit() + logger.info( + "Successfully committed all Composio Google Calendar document changes to database" + ) await task_logger.log_task_success( log_entry, From e6a4ac7c9cd14c3bcae4bbeb91b7b58abd538b80 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 23 Jan 2026 04:56:15 +0530 Subject: [PATCH 04/28] fix: change animation from spring to tween for sliding --- .../components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx | 2 +- .../components/layout/ui/sidebar/AllSharedChatsSidebar.tsx | 2 +- surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx index 39f1b95bc..c094ff44a 100644 --- a/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx @@ -231,7 +231,7 @@ export function AllPrivateChatsSidebar({ initial={{ x: "-100%" }} animate={{ x: 0 }} exit={{ x: "-100%" }} - transition={{ type: "spring", damping: 25, stiffness: 300 }} + transition={{ type: "tween", duration: 0.3, ease: 
"easeOut" }} className="fixed inset-y-0 left-0 z-70 w-80 bg-background shadow-xl flex flex-col pointer-events-auto isolate" role="dialog" aria-modal="true" diff --git a/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx index 8dd593945..76dbf1aad 100644 --- a/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx @@ -231,7 +231,7 @@ export function AllSharedChatsSidebar({ initial={{ x: "-100%" }} animate={{ x: 0 }} exit={{ x: "-100%" }} - transition={{ type: "spring", damping: 25, stiffness: 300 }} + transition={{ type: "tween", duration: 0.3, ease: "easeOut" }} className="fixed inset-y-0 left-0 z-70 w-80 bg-background shadow-xl flex flex-col pointer-events-auto isolate" role="dialog" aria-modal="true" diff --git a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx index 166d77eca..a3fd3ea14 100644 --- a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx @@ -446,7 +446,7 @@ export function InboxSidebar({ initial={{ x: "-100%" }} animate={{ x: 0 }} exit={{ x: "-100%" }} - transition={{ type: "spring", damping: 25, stiffness: 300 }} + transition={{ type: "tween", duration: 0.3, ease: "easeOut" }} className="fixed inset-y-0 left-0 z-70 w-90 bg-background shadow-xl flex flex-col pointer-events-auto isolate" role="dialog" aria-modal="true" From 7ec7ed5c3b6dde85127e8809d7c07c47fe62fd87 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 23 Jan 2026 05:17:28 +0530 Subject: [PATCH 05/28] feat: enhance Composio Google Drive integration with folder and file selection - Added a new endpoint to list folders and files in a user's Composio Google Drive, supporting hierarchical structure. - Implemented UI components for selecting specific folders and files to index, improving user control over indexing options. - Introduced indexing options for maximum files per folder and inclusion of subfolders, allowing for customizable indexing behavior. - Enhanced error handling and logging for Composio Drive operations, ensuring better visibility into issues during file retrieval and indexing. - Updated the Composio configuration component to reflect new selection capabilities and indexing options. 
--- .../app/routes/composio_routes.py | 122 ++++++ .../routes/search_source_connectors_routes.py | 40 +- .../app/services/composio_service.py | 6 +- .../app/tasks/composio_indexer.py | 195 +++++++++- .../components/composio-config.tsx | 294 +++++++++++++- .../views/connector-edit-view.tsx | 7 +- .../hooks/use-connector-dialog.ts | 8 +- .../connectors/composio-drive-folder-tree.tsx | 365 ++++++++++++++++++ .../hooks/use-composio-drive-folders.ts | 29 ++ .../lib/apis/connectors-api.service.ts | 23 ++ surfsense_web/lib/query-client/cache-keys.ts | 4 + 11 files changed, 1069 insertions(+), 24 deletions(-) create mode 100644 surfsense_web/components/connectors/composio-drive-folder-tree.tsx create mode 100644 surfsense_web/hooks/use-composio-drive-folders.ts diff --git a/surfsense_backend/app/routes/composio_routes.py b/surfsense_backend/app/routes/composio_routes.py index 77891fc88..25e545dfb 100644 --- a/surfsense_backend/app/routes/composio_routes.py +++ b/surfsense_backend/app/routes/composio_routes.py @@ -8,6 +8,7 @@ Endpoints: - GET /composio/toolkits - List available Composio toolkits - GET /auth/composio/connector/add - Initiate OAuth for a specific toolkit - GET /auth/composio/connector/callback - Handle OAuth callback +- GET /connectors/{connector_id}/composio-drive/folders - List folders/files for Composio Google Drive """ import asyncio @@ -369,3 +370,124 @@ async def composio_callback( raise HTTPException( status_code=500, detail=f"Failed to complete Composio OAuth: {e!s}" ) from e + + +@router.get("/connectors/{connector_id}/composio-drive/folders") +async def list_composio_drive_folders( + connector_id: int, + parent_id: str | None = None, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """ + List folders AND files in user's Google Drive via Composio with hierarchical support. + + This is called at index time from the manage connector page to display + the complete file system (folders and files). Only folders are selectable. + + Args: + connector_id: ID of the Composio Google Drive connector + parent_id: Optional parent folder ID to list contents (None for root) + + Returns: + JSON with list of items: { + "items": [ + {"id": str, "name": str, "mimeType": str, "isFolder": bool, ...}, + ... + ] + } + """ + if not ComposioService.is_enabled(): + raise HTTPException( + status_code=503, + detail="Composio integration is not enabled.", + ) + + try: + # Get connector and verify ownership + result = await session.execute( + select(SearchSourceConnector).filter( + SearchSourceConnector.id == connector_id, + SearchSourceConnector.user_id == user.id, + SearchSourceConnector.connector_type + == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, + ) + ) + connector = result.scalars().first() + + if not connector: + raise HTTPException( + status_code=404, + detail="Composio Google Drive connector not found or access denied", + ) + + # Get Composio connected account ID from config + composio_connected_account_id = connector.config.get("composio_connected_account_id") + if not composio_connected_account_id: + raise HTTPException( + status_code=400, + detail="Composio connected account not found. 
Please reconnect the connector.", + ) + + # Initialize Composio service and fetch files + service = ComposioService() + entity_id = f"surfsense_{user.id}" + + # Fetch files/folders from Composio Google Drive + files, next_token, error = await service.get_drive_files( + connected_account_id=composio_connected_account_id, + entity_id=entity_id, + folder_id=parent_id, + page_size=100, + ) + + if error: + logger.error(f"Failed to list Composio Drive files: {error}") + raise HTTPException( + status_code=500, detail=f"Failed to list folder contents: {error}" + ) + + # Transform files to match the expected format with isFolder field + items = [] + for file_info in files: + file_id = file_info.get("id", "") or file_info.get("fileId", "") + file_name = file_info.get("name", "") or file_info.get("fileName", "") or "Untitled" + mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "") + + if not file_id: + continue + + is_folder = mime_type == "application/vnd.google-apps.folder" + + items.append({ + "id": file_id, + "name": file_name, + "mimeType": mime_type, + "isFolder": is_folder, + "parents": file_info.get("parents", []), + "size": file_info.get("size"), + "iconLink": file_info.get("iconLink"), + }) + + # Sort: folders first, then files, both alphabetically + folders = sorted([item for item in items if item["isFolder"]], key=lambda x: x["name"].lower()) + files_list = sorted([item for item in items if not item["isFolder"]], key=lambda x: x["name"].lower()) + items = folders + files_list + + folder_count = len(folders) + file_count = len(files_list) + + logger.info( + f"✅ Listed {len(items)} total items ({folder_count} folders, {file_count} files) for Composio connector {connector_id}" + + (f" in folder {parent_id}" if parent_id else " in ROOT") + ) + + return {"items": items} + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error listing Composio Drive contents: {e!s}", exc_info=True) + raise HTTPException( + status_code=500, detail=f"Failed to list Drive contents: {e!s}" + ) from e diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 1578ad0d5..89cdd9f95 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -897,8 +897,46 @@ async def index_connector_content( ) response_message = "Web page indexing started in the background." 
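+        # Unlike the other Composio connectors handled below, Google Drive accepts an
+        # optional drive_items selection payload and persists it into connector.config,
+        # so scheduled re-index runs reuse the same folder/file selection.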
+ elif connector.connector_type == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: + from app.tasks.celery_tasks.connector_tasks import ( + index_composio_connector_task, + ) + + # For Composio Google Drive, if drive_items is provided, update connector config + # This allows the UI to pass folder/file selection like the regular Google Drive connector + if drive_items and drive_items.has_items(): + # Update connector config with the selected folders/files + config = connector.config or {} + config["selected_folders"] = [{"id": f.id, "name": f.name} for f in drive_items.folders] + config["selected_files"] = [{"id": f.id, "name": f.name} for f in drive_items.files] + if drive_items.indexing_options: + config["indexing_options"] = { + "max_files_per_folder": drive_items.indexing_options.max_files_per_folder, + "incremental_sync": drive_items.indexing_options.incremental_sync, + "include_subfolders": drive_items.indexing_options.include_subfolders, + } + connector.config = config + from sqlalchemy.orm.attributes import flag_modified + flag_modified(connector, "config") + await session.commit() + await session.refresh(connector) + + logger.info( + f"Triggering Composio Google Drive indexing for connector {connector_id} into search space {search_space_id}, " + f"folders: {len(drive_items.folders)}, files: {len(drive_items.files)}" + ) + else: + logger.info( + f"Triggering Composio Google Drive indexing for connector {connector_id} into search space {search_space_id} " + f"using existing config (from {indexing_from} to {indexing_to})" + ) + + index_composio_connector_task.delay( + connector_id, search_space_id, str(user.id), indexing_from, indexing_to + ) + response_message = "Composio Google Drive indexing started in the background." + elif connector.connector_type in [ - SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR, SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, ]: diff --git a/surfsense_backend/app/services/composio_service.py b/surfsense_backend/app/services/composio_service.py index e32cbf8a0..5a6148533 100644 --- a/surfsense_backend/app/services/composio_service.py +++ b/surfsense_backend/app/services/composio_service.py @@ -397,7 +397,11 @@ class ComposioService: "page_size": min(page_size, 100), } if folder_id: - params["folder_id"] = folder_id + # List contents of a specific folder (exclude shortcuts - we don't have access to them) + params["q"] = f"'{folder_id}' in parents and trashed = false and mimeType != 'application/vnd.google-apps.shortcut'" + else: + # List root-level items only (My Drive root), exclude shortcuts + params["q"] = "'root' in parents and trashed = false and mimeType != 'application/vnd.google-apps.shortcut'" if page_token: params["page_token"] = page_token diff --git a/surfsense_backend/app/tasks/composio_indexer.py b/surfsense_backend/app/tasks/composio_indexer.py index c9cd74234..f568d4134 100644 --- a/surfsense_backend/app/tasks/composio_indexer.py +++ b/surfsense_backend/app/tasks/composio_indexer.py @@ -252,37 +252,123 @@ async def _index_composio_google_drive( update_last_indexed: bool = True, max_items: int = 1000, ) -> tuple[int, str]: - """Index Google Drive files via Composio.""" + """Index Google Drive files via Composio. 
+ + Supports folder/file selection via connector config: + - selected_folders: List of {id, name} for folders to index + - selected_files: List of {id, name} for individual files to index + - indexing_options: {max_files_per_folder, incremental_sync, include_subfolders} + """ try: composio_connector = ComposioConnector(session, connector_id) + connector_config = await composio_connector.get_config() + + # Get folder/file selection configuration + selected_folders = connector_config.get("selected_folders", []) + selected_files = connector_config.get("selected_files", []) + indexing_options = connector_config.get("indexing_options", {}) + + max_files_per_folder = indexing_options.get("max_files_per_folder", 100) + include_subfolders = indexing_options.get("include_subfolders", True) await task_logger.log_task_progress( log_entry, f"Fetching Google Drive files via Composio for connector {connector_id}", - {"stage": "fetching_files"}, + {"stage": "fetching_files", "selected_folders": len(selected_folders), "selected_files": len(selected_files)}, ) - # Fetch files all_files = [] - page_token = None - while len(all_files) < max_items: - files, next_token, error = await composio_connector.list_drive_files( - page_token=page_token, - page_size=min(100, max_items - len(all_files)), - ) + # If specific folders/files are selected, fetch from those + if selected_folders or selected_files: + # Fetch files from selected folders + for folder in selected_folders: + folder_id = folder.get("id") + folder_name = folder.get("name", "Unknown") + + if not folder_id: + continue + + # Handle special case for "root" folder + actual_folder_id = None if folder_id == "root" else folder_id + + logger.info(f"Fetching files from folder: {folder_name} ({folder_id})") + + # Fetch files from this folder + folder_files = [] + page_token = None + + while len(folder_files) < max_files_per_folder: + files, next_token, error = await composio_connector.list_drive_files( + folder_id=actual_folder_id, + page_token=page_token, + page_size=min(100, max_files_per_folder - len(folder_files)), + ) - if error: - await task_logger.log_task_failure( - log_entry, f"Failed to fetch Drive files: {error}", {} + if error: + logger.warning(f"Failed to fetch files from folder {folder_name}: {error}") + break + + # Process files + for file_info in files: + mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "") + + # If it's a folder and include_subfolders is enabled, recursively fetch + if mime_type == "application/vnd.google-apps.folder": + if include_subfolders: + # Add subfolder files recursively + subfolder_files = await _fetch_folder_files_recursively( + composio_connector, + file_info.get("id"), + max_files=max_files_per_folder, + current_count=len(folder_files), + ) + folder_files.extend(subfolder_files) + else: + folder_files.append(file_info) + + if not next_token: + break + page_token = next_token + + all_files.extend(folder_files[:max_files_per_folder]) + logger.info(f"Found {len(folder_files)} files in folder {folder_name}") + + # Add specifically selected files + for selected_file in selected_files: + file_id = selected_file.get("id") + file_name = selected_file.get("name", "Unknown") + + if not file_id: + continue + + # Add file info (we'll fetch content later during indexing) + all_files.append({ + "id": file_id, + "name": file_name, + "mimeType": "", # Will be determined later + }) + else: + # No selection specified - fetch all files (original behavior) + page_token = None + + while len(all_files) < max_items: 
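+                # Page through the full Drive listing; each call returns at most 100
+                # items and next_token is None on the last page, which ends the loop.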
+ files, next_token, error = await composio_connector.list_drive_files( + page_token=page_token, + page_size=min(100, max_items - len(all_files)), ) - return 0, f"Failed to fetch Drive files: {error}" - all_files.extend(files) + if error: + await task_logger.log_task_failure( + log_entry, f"Failed to fetch Drive files: {error}", {} + ) + return 0, f"Failed to fetch Drive files: {error}" - if not next_token: - break - page_token = next_token + all_files.extend(files) + + if not next_token: + break + page_token = next_token if not all_files: success_msg = "No Google Drive files found" @@ -479,6 +565,81 @@ async def _index_composio_google_drive( return 0, f"Failed to index Google Drive via Composio: {e!s}" +async def _fetch_folder_files_recursively( + composio_connector: ComposioConnector, + folder_id: str, + max_files: int = 100, + current_count: int = 0, + depth: int = 0, + max_depth: int = 10, +) -> list[dict[str, Any]]: + """ + Recursively fetch files from a Google Drive folder via Composio. + + Args: + composio_connector: The Composio connector instance + folder_id: Google Drive folder ID + max_files: Maximum number of files to fetch + current_count: Current number of files already fetched + depth: Current recursion depth + max_depth: Maximum recursion depth to prevent infinite loops + + Returns: + List of file info dictionaries + """ + if depth >= max_depth: + logger.warning(f"Max recursion depth reached for folder {folder_id}") + return [] + + if current_count >= max_files: + return [] + + all_files = [] + page_token = None + + try: + while len(all_files) + current_count < max_files: + files, next_token, error = await composio_connector.list_drive_files( + folder_id=folder_id, + page_token=page_token, + page_size=min(100, max_files - len(all_files) - current_count), + ) + + if error: + logger.warning(f"Error fetching files from subfolder {folder_id}: {error}") + break + + for file_info in files: + mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "") + + if mime_type == "application/vnd.google-apps.folder": + # Recursively fetch from subfolders + subfolder_files = await _fetch_folder_files_recursively( + composio_connector, + file_info.get("id"), + max_files=max_files, + current_count=current_count + len(all_files), + depth=depth + 1, + max_depth=max_depth, + ) + all_files.extend(subfolder_files) + else: + all_files.append(file_info) + + if len(all_files) + current_count >= max_files: + break + + if not next_token: + break + page_token = next_token + + return all_files[:max_files - current_count] + + except Exception as e: + logger.error(f"Error in recursive folder fetch: {e!s}") + return all_files + + async def _process_gmail_message_batch( session: AsyncSession, messages: list[dict[str, Any]], diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx index a96f906fe..255d0cef4 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx @@ -1,7 +1,20 @@ "use client"; +import { File, FileSpreadsheet, FileText, FolderClosed, Image, Presentation } from "lucide-react"; import type { FC } from "react"; +import { useEffect, useState } from "react"; +import { ComposioDriveFolderTree } from "@/components/connectors/composio-drive-folder-tree"; 
import { Badge } from "@/components/ui/badge"; +import { Button } from "@/components/ui/button"; +import { Label } from "@/components/ui/label"; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from "@/components/ui/select"; +import { Switch } from "@/components/ui/switch"; import type { SearchSourceConnector } from "@/contracts/types/connector.types"; import { cn } from "@/lib/utils"; @@ -11,11 +24,134 @@ interface ComposioConfigProps { onNameChange?: (name: string) => void; } -export const ComposioConfig: FC = ({ connector }) => { +interface SelectedFolder { + id: string; + name: string; +} + +interface IndexingOptions { + max_files_per_folder: number; + incremental_sync: boolean; + include_subfolders: boolean; +} + +const DEFAULT_INDEXING_OPTIONS: IndexingOptions = { + max_files_per_folder: 100, + incremental_sync: true, + include_subfolders: true, +}; + +// Helper to get appropriate icon for file type based on file name +function getFileIconFromName(fileName: string, className: string = "size-3.5 shrink-0") { + const lowerName = fileName.toLowerCase(); + // Spreadsheets + if ( + lowerName.endsWith(".xlsx") || + lowerName.endsWith(".xls") || + lowerName.endsWith(".csv") || + lowerName.includes("spreadsheet") + ) { + return ; + } + // Presentations + if ( + lowerName.endsWith(".pptx") || + lowerName.endsWith(".ppt") || + lowerName.includes("presentation") + ) { + return ; + } + // Documents (word, text only - not PDF) + if ( + lowerName.endsWith(".docx") || + lowerName.endsWith(".doc") || + lowerName.endsWith(".txt") || + lowerName.includes("document") || + lowerName.includes("word") || + lowerName.includes("text") + ) { + return ; + } + // Images + if ( + lowerName.endsWith(".png") || + lowerName.endsWith(".jpg") || + lowerName.endsWith(".jpeg") || + lowerName.endsWith(".gif") || + lowerName.endsWith(".webp") || + lowerName.endsWith(".svg") + ) { + return ; + } + // Default (including PDF) + return ; +} + +export const ComposioConfig: FC = ({ connector, onConfigChange }) => { const toolkitId = connector.config?.toolkit_id as string; const isIndexable = connector.config?.is_indexable as boolean; const composioAccountId = connector.config?.composio_connected_account_id as string; + // Check if this is a Google Drive Composio connector + const isGoogleDrive = toolkitId === "googledrive"; + + // Initialize with existing selected folders and files from connector config + const existingFolders = + (connector.config?.selected_folders as SelectedFolder[] | undefined) || []; + const existingFiles = (connector.config?.selected_files as SelectedFolder[] | undefined) || []; + const existingIndexingOptions = + (connector.config?.indexing_options as IndexingOptions | undefined) || DEFAULT_INDEXING_OPTIONS; + + const [selectedFolders, setSelectedFolders] = useState(existingFolders); + const [selectedFiles, setSelectedFiles] = useState(existingFiles); + const [showFolderSelector, setShowFolderSelector] = useState(false); + const [indexingOptions, setIndexingOptions] = useState(existingIndexingOptions); + + // Update selected folders and files when connector config changes + useEffect(() => { + const folders = (connector.config?.selected_folders as SelectedFolder[] | undefined) || []; + const files = (connector.config?.selected_files as SelectedFolder[] | undefined) || []; + const options = + (connector.config?.indexing_options as IndexingOptions | undefined) || + DEFAULT_INDEXING_OPTIONS; + setSelectedFolders(folders); + setSelectedFiles(files); + 
setIndexingOptions(options); + }, [connector.config]); + + const updateConfig = ( + folders: SelectedFolder[], + files: SelectedFolder[], + options: IndexingOptions + ) => { + if (onConfigChange) { + onConfigChange({ + ...connector.config, + selected_folders: folders, + selected_files: files, + indexing_options: options, + }); + } + }; + + const handleSelectFolders = (folders: SelectedFolder[]) => { + setSelectedFolders(folders); + updateConfig(folders, selectedFiles, indexingOptions); + }; + + const handleSelectFiles = (files: SelectedFolder[]) => { + setSelectedFiles(files); + updateConfig(selectedFolders, files, indexingOptions); + }; + + const handleIndexingOptionChange = (key: keyof IndexingOptions, value: number | boolean) => { + const newOptions = { ...indexingOptions, [key]: value }; + setIndexingOptions(newOptions); + updateConfig(selectedFolders, selectedFiles, newOptions); + }; + + const totalSelected = selectedFolders.length + selectedFiles.length; + return (
 			{/* Connection Details */}
@@ -52,6 +188,162 @@ export const ComposioConfig: FC<ComposioConfigProps> = ({ connector }) => {
 				)}
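+			{/* Selections made below are persisted via onConfigChange into connector.config,
+			    e.g. (hypothetical IDs):
+			      selected_folders: [{ id: "1aBcD", name: "Reports" }]
+			      selected_files: [{ id: "9xYz0", name: "notes.pdf" }]
+			      indexing_options: { max_files_per_folder: 100, incremental_sync: true, include_subfolders: true } */}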
+ + {/* Google Drive specific: Folder & File Selection */} + {isGoogleDrive && isIndexable && ( + <> +
+
+

Folder & File Selection

+

+ Select specific folders and/or individual files to index. +

+
+ + {totalSelected > 0 && ( +
+

+									Selected {totalSelected} item{totalSelected > 1 ? "s" : ""}: {(() => {
+										const parts: string[] = [];
+										if (selectedFolders.length > 0) {
+											parts.push(
+												`${selectedFolders.length} folder${selectedFolders.length > 1 ? "s" : ""}`
+											);
+										}
+										if (selectedFiles.length > 0) {
+											parts.push(`${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}`);
+										}
+										return parts.length > 0 ? `(${parts.join(", ")})` : "";
+									})()}

+
+ {selectedFolders.map((folder) => ( +
+								<Badge key={folder.id}>
+									<FolderClosed className="size-3.5 shrink-0" />
+									{folder.name}
+								</Badge>
+ ))} + {selectedFiles.map((file) => ( +
+								<Badge key={file.id}>
+									{getFileIconFromName(file.name)}
+									{file.name}
+								</Badge>
+ ))} +
+
+ )} + + {showFolderSelector ? ( +
+ + +
+ ) : ( + + )} +
+ + {/* Indexing Options */} +
+
+

Indexing Options

+

+ Configure how files are indexed from your Google Drive. +

+
+ + {/* Max files per folder */} +
+
+
+ +

+ Maximum number of files to index from each folder +

+
+ +
+
+ + {/* Include subfolders toggle */} +
+
+ +

+ Recursively index files in subfolders of selected folders +

+
+								<Switch
+									checked={indexingOptions.include_subfolders}
+									onCheckedChange={(checked) =>
+										handleIndexingOptionChange("include_subfolders", checked)
+									}
+								/>
+
+ + )}
); }; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx index 66afd84a5..71258a519 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx @@ -224,8 +224,11 @@ export const ConnectorEditView: FC = ({ {/* Periodic sync - shown for all indexable connectors */} {(() => { - // Check if Google Drive has folders/files selected + // Check if Google Drive (regular or Composio) has folders/files selected const isGoogleDrive = connector.connector_type === "GOOGLE_DRIVE_CONNECTOR"; + const isComposioGoogleDrive = + connector.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR"; + const requiresFolderSelection = isGoogleDrive || isComposioGoogleDrive; const selectedFolders = (connector.config?.selected_folders as | Array<{ id: string; name: string }> @@ -235,7 +238,7 @@ export const ConnectorEditView: FC = ({ | Array<{ id: string; name: string }> | undefined) || []; const hasItemsSelected = selectedFolders.length > 0 || selectedFiles.length > 0; - const isDisabled = isGoogleDrive && !hasItemsSelected; + const isDisabled = requiresFolderSelection && !hasItemsSelected; return ( { return; } - // Prevent periodic indexing for Google Drive without folders/files selected - if (periodicEnabled && editingConnector.connector_type === "GOOGLE_DRIVE_CONNECTOR") { + // Prevent periodic indexing for Google Drive (regular or Composio) without folders/files selected + if ( + periodicEnabled && + (editingConnector.connector_type === "GOOGLE_DRIVE_CONNECTOR" || + editingConnector.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR") + ) { const selectedFolders = (connectorConfig || editingConnector.config)?.selected_folders as | Array<{ id: string; name: string }> | undefined; diff --git a/surfsense_web/components/connectors/composio-drive-folder-tree.tsx b/surfsense_web/components/connectors/composio-drive-folder-tree.tsx new file mode 100644 index 000000000..72c36edd5 --- /dev/null +++ b/surfsense_web/components/connectors/composio-drive-folder-tree.tsx @@ -0,0 +1,365 @@ +"use client"; + +import { + ChevronDown, + ChevronRight, + File, + FileSpreadsheet, + FileText, + FolderClosed, + FolderOpen, + HardDrive, + Image, + Loader2, + Presentation, +} from "lucide-react"; +import { useState } from "react"; +import { Checkbox } from "@/components/ui/checkbox"; +import { ScrollArea } from "@/components/ui/scroll-area"; +import { useComposioDriveFolders } from "@/hooks/use-composio-drive-folders"; +import { connectorsApiService } from "@/lib/apis/connectors-api.service"; +import { cn } from "@/lib/utils"; + +interface DriveItem { + id: string; + name: string; + mimeType: string; + isFolder: boolean; + parents?: string[]; + size?: number; + iconLink?: string; +} + +interface ItemTreeNode { + item: DriveItem; + children: DriveItem[] | null; // null = not loaded, [] = loaded but empty + isExpanded: boolean; + isLoading: boolean; +} + +interface SelectedFolder { + id: string; + name: string; +} + +interface ComposioDriveFolderTreeProps { + connectorId: number; + selectedFolders: SelectedFolder[]; + onSelectFolders: (folders: SelectedFolder[]) => void; + selectedFiles?: SelectedFolder[]; + onSelectFiles?: (files: SelectedFolder[]) => void; +} + +// Helper to get appropriate icon for file type 
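+// Illustrative examples (Google Workspace MIME types contain these substrings):
+//   getFileIcon("application/vnd.google-apps.spreadsheet")  -> FileSpreadsheet
+//   getFileIcon("application/vnd.google-apps.presentation") -> Presentation
+//   getFileIcon("image/png")                                 -> Image
+//   getFileIcon("application/pdf")                           -> generic File icon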
+function getFileIcon(mimeType: string, className: string = "h-4 w-4") { + if (mimeType.includes("spreadsheet") || mimeType.includes("excel")) { + return ; + } + if (mimeType.includes("presentation") || mimeType.includes("powerpoint")) { + return ; + } + if (mimeType.includes("document") || mimeType.includes("word") || mimeType.includes("text")) { + return ; + } + if (mimeType.includes("image")) { + return ; + } + return ; +} + +export function ComposioDriveFolderTree({ + connectorId, + selectedFolders, + onSelectFolders, + selectedFiles = [], + onSelectFiles = () => {}, +}: ComposioDriveFolderTreeProps) { + const [itemStates, setItemStates] = useState>(new Map()); + + const { data: rootData, isLoading: isLoadingRoot } = useComposioDriveFolders({ + connectorId, + }); + + const rootItems = rootData?.items || []; + + const isFolderSelected = (folderId: string): boolean => { + return selectedFolders.some((f) => f.id === folderId); + }; + + const isFileSelected = (fileId: string): boolean => { + return selectedFiles.some((f) => f.id === fileId); + }; + + const toggleFolderSelection = (folderId: string, folderName: string) => { + if (isFolderSelected(folderId)) { + onSelectFolders(selectedFolders.filter((f) => f.id !== folderId)); + } else { + onSelectFolders([...selectedFolders, { id: folderId, name: folderName }]); + } + }; + + const toggleFileSelection = (fileId: string, fileName: string) => { + if (isFileSelected(fileId)) { + onSelectFiles(selectedFiles.filter((f) => f.id !== fileId)); + } else { + onSelectFiles([...selectedFiles, { id: fileId, name: fileName }]); + } + }; + + /** + * Find an item by ID across all loaded items (root and nested). + */ + const findItem = (itemId: string): DriveItem | undefined => { + const state = itemStates.get(itemId); + if (state?.item) return state.item; + + const rootItem = rootItems.find((item) => item.id === itemId); + if (rootItem) return rootItem; + + for (const [, nodeState] of itemStates) { + if (nodeState.children) { + const found = nodeState.children.find((child) => child.id === itemId); + if (found) return found; + } + } + + return undefined; + }; + + /** + * Load and display contents of a specific folder. + */ + const loadFolderContents = async (folderId: string) => { + try { + setItemStates((prev) => { + const newMap = new Map(prev); + const existing = newMap.get(folderId); + if (existing) { + newMap.set(folderId, { ...existing, isLoading: true }); + } else { + const item = findItem(folderId); + if (item) { + newMap.set(folderId, { + item, + children: null, + isExpanded: false, + isLoading: true, + }); + } + } + return newMap; + }); + + const data = await connectorsApiService.listComposioDriveFolders({ + connector_id: connectorId, + parent_id: folderId, + }); + const items = data.items || []; + + setItemStates((prev) => { + const newMap = new Map(prev); + const existing = newMap.get(folderId); + const item = existing?.item || findItem(folderId); + + if (item) { + newMap.set(folderId, { + item, + children: items, + isExpanded: true, + isLoading: false, + }); + } else { + console.error(`Could not find item for folderId: ${folderId}`); + } + return newMap; + }); + } catch (error) { + console.error("Error loading folder contents:", error); + setItemStates((prev) => { + const newMap = new Map(prev); + const existing = newMap.get(folderId); + if (existing) { + newMap.set(folderId, { ...existing, isLoading: false }); + } + return newMap; + }); + } + }; + + /** + * Toggle folder expand/collapse state. 
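+	 * The first expand lazily loads children via loadFolderContents (children === null
+	 * means "not loaded yet"); later toggles just flip isExpanded on the cached node.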
+ */ + const toggleFolder = async (item: DriveItem) => { + if (!item.isFolder) return; + + const state = itemStates.get(item.id); + + if (!state || state.children === null) { + await loadFolderContents(item.id); + } else { + setItemStates((prev) => { + const newMap = new Map(prev); + newMap.set(item.id, { + ...state, + isExpanded: !state.isExpanded, + }); + return newMap; + }); + } + }; + + /** + * Render a single item (folder or file) with its children. + */ + const renderItem = (item: DriveItem, level: number = 0) => { + const state = itemStates.get(item.id); + const isExpanded = state?.isExpanded || false; + const isLoading = state?.isLoading || false; + const children = state?.children; + const isFolder = item.isFolder; + const isSelected = isFolder ? isFolderSelected(item.id) : isFileSelected(item.id); + + const childFolders = children?.filter((c) => c.isFolder) || []; + const childFiles = children?.filter((c) => !c.isFolder) || []; + + const indentSize = 0.75; // Smaller indent for mobile + + return ( +
+
+ {isFolder ? ( + + ) : ( + + )} + + { + if (isFolder) { + toggleFolderSelection(item.id, item.name); + } else { + toggleFileSelection(item.id, item.name); + } + }} + className="shrink-0 h-3.5 w-3.5 sm:h-4 sm:w-4 border-slate-400/20 dark:border-white/20" + onClick={(e) => e.stopPropagation()} + /> + +
+							{isFolder ? (
+								isExpanded ? (
+									<FolderOpen className="h-3 w-3 sm:h-4 sm:w-4" />
+								) : (
+									<FolderClosed className="h-3 w-3 sm:h-4 sm:w-4" />
+								)
+							) : (
+								getFileIcon(item.mimeType, "h-3 w-3 sm:h-4 sm:w-4")
+							)}
+ + {isFolder ? ( + + ) : ( + + {item.name} + + )} +
+ + {isExpanded && isFolder && children && ( +
+ {childFolders.map((child) => renderItem(child, level + 1))} + {childFiles.map((child) => renderItem(child, level + 1))} + + {children.length === 0 && ( +
+ Empty folder +
+ )} +
+ )} +
+ ); + }; + + return ( +
+ +
+
+
+ toggleFolderSelection("root", "My Drive")} + className="shrink-0 h-3.5 w-3.5 sm:h-4 sm:w-4 border-slate-400/20 dark:border-white/20" + /> + + +
+
+ + {isLoadingRoot && ( +
+ +
+ )} + +
+ {!isLoadingRoot && rootItems.map((item) => renderItem(item, 0))} +
+ + {!isLoadingRoot && rootItems.length === 0 && ( +
+ No files or folders found in your Google Drive +
+ )} +
+
+
+ ); +} + diff --git a/surfsense_web/hooks/use-composio-drive-folders.ts b/surfsense_web/hooks/use-composio-drive-folders.ts new file mode 100644 index 000000000..af8da1a81 --- /dev/null +++ b/surfsense_web/hooks/use-composio-drive-folders.ts @@ -0,0 +1,29 @@ +import { useQuery } from "@tanstack/react-query"; +import { connectorsApiService } from "@/lib/apis/connectors-api.service"; +import { cacheKeys } from "@/lib/query-client/cache-keys"; + +interface UseComposioDriveFoldersOptions { + connectorId: number; + parentId?: string; + enabled?: boolean; +} + +export function useComposioDriveFolders({ + connectorId, + parentId, + enabled = true, +}: UseComposioDriveFoldersOptions) { + return useQuery({ + queryKey: cacheKeys.connectors.composioDrive.folders(connectorId, parentId), + queryFn: async () => { + return connectorsApiService.listComposioDriveFolders({ + connector_id: connectorId, + parent_id: parentId, + }); + }, + enabled: enabled && !!connectorId, + staleTime: 5 * 60 * 1000, // 5 minutes + retry: 2, + }); +} + diff --git a/surfsense_web/lib/apis/connectors-api.service.ts b/surfsense_web/lib/apis/connectors-api.service.ts index 0e4f7f4d5..567db38de 100644 --- a/surfsense_web/lib/apis/connectors-api.service.ts +++ b/surfsense_web/lib/apis/connectors-api.service.ts @@ -233,6 +233,29 @@ class ConnectorsApiService { ); }; + /** + * List Composio Google Drive folders and files + */ + listComposioDriveFolders = async (request: ListGoogleDriveFoldersRequest) => { + const parsedRequest = listGoogleDriveFoldersRequest.safeParse(request); + + if (!parsedRequest.success) { + console.error("Invalid request:", parsedRequest.error); + + const errorMessage = parsedRequest.error.issues.map((issue) => issue.message).join(", "); + throw new ValidationError(`Invalid request: ${errorMessage}`); + } + + const { connector_id, parent_id } = parsedRequest.data; + + const queryParams = parent_id ? `?parent_id=${encodeURIComponent(parent_id)}` : ""; + + return baseApiService.get( + `/api/v1/connectors/${connector_id}/composio-drive/folders${queryParams}`, + listGoogleDriveFoldersResponse + ); + }; + // ============================================================================= // MCP Connector Methods // ============================================================================= diff --git a/surfsense_web/lib/query-client/cache-keys.ts b/surfsense_web/lib/query-client/cache-keys.ts index 72f2bbd54..8ffc3b786 100644 --- a/surfsense_web/lib/query-client/cache-keys.ts +++ b/surfsense_web/lib/query-client/cache-keys.ts @@ -71,6 +71,10 @@ export const cacheKeys = { folders: (connectorId: number, parentId?: string) => ["connectors", "google-drive", connectorId, "folders", parentId] as const, }, + composioDrive: { + folders: (connectorId: number, parentId?: string) => + ["connectors", "composio-drive", connectorId, "folders", parentId] as const, + }, }, comments: { byMessage: (messageId: number) => ["comments", "message", messageId] as const, From 42752bbeabea23f03e34821143d769b0ec83afc2 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 23 Jan 2026 05:28:03 +0530 Subject: [PATCH 06/28] feat: improve Composio file processing and error handling - Enhanced the handling of file content from Composio, supporting both binary and text files with appropriate processing methods. - Introduced robust error logging and handling for file content extraction, ensuring better visibility into issues during processing. 
- Updated the indexing logic to accommodate new content processing methods, improving overall reliability and user feedback on errors. - Added temporary file handling for binary files to facilitate text extraction using the ETL service. --- .../routes/search_source_connectors_routes.py | 6 +- .../app/services/composio_service.py | 75 ++++- .../app/tasks/composio_indexer.py | 301 +++++++++++++++++- 3 files changed, 360 insertions(+), 22 deletions(-) diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 89cdd9f95..ed306c7bc 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -1140,7 +1140,7 @@ async def _run_indexing_with_notifications( f"Indexing completed successfully: {documents_processed} documents processed" ) - # Update notification on success + # Update notification on success (or partial success with errors) if notification: # Refresh notification to ensure it's not stale after timestamp update commit await session.refresh(notification) @@ -1148,7 +1148,7 @@ async def _run_indexing_with_notifications( session=session, notification=notification, indexed_count=documents_processed, - error_message=None, + error_message=error_or_warning, # Show errors even if some documents were indexed ) await session.commit() # Commit to ensure Electric SQL syncs the notification update elif documents_processed > 0: @@ -1172,7 +1172,7 @@ async def _run_indexing_with_notifications( session=session, notification=notification, indexed_count=documents_processed, - error_message=None, + error_message=error_or_warning, # Show errors even if some documents were indexed ) await session.commit() # Commit to ensure Electric SQL syncs the notification update else: diff --git a/surfsense_backend/app/services/composio_service.py b/surfsense_backend/app/services/composio_service.py index 5a6148533..1173cfb6a 100644 --- a/surfsense_backend/app/services/composio_service.py +++ b/surfsense_backend/app/services/composio_service.py @@ -458,11 +458,76 @@ class ComposioService: if not result.get("success"): return None, result.get("error", "Unknown error") - content = result.get("data") - if isinstance(content, str): - content = content.encode("utf-8") - - return content, None + data = result.get("data") + + # Composio GOOGLEDRIVE_DOWNLOAD_FILE returns a dict with file info + # The actual content is in "downloaded_file_content" field + if isinstance(data, dict): + # Try known Composio response fields in order of preference + content = None + + # Primary field from GOOGLEDRIVE_DOWNLOAD_FILE + if "downloaded_file_content" in data: + content = data["downloaded_file_content"] + # downloaded_file_content might itself be a dict with the actual content inside + if isinstance(content, dict): + # Try to extract actual content from nested dict + # Note: Composio nests downloaded_file_content inside another downloaded_file_content + actual_content = ( + content.get("downloaded_file_content") or + content.get("content") or + content.get("data") or + content.get("file_content") or + content.get("body") or + content.get("text") + ) + if actual_content is not None: + content = actual_content + else: + # Log structure for debugging + logger.warning(f"downloaded_file_content is dict with keys: {list(content.keys())}") + return None, f"Cannot extract content from downloaded_file_content. 
Keys: {list(content.keys())}" + # Fallback fields for compatibility + elif "content" in data: + content = data["content"] + elif "file_content" in data: + content = data["file_content"] + elif "data" in data: + content = data["data"] + + if content is None: + # Log available keys for debugging + logger.warning(f"Composio response dict keys: {list(data.keys())}") + return None, f"No file content found in Composio response. Available keys: {list(data.keys())}" + + # Convert content to bytes + if isinstance(content, str): + # Check if it's base64 encoded + import base64 + try: + # Try to decode as base64 first + content = base64.b64decode(content) + except Exception: + # If not base64, encode as UTF-8 + content = content.encode("utf-8") + elif isinstance(content, bytes): + pass # Already bytes + elif isinstance(content, dict): + # Still a dict after all extraction attempts - log structure + logger.warning(f"Content still dict after extraction: {list(content.keys())}") + return None, f"Unexpected nested content structure: {list(content.keys())}" + else: + return None, f"Unexpected content type in Composio response: {type(content).__name__}" + + return content, None + elif isinstance(data, str): + return data.encode("utf-8"), None + elif isinstance(data, bytes): + return data, None + elif data is None: + return None, "No data returned from Composio" + else: + return None, f"Unexpected data type from Composio: {type(data).__name__}" except Exception as e: logger.error(f"Failed to get Drive file content: {e!s}") diff --git a/surfsense_backend/app/tasks/composio_indexer.py b/surfsense_backend/app/tasks/composio_indexer.py index f568d4134..6f40e6d66 100644 --- a/surfsense_backend/app/tasks/composio_indexer.py +++ b/surfsense_backend/app/tasks/composio_indexer.py @@ -8,7 +8,10 @@ to avoid circular import issues with the connector_indexers package. 
""" import logging +import os +import tempfile from datetime import UTC, datetime +from pathlib import Path from typing import Any from sqlalchemy.exc import SQLAlchemyError @@ -21,6 +24,7 @@ from app.connectors.composio_connector import ComposioConnector from app.db import ( Document, DocumentType, + Log, SearchSourceConnector, SearchSourceConnectorType, ) @@ -81,6 +85,237 @@ async def update_connector_last_indexed( logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") +# Binary file extensions that need file processor +BINARY_FILE_EXTENSIONS = { + ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", + ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp", + ".zip", ".tar", ".gz", ".rar", ".7z", + ".mp3", ".mp4", ".wav", ".avi", ".mov", + ".exe", ".dll", ".so", ".bin", +} + +# Text file extensions that can be decoded as UTF-8 +TEXT_FILE_EXTENSIONS = { + ".txt", ".md", ".markdown", ".json", ".xml", ".html", ".htm", + ".css", ".js", ".ts", ".py", ".java", ".c", ".cpp", ".h", + ".yaml", ".yml", ".toml", ".ini", ".cfg", ".conf", + ".sh", ".bash", ".zsh", ".fish", + ".sql", ".csv", ".tsv", + ".rst", ".tex", ".log", +} + + +def _is_binary_file(file_name: str, mime_type: str) -> bool: + """Check if a file is binary based on extension or mime type.""" + extension = Path(file_name).suffix.lower() + + # Check extension first + if extension in BINARY_FILE_EXTENSIONS: + return True + if extension in TEXT_FILE_EXTENSIONS: + return False + + # Check mime type + if mime_type: + if mime_type.startswith(("image/", "audio/", "video/", "application/pdf")): + return True + if mime_type.startswith(("text/", "application/json", "application/xml")): + return False + # Office documents + if "spreadsheet" in mime_type or "document" in mime_type or "presentation" in mime_type: + return True + + # Default to text for unknown types + return False + + +async def _process_file_content( + content: bytes | str, + file_name: str, + file_id: str, + mime_type: str, + search_space_id: int, + user_id: str, + session: AsyncSession, + task_logger: TaskLoggingService, + log_entry: Log, + processing_errors: list[str], +) -> str: + """ + Process file content and return markdown text. + + For binary files (PDFs, images, etc.), uses Surfsense's ETL service. + For text files, decodes as UTF-8. + + Args: + content: File content as bytes or string + file_name: Name of the file + file_id: Google Drive file ID + mime_type: MIME type of the file + search_space_id: Search space ID + user_id: User ID + session: Database session + task_logger: Task logging service + log_entry: Log entry for tracking + processing_errors: List to append errors to + + Returns: + Markdown content string + """ + # Ensure content is bytes + if isinstance(content, str): + content = content.encode("utf-8") + + # Check if this is a binary file + if _is_binary_file(file_name, mime_type): + # Use ETL service for binary files (PDF, Office docs, etc.) 
+ temp_file_path = None + try: + # Get file extension + extension = Path(file_name).suffix or ".bin" + + # Write to temp file + with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file: + tmp_file.write(content) + temp_file_path = tmp_file.name + + # Use the configured ETL service to extract text + extracted_text = await _extract_text_with_etl( + temp_file_path, file_name, task_logger, log_entry + ) + + if extracted_text: + return extracted_text + else: + # Fallback if extraction fails + logger.warning(f"Could not extract text from binary file {file_name}") + return f"# {file_name}\n\n[Binary file - text extraction failed]\n\n**File ID:** {file_id}\n**Type:** {mime_type}\n" + + except Exception as e: + error_msg = f"Error processing binary file {file_name}: {e!s}" + logger.error(error_msg) + processing_errors.append(error_msg) + return f"# {file_name}\n\n[Binary file - processing error]\n\n**File ID:** {file_id}\n**Type:** {mime_type}\n" + finally: + # Cleanup temp file + if temp_file_path and os.path.exists(temp_file_path): + try: + os.unlink(temp_file_path) + except Exception as e: + logger.debug(f"Could not delete temp file {temp_file_path}: {e}") + else: + # Text file - try to decode as UTF-8 + try: + return content.decode("utf-8") + except UnicodeDecodeError: + # Try other encodings + for encoding in ["latin-1", "cp1252", "iso-8859-1"]: + try: + return content.decode(encoding) + except UnicodeDecodeError: + continue + + # If all encodings fail, treat as binary + error_msg = f"Could not decode text file {file_name} with any encoding" + logger.warning(error_msg) + processing_errors.append(error_msg) + return f"# {file_name}\n\n[File content could not be decoded]\n\n**File ID:** {file_id}\n**Type:** {mime_type}\n" + + +async def _extract_text_with_etl( + file_path: str, + file_name: str, + task_logger: TaskLoggingService, + log_entry: Log, +) -> str | None: + """ + Extract text from a file using the configured ETL service. 
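+    Dispatches on config.ETL_SERVICE ("UNSTRUCTURED", "LLAMACLOUD", or "DOCLING"),
+    matching the branches below; any other value logs a warning and returns None.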
+ + Args: + file_path: Path to the file + file_name: Name of the file + task_logger: Task logging service + log_entry: Log entry for tracking + + Returns: + Extracted text as markdown, or None if extraction fails + """ + import warnings + from logging import ERROR, getLogger + + etl_service = config.ETL_SERVICE + + try: + if etl_service == "UNSTRUCTURED": + from langchain_unstructured import UnstructuredLoader + + from app.utils.document_converters import convert_document_to_markdown + + loader = UnstructuredLoader( + file_path, + mode="elements", + post_processors=[], + languages=["eng"], + include_orig_elements=False, + include_metadata=False, + strategy="auto", + ) + + docs = await loader.aload() + if docs: + return await convert_document_to_markdown(docs) + return None + + elif etl_service == "LLAMACLOUD": + from app.tasks.document_processors.file_processors import parse_with_llamacloud_retry + + # Estimate pages (rough estimate based on file size) + file_size = os.path.getsize(file_path) + estimated_pages = max(1, file_size // (80 * 1024)) + + result = await parse_with_llamacloud_retry( + file_path=file_path, + estimated_pages=estimated_pages, + task_logger=task_logger, + log_entry=log_entry, + ) + + markdown_documents = await result.aget_markdown_documents(split_by_page=False) + if markdown_documents: + return markdown_documents[0].text + return None + + elif etl_service == "DOCLING": + from app.services.docling_service import create_docling_service + + docling_service = create_docling_service() + + # Suppress pdfminer warnings + pdfminer_logger = getLogger("pdfminer") + original_level = pdfminer_logger.level + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer") + warnings.filterwarnings("ignore", message=".*Cannot set gray non-stroke color.*") + warnings.filterwarnings("ignore", message=".*invalid float value.*") + + pdfminer_logger.setLevel(ERROR) + + try: + result = await docling_service.process_document(file_path, file_name) + finally: + pdfminer_logger.setLevel(original_level) + + return result.get("content") + else: + logger.warning(f"Unknown ETL service: {etl_service}") + return None + + except Exception as e: + logger.error(f"ETL extraction failed for {file_name}: {e!s}") + return None + + # ============ Main indexer function ============ @@ -384,6 +619,7 @@ async def _index_composio_google_drive( documents_indexed = 0 documents_skipped = 0 + processing_errors = [] for file_info in all_files: try: @@ -422,11 +658,28 @@ async def _index_composio_google_drive( markdown_content = f"# {file_name}\n\n" markdown_content += f"**File ID:** {file_id}\n" markdown_content += f"**Type:** {mime_type}\n" + elif isinstance(content, dict): + # Safety check: if content is still a dict, log error and use fallback + error_msg = f"Unexpected dict content format for file {file_name}: {list(content.keys())}" + logger.error(error_msg) + processing_errors.append(error_msg) + markdown_content = f"# {file_name}\n\n" + markdown_content += f"**File ID:** {file_id}\n" + markdown_content += f"**Type:** {mime_type}\n" else: - try: - markdown_content = content.decode("utf-8") - except UnicodeDecodeError: - markdown_content = f"# {file_name}\n\n[Binary file content]\n" + # Process content based on file type + markdown_content = await _process_file_content( + content=content, + file_name=file_name, + file_id=file_id, + mime_type=mime_type, + search_space_id=search_space_id, + user_id=user_id, + session=session, + task_logger=task_logger, + 
log_entry=log_entry, + processing_errors=processing_errors, + ) content_hash = generate_content_hash(markdown_content, search_space_id) @@ -531,7 +784,9 @@ async def _index_composio_google_drive( await session.commit() except Exception as e: - logger.error(f"Error processing Drive file: {e!s}", exc_info=True) + error_msg = f"Error processing Drive file {file_name or 'unknown'}: {e!s}" + logger.error(error_msg, exc_info=True) + processing_errors.append(error_msg) documents_skipped += 1 continue @@ -549,16 +804,34 @@ async def _index_composio_google_drive( "Successfully committed all Composio Google Drive document changes to database" ) - await task_logger.log_task_success( - log_entry, - f"Successfully completed Google Drive indexing via Composio for connector {connector_id}", - { - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, - }, - ) + # If there were processing errors, return them so notification can show them + error_message = None + if processing_errors: + # Combine all errors into a single message + if len(processing_errors) == 1: + error_message = processing_errors[0] + else: + error_message = f"Failed to process {len(processing_errors)} file(s). First error: {processing_errors[0]}" + await task_logger.log_task_failure( + log_entry, + f"Completed Google Drive indexing with {len(processing_errors)} error(s) for connector {connector_id}", + { + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + "errors": processing_errors, + }, + ) + else: + await task_logger.log_task_success( + log_entry, + f"Successfully completed Google Drive indexing via Composio for connector {connector_id}", + { + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + }, + ) - return documents_indexed, None + return documents_indexed, error_message except Exception as e: logger.error(f"Failed to index Google Drive via Composio: {e!s}", exc_info=True) From 8a0b8346a5ee913a94a922ae8750072ce3b0ec11 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 23 Jan 2026 05:28:18 +0530 Subject: [PATCH 07/28] chore: ran linting --- .../74_add_composio_connector_enums.py | 4 +- .../app/connectors/composio_connector.py | 6 +- .../app/connectors/github_connector.py | 46 +- .../app/routes/composio_routes.py | 74 ++- .../routes/search_source_connectors_routes.py | 48 +- .../app/services/composio_service.py | 178 ++++--- .../app/tasks/composio_indexer.py | 441 ++++++++++++------ .../app/tasks/connector_indexers/__init__.py | 1 + .../connector_indexers/github_indexer.py | 14 +- .../components/composio-config.tsx | 8 +- .../constants/connector-popup.schemas.ts | 4 +- .../hooks/use-connector-dialog.ts | 28 +- .../tabs/all-connectors-tab.tsx | 17 +- .../connectors/composio-drive-folder-tree.tsx | 1 - .../hooks/use-composio-drive-folders.ts | 1 - 15 files changed, 583 insertions(+), 288 deletions(-) diff --git a/surfsense_backend/alembic/versions/74_add_composio_connector_enums.py b/surfsense_backend/alembic/versions/74_add_composio_connector_enums.py index cadf70cb6..2996d9d07 100644 --- a/surfsense_backend/alembic/versions/74_add_composio_connector_enums.py +++ b/surfsense_backend/alembic/versions/74_add_composio_connector_enums.py @@ -82,14 +82,14 @@ def upgrade() -> None: def downgrade() -> None: """Downgrade schema - remove Composio connector types from connector and document enums. - + Note: PostgreSQL does not support removing enum values directly. 
To properly downgrade, you would need to: 1. Delete any rows using the Composio connector type values 2. Create new enums without the Composio connector types 3. Alter the columns to use the new enums 4. Drop the old enums - + This is left as a no-op since removing enum values is complex and typically not needed in practice. """ diff --git a/surfsense_backend/app/connectors/composio_connector.py b/surfsense_backend/app/connectors/composio_connector.py index 21e339d12..b49988887 100644 --- a/surfsense_backend/app/connectors/composio_connector.py +++ b/surfsense_backend/app/connectors/composio_connector.py @@ -12,7 +12,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from app.db import SearchSourceConnector -from app.services.composio_service import ComposioService, INDEXABLE_TOOLKITS +from app.services.composio_service import INDEXABLE_TOOLKITS, ComposioService logger = logging.getLogger(__name__) @@ -271,7 +271,9 @@ class ComposioConnector: from_email = header_dict.get("from", "Unknown Sender") to_email = header_dict.get("to", "Unknown Recipient") # Composio provides messageTimestamp directly - date_str = message.get("messageTimestamp", "") or header_dict.get("date", "Unknown Date") + date_str = message.get("messageTimestamp", "") or header_dict.get( + "date", "Unknown Date" + ) # Build markdown content markdown_content = f"# {subject}\n\n" diff --git a/surfsense_backend/app/connectors/github_connector.py b/surfsense_backend/app/connectors/github_connector.py index 6f04ccdba..9d4b98c4b 100644 --- a/surfsense_backend/app/connectors/github_connector.py +++ b/surfsense_backend/app/connectors/github_connector.py @@ -58,7 +58,9 @@ class GitHubConnector: if self.token: logger.info("GitHub connector initialized with authentication token.") else: - logger.info("GitHub connector initialized without token (public repos only).") + logger.info( + "GitHub connector initialized without token (public repos only)." + ) def ingest_repository( self, @@ -95,17 +97,27 @@ class GitHubConnector: cmd = [ "gitingest", repo_url, - "--output", output_path, - "--max-size", str(max_file_size), + "--output", + output_path, + "--max-size", + str(max_file_size), # Common exclude patterns - "-e", "node_modules/*", - "-e", "vendor/*", - "-e", ".git/*", - "-e", "__pycache__/*", - "-e", "dist/*", - "-e", "build/*", - "-e", "*.lock", - "-e", "package-lock.json", + "-e", + "node_modules/*", + "-e", + "vendor/*", + "-e", + ".git/*", + "-e", + "__pycache__/*", + "-e", + "dist/*", + "-e", + "build/*", + "-e", + "*.lock", + "-e", + "package-lock.json", ] # Add branch if specified @@ -147,7 +159,9 @@ class GitHubConnector: os.unlink(output_path) if not full_content or not full_content.strip(): - logger.warning(f"No content retrieved from repository: {repo_full_name}") + logger.warning( + f"No content retrieved from repository: {repo_full_name}" + ) return None # Parse the gitingest output @@ -171,11 +185,11 @@ class GitHubConnector: logger.error(f"gitingest timed out for repository: {repo_full_name}") return None except FileNotFoundError: - logger.error( - "gitingest CLI not found. Falling back to Python library." - ) + logger.error("gitingest CLI not found. 
Falling back to Python library.") # Fall back to Python library - return self._ingest_with_python_library(repo_full_name, branch, max_file_size) + return self._ingest_with_python_library( + repo_full_name, branch, max_file_size + ) except Exception as e: logger.error(f"Failed to ingest repository {repo_full_name}: {e}") return None diff --git a/surfsense_backend/app/routes/composio_routes.py b/surfsense_backend/app/routes/composio_routes.py index 25e545dfb..dec9beb02 100644 --- a/surfsense_backend/app/routes/composio_routes.py +++ b/surfsense_backend/app/routes/composio_routes.py @@ -11,7 +11,6 @@ Endpoints: - GET /connectors/{connector_id}/composio-drive/folders - List folders/files for Composio Google Drive """ -import asyncio import logging from uuid import UUID @@ -89,7 +88,9 @@ async def list_composio_toolkits(user: User = Depends(current_active_user)): @router.get("/auth/composio/connector/add") async def initiate_composio_auth( space_id: int, - toolkit_id: str = Query(..., description="Composio toolkit ID (e.g., 'googledrive', 'gmail')"), + toolkit_id: str = Query( + ..., description="Composio toolkit ID (e.g., 'googledrive', 'gmail')" + ), user: User = Depends(current_active_user), ): """ @@ -239,13 +240,15 @@ async def composio_callback( # Initialize Composio service service = ComposioService() entity_id = f"surfsense_{user_id}" - + # Use camelCase param if provided (Composio's format), fallback to snake_case final_connected_account_id = connectedAccountId or connected_account_id - + # DEBUG: Log all query parameters received - logger.info(f"DEBUG: Callback received - connectedAccountId: {connectedAccountId}, connected_account_id: {connected_account_id}, using: {final_connected_account_id}") - + logger.info( + f"DEBUG: Callback received - connectedAccountId: {connectedAccountId}, connected_account_id: {connected_account_id}, using: {final_connected_account_id}" + ) + # If we still don't have a connected_account_id, warn but continue # (the connector will be created but indexing won't work until updated) if not final_connected_account_id: @@ -254,7 +257,9 @@ async def composio_callback( "The connector will be created but indexing may not work." 
) else: - logger.info(f"Successfully got connected_account_id: {final_connected_account_id}") + logger.info( + f"Successfully got connected_account_id: {final_connected_account_id}" + ) # Build connector config connector_config = { @@ -287,10 +292,17 @@ async def composio_callback( if existing_connector: # Delete the old Composio connected account before updating - old_connected_account_id = existing_connector.config.get("composio_connected_account_id") - if old_connected_account_id and old_connected_account_id != final_connected_account_id: + old_connected_account_id = existing_connector.config.get( + "composio_connected_account_id" + ) + if ( + old_connected_account_id + and old_connected_account_id != final_connected_account_id + ): try: - deleted = await service.delete_connected_account(old_connected_account_id) + deleted = await service.delete_connected_account( + old_connected_account_id + ) if deleted: logger.info( f"Deleted old Composio connected account {old_connected_account_id} " @@ -422,7 +434,9 @@ async def list_composio_drive_folders( ) # Get Composio connected account ID from config - composio_connected_account_id = connector.config.get("composio_connected_account_id") + composio_connected_account_id = connector.config.get( + "composio_connected_account_id" + ) if not composio_connected_account_id: raise HTTPException( status_code=400, @@ -451,27 +465,37 @@ async def list_composio_drive_folders( items = [] for file_info in files: file_id = file_info.get("id", "") or file_info.get("fileId", "") - file_name = file_info.get("name", "") or file_info.get("fileName", "") or "Untitled" + file_name = ( + file_info.get("name", "") or file_info.get("fileName", "") or "Untitled" + ) mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "") - + if not file_id: continue is_folder = mime_type == "application/vnd.google-apps.folder" - - items.append({ - "id": file_id, - "name": file_name, - "mimeType": mime_type, - "isFolder": is_folder, - "parents": file_info.get("parents", []), - "size": file_info.get("size"), - "iconLink": file_info.get("iconLink"), - }) + + items.append( + { + "id": file_id, + "name": file_name, + "mimeType": mime_type, + "isFolder": is_folder, + "parents": file_info.get("parents", []), + "size": file_info.get("size"), + "iconLink": file_info.get("iconLink"), + } + ) # Sort: folders first, then files, both alphabetically - folders = sorted([item for item in items if item["isFolder"]], key=lambda x: x["name"].lower()) - files_list = sorted([item for item in items if not item["isFolder"]], key=lambda x: x["name"].lower()) + folders = sorted( + [item for item in items if item["isFolder"]], + key=lambda x: x["name"].lower(), + ) + files_list = sorted( + [item for item in items if not item["isFolder"]], + key=lambda x: x["name"].lower(), + ) items = folders + files_list folder_count = len(folders) diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index ed306c7bc..433acac1c 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -37,7 +37,6 @@ from app.db import ( async_session_maker, get_async_session, ) -from app.services.composio_service import ComposioService from app.schemas import ( GoogleDriveIndexRequest, MCPConnectorCreate, @@ -48,6 +47,7 @@ from app.schemas import ( SearchSourceConnectorRead, SearchSourceConnectorUpdate, ) +from app.services.composio_service 
import ComposioService from app.services.notification_service import NotificationService from app.tasks.connector_indexers import ( index_airtable_records, @@ -537,11 +537,15 @@ async def delete_search_source_connector( SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, ] if db_connector.connector_type in composio_connector_types: - composio_connected_account_id = db_connector.config.get("composio_connected_account_id") + composio_connected_account_id = db_connector.config.get( + "composio_connected_account_id" + ) if composio_connected_account_id and ComposioService.is_enabled(): try: service = ComposioService() - deleted = await service.delete_connected_account(composio_connected_account_id) + deleted = await service.delete_connected_account( + composio_connected_account_id + ) if deleted: logger.info( f"Successfully deleted Composio connected account {composio_connected_account_id} " @@ -897,7 +901,10 @@ async def index_connector_content( ) response_message = "Web page indexing started in the background." - elif connector.connector_type == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: + elif ( + connector.connector_type + == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR + ): from app.tasks.celery_tasks.connector_tasks import ( index_composio_connector_task, ) @@ -907,8 +914,12 @@ async def index_connector_content( if drive_items and drive_items.has_items(): # Update connector config with the selected folders/files config = connector.config or {} - config["selected_folders"] = [{"id": f.id, "name": f.name} for f in drive_items.folders] - config["selected_files"] = [{"id": f.id, "name": f.name} for f in drive_items.files] + config["selected_folders"] = [ + {"id": f.id, "name": f.name} for f in drive_items.folders + ] + config["selected_files"] = [ + {"id": f.id, "name": f.name} for f in drive_items.files + ] if drive_items.indexing_options: config["indexing_options"] = { "max_files_per_folder": drive_items.indexing_options.max_files_per_folder, @@ -917,6 +928,7 @@ async def index_connector_content( } connector.config = config from sqlalchemy.orm.attributes import flag_modified + flag_modified(connector, "config") await session.commit() await session.refresh(connector) @@ -934,7 +946,9 @@ async def index_connector_content( index_composio_connector_task.delay( connector_id, search_space_id, str(user.id), indexing_from, indexing_to ) - response_message = "Composio Google Drive indexing started in the background." + response_message = ( + "Composio Google Drive indexing started in the background." 
+ ) elif connector.connector_type in [ SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR, @@ -995,7 +1009,9 @@ async def _update_connector_timestamp_by_id(session: AsyncSession, connector_id: connector = result.scalars().first() if connector: - connector.last_indexed_at = datetime.now(UTC) # Use UTC for timezone consistency + connector.last_indexed_at = datetime.now( + UTC + ) # Use UTC for timezone consistency await session.commit() logger.info(f"Updated last_indexed_at for connector {connector_id}") except Exception as e: @@ -1150,7 +1166,9 @@ async def _run_indexing_with_notifications( indexed_count=documents_processed, error_message=error_or_warning, # Show errors even if some documents were indexed ) - await session.commit() # Commit to ensure Electric SQL syncs the notification update + await ( + session.commit() + ) # Commit to ensure Electric SQL syncs the notification update elif documents_processed > 0: # Update notification to storing stage if notification: @@ -1174,7 +1192,9 @@ async def _run_indexing_with_notifications( indexed_count=documents_processed, error_message=error_or_warning, # Show errors even if some documents were indexed ) - await session.commit() # Commit to ensure Electric SQL syncs the notification update + await ( + session.commit() + ) # Commit to ensure Electric SQL syncs the notification update else: # No new documents processed - check if this is an error or just no changes if error_or_warning: @@ -1189,7 +1209,9 @@ async def _run_indexing_with_notifications( indexed_count=0, error_message=error_or_warning, ) - await session.commit() # Commit to ensure Electric SQL syncs the notification update + await ( + session.commit() + ) # Commit to ensure Electric SQL syncs the notification update else: # Success - just no new documents to index (all skipped/unchanged) logger.info( @@ -1208,7 +1230,9 @@ async def _run_indexing_with_notifications( indexed_count=0, error_message=None, # No error - sync succeeded ) - await session.commit() # Commit to ensure Electric SQL syncs the notification update + await ( + session.commit() + ) # Commit to ensure Electric SQL syncs the notification update except Exception as e: logger.error(f"Error in indexing task: {e!s}", exc_info=True) diff --git a/surfsense_backend/app/services/composio_service.py b/surfsense_backend/app/services/composio_service.py index 1173cfb6a..0d6189cd9 100644 --- a/surfsense_backend/app/services/composio_service.py +++ b/surfsense_backend/app/services/composio_service.py @@ -111,7 +111,7 @@ class ComposioService: config_toolkit = getattr(auth_config, "toolkit", None) if config_toolkit is None: continue - + # Extract toolkit name/slug from the object toolkit_name = None if isinstance(config_toolkit, str): @@ -122,18 +122,22 @@ class ComposioService: toolkit_name = config_toolkit.name elif hasattr(config_toolkit, "id"): toolkit_name = config_toolkit.id - + # Compare case-insensitively if toolkit_name and toolkit_name.lower() == toolkit_id.lower(): - logger.info(f"Found auth config {auth_config.id} for toolkit {toolkit_id}") + logger.info( + f"Found auth config {auth_config.id} for toolkit {toolkit_id}" + ) return auth_config.id - + # Log available auth configs for debugging - logger.warning(f"No auth config found for toolkit '{toolkit_id}'. Available auth configs:") + logger.warning( + f"No auth config found for toolkit '{toolkit_id}'. 
Available auth configs:" + ) for auth_config in auth_configs.items: config_toolkit = getattr(auth_config, "toolkit", None) logger.warning(f" - {auth_config.id}: toolkit={config_toolkit}") - + return None except Exception as e: logger.error(f"Failed to list auth configs: {e!s}") @@ -162,7 +166,7 @@ class ComposioService: try: # First, get the auth_config_id for this toolkit auth_config_id = self._get_auth_config_for_toolkit(toolkit_id) - + if not auth_config_id: raise ValueError( f"No auth config found for toolkit '{toolkit_id}'. " @@ -214,7 +218,9 @@ class ComposioService: "user_id": getattr(account, "user_id", None), } except Exception as e: - logger.error(f"Failed to get connected account {connected_account_id}: {e!s}") + logger.error( + f"Failed to get connected account {connected_account_id}: {e!s}" + ) return None async def list_all_connections(self) -> list[dict[str, Any]]: @@ -226,15 +232,17 @@ class ComposioService: """ try: accounts_response = self.client.connected_accounts.list() - + if hasattr(accounts_response, "items"): accounts = accounts_response.items elif hasattr(accounts_response, "__iter__"): accounts = accounts_response else: - logger.warning(f"Unexpected accounts response type: {type(accounts_response)}") + logger.warning( + f"Unexpected accounts response type: {type(accounts_response)}" + ) return [] - + result = [] for acc in accounts: toolkit_raw = getattr(acc, "toolkit", None) @@ -248,14 +256,16 @@ class ComposioService: toolkit_info = toolkit_raw.name else: toolkit_info = str(toolkit_raw) - - result.append({ - "id": acc.id, - "status": getattr(acc, "status", None), - "toolkit": toolkit_info, - "user_id": getattr(acc, "user_id", None), - }) - + + result.append( + { + "id": acc.id, + "status": getattr(acc, "status", None), + "toolkit": toolkit_info, + "user_id": getattr(acc, "user_id", None), + } + ) + return result except Exception as e: logger.error(f"Failed to list all connections: {e!s}") @@ -273,16 +283,18 @@ class ComposioService: """ try: accounts_response = self.client.connected_accounts.list(user_id=user_id) - + # Handle paginated response (may have .items attribute) or direct list if hasattr(accounts_response, "items"): accounts = accounts_response.items elif hasattr(accounts_response, "__iter__"): accounts = accounts_response else: - logger.warning(f"Unexpected accounts response type: {type(accounts_response)}") + logger.warning( + f"Unexpected accounts response type: {type(accounts_response)}" + ) return [] - + result = [] for acc in accounts: # Extract toolkit info - might be string or object @@ -297,13 +309,15 @@ class ComposioService: toolkit_info = toolkit_raw.name else: toolkit_info = toolkit_raw - - result.append({ - "id": acc.id, - "status": getattr(acc, "status", None), - "toolkit": toolkit_info, - }) - + + result.append( + { + "id": acc.id, + "status": getattr(acc, "status", None), + "toolkit": toolkit_info, + } + ) + logger.info(f"Found {len(result)} connections for user {user_id}: {result}") return result except Exception as e: @@ -324,10 +338,14 @@ class ComposioService: """ try: self.client.connected_accounts.delete(connected_account_id) - logger.info(f"Successfully deleted Composio connected account: {connected_account_id}") + logger.info( + f"Successfully deleted Composio connected account: {connected_account_id}" + ) return True except Exception as e: - logger.error(f"Failed to delete Composio connected account {connected_account_id}: {e!s}") + logger.error( + f"Failed to delete Composio connected account {connected_account_id}: {e!s}" + 
) return False async def execute_tool( @@ -398,10 +416,14 @@ class ComposioService: } if folder_id: # List contents of a specific folder (exclude shortcuts - we don't have access to them) - params["q"] = f"'{folder_id}' in parents and trashed = false and mimeType != 'application/vnd.google-apps.shortcut'" + params["q"] = ( + f"'{folder_id}' in parents and trashed = false and mimeType != 'application/vnd.google-apps.shortcut'" + ) else: # List root-level items only (My Drive root), exclude shortcuts - params["q"] = "'root' in parents and trashed = false and mimeType != 'application/vnd.google-apps.shortcut'" + params["q"] = ( + "'root' in parents and trashed = false and mimeType != 'application/vnd.google-apps.shortcut'" + ) if page_token: params["page_token"] = page_token @@ -416,17 +438,21 @@ class ComposioService: return [], None, result.get("error", "Unknown error") data = result.get("data", {}) - + # Handle nested response structure from Composio files = [] next_token = None if isinstance(data, dict): # Try direct access first, then nested files = data.get("files", []) or data.get("data", {}).get("files", []) - next_token = data.get("nextPageToken") or data.get("next_page_token") or data.get("data", {}).get("nextPageToken") + next_token = ( + data.get("nextPageToken") + or data.get("next_page_token") + or data.get("data", {}).get("nextPageToken") + ) elif isinstance(data, list): files = data - + return files, next_token, None except Exception as e: @@ -459,13 +485,13 @@ class ComposioService: return None, result.get("error", "Unknown error") data = result.get("data") - + # Composio GOOGLEDRIVE_DOWNLOAD_FILE returns a dict with file info # The actual content is in "downloaded_file_content" field if isinstance(data, dict): # Try known Composio response fields in order of preference content = None - + # Primary field from GOOGLEDRIVE_DOWNLOAD_FILE if "downloaded_file_content" in data: content = data["downloaded_file_content"] @@ -474,19 +500,24 @@ class ComposioService: # Try to extract actual content from nested dict # Note: Composio nests downloaded_file_content inside another downloaded_file_content actual_content = ( - content.get("downloaded_file_content") or - content.get("content") or - content.get("data") or - content.get("file_content") or - content.get("body") or - content.get("text") + content.get("downloaded_file_content") + or content.get("content") + or content.get("data") + or content.get("file_content") + or content.get("body") + or content.get("text") ) if actual_content is not None: content = actual_content else: # Log structure for debugging - logger.warning(f"downloaded_file_content is dict with keys: {list(content.keys())}") - return None, f"Cannot extract content from downloaded_file_content. Keys: {list(content.keys())}" + logger.warning( + f"downloaded_file_content is dict with keys: {list(content.keys())}" + ) + return ( + None, + f"Cannot extract content from downloaded_file_content. Keys: {list(content.keys())}", + ) # Fallback fields for compatibility elif "content" in data: content = data["content"] @@ -494,16 +525,20 @@ class ComposioService: content = data["file_content"] elif "data" in data: content = data["data"] - + if content is None: # Log available keys for debugging logger.warning(f"Composio response dict keys: {list(data.keys())}") - return None, f"No file content found in Composio response. Available keys: {list(data.keys())}" - + return ( + None, + f"No file content found in Composio response. 
Available keys: {list(data.keys())}", + ) + # Convert content to bytes if isinstance(content, str): # Check if it's base64 encoded import base64 + try: # Try to decode as base64 first content = base64.b64decode(content) @@ -514,11 +549,19 @@ class ComposioService: pass # Already bytes elif isinstance(content, dict): # Still a dict after all extraction attempts - log structure - logger.warning(f"Content still dict after extraction: {list(content.keys())}") - return None, f"Unexpected nested content structure: {list(content.keys())}" + logger.warning( + f"Content still dict after extraction: {list(content.keys())}" + ) + return ( + None, + f"Unexpected nested content structure: {list(content.keys())}", + ) else: - return None, f"Unexpected content type in Composio response: {type(content).__name__}" - + return ( + None, + f"Unexpected content type in Composio response: {type(content).__name__}", + ) + return content, None elif isinstance(data, str): return data.encode("utf-8"), None @@ -527,7 +570,10 @@ class ComposioService: elif data is None: return None, "No data returned from Composio" else: - return None, f"Unexpected data type from Composio: {type(data).__name__}" + return ( + None, + f"Unexpected data type from Composio: {type(data).__name__}", + ) except Exception as e: logger.error(f"Failed to get Drive file content: {e!s}") @@ -576,17 +622,21 @@ class ComposioService: return [], None, result.get("error", "Unknown error") data = result.get("data", {}) - + # Try different possible response structures messages = [] next_token = None result_size_estimate = None if isinstance(data, dict): - messages = data.get("messages", []) or data.get("data", {}).get("messages", []) or data.get("emails", []) + messages = ( + data.get("messages", []) + or data.get("data", {}).get("messages", []) + or data.get("emails", []) + ) # Check for pagination token in various possible locations next_token = ( - data.get("nextPageToken") - or data.get("next_page_token") + data.get("nextPageToken") + or data.get("next_page_token") or data.get("data", {}).get("nextPageToken") or data.get("data", {}).get("next_page_token") ) @@ -599,7 +649,7 @@ class ComposioService: ) elif isinstance(data, list): messages = data - + return messages, next_token, result_size_estimate, None except Exception as e: @@ -683,14 +733,18 @@ class ComposioService: return [], result.get("error", "Unknown error") data = result.get("data", {}) - + # Try different possible response structures events = [] if isinstance(data, dict): - events = data.get("items", []) or data.get("data", {}).get("items", []) or data.get("events", []) + events = ( + data.get("items", []) + or data.get("data", {}).get("items", []) + or data.get("events", []) + ) elif isinstance(data, list): events = data - + return events, None except Exception as e: diff --git a/surfsense_backend/app/tasks/composio_indexer.py b/surfsense_backend/app/tasks/composio_indexer.py index 6f40e6d66..e5c8b701e 100644 --- a/surfsense_backend/app/tasks/composio_indexer.py +++ b/surfsense_backend/app/tasks/composio_indexer.py @@ -64,10 +64,14 @@ async def check_document_by_unique_identifier( async def get_connector_by_id( - session: AsyncSession, connector_id: int, connector_type: SearchSourceConnectorType | None + session: AsyncSession, + connector_id: int, + connector_type: SearchSourceConnectorType | None, ) -> SearchSourceConnector | None: """Get a connector by ID and optionally by type from the database.""" - query = select(SearchSourceConnector).filter(SearchSourceConnector.id == 
connector_id) + query = select(SearchSourceConnector).filter( + SearchSourceConnector.id == connector_id + ) if connector_type is not None: query = query.filter(SearchSourceConnector.connector_type == connector_type) result = await session.execute(query) @@ -81,40 +85,90 @@ async def update_connector_last_indexed( ) -> None: """Update the last_indexed_at timestamp for a connector.""" if update_last_indexed: - connector.last_indexed_at = datetime.now(UTC) # Use UTC for timezone consistency + connector.last_indexed_at = datetime.now( + UTC + ) # Use UTC for timezone consistency logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") # Binary file extensions that need file processor BINARY_FILE_EXTENSIONS = { - ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", - ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp", - ".zip", ".tar", ".gz", ".rar", ".7z", - ".mp3", ".mp4", ".wav", ".avi", ".mov", - ".exe", ".dll", ".so", ".bin", + ".pdf", + ".doc", + ".docx", + ".xls", + ".xlsx", + ".ppt", + ".pptx", + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".tiff", + ".webp", + ".zip", + ".tar", + ".gz", + ".rar", + ".7z", + ".mp3", + ".mp4", + ".wav", + ".avi", + ".mov", + ".exe", + ".dll", + ".so", + ".bin", } # Text file extensions that can be decoded as UTF-8 TEXT_FILE_EXTENSIONS = { - ".txt", ".md", ".markdown", ".json", ".xml", ".html", ".htm", - ".css", ".js", ".ts", ".py", ".java", ".c", ".cpp", ".h", - ".yaml", ".yml", ".toml", ".ini", ".cfg", ".conf", - ".sh", ".bash", ".zsh", ".fish", - ".sql", ".csv", ".tsv", - ".rst", ".tex", ".log", + ".txt", + ".md", + ".markdown", + ".json", + ".xml", + ".html", + ".htm", + ".css", + ".js", + ".ts", + ".py", + ".java", + ".c", + ".cpp", + ".h", + ".yaml", + ".yml", + ".toml", + ".ini", + ".cfg", + ".conf", + ".sh", + ".bash", + ".zsh", + ".fish", + ".sql", + ".csv", + ".tsv", + ".rst", + ".tex", + ".log", } def _is_binary_file(file_name: str, mime_type: str) -> bool: """Check if a file is binary based on extension or mime type.""" extension = Path(file_name).suffix.lower() - + # Check extension first if extension in BINARY_FILE_EXTENSIONS: return True if extension in TEXT_FILE_EXTENSIONS: return False - + # Check mime type if mime_type: if mime_type.startswith(("image/", "audio/", "video/", "application/pdf")): @@ -122,9 +176,13 @@ def _is_binary_file(file_name: str, mime_type: str) -> bool: if mime_type.startswith(("text/", "application/json", "application/xml")): return False # Office documents - if "spreadsheet" in mime_type or "document" in mime_type or "presentation" in mime_type: + if ( + "spreadsheet" in mime_type + or "document" in mime_type + or "presentation" in mime_type + ): return True - + # Default to text for unknown types return False @@ -143,10 +201,10 @@ async def _process_file_content( ) -> str: """ Process file content and return markdown text. - + For binary files (PDFs, images, etc.), uses Surfsense's ETL service. For text files, decodes as UTF-8. - + Args: content: File content as bytes or string file_name: Name of the file @@ -158,14 +216,14 @@ async def _process_file_content( task_logger: Task logging service log_entry: Log entry for tracking processing_errors: List to append errors to - + Returns: Markdown content string """ # Ensure content is bytes if isinstance(content, str): content = content.encode("utf-8") - + # Check if this is a binary file if _is_binary_file(file_name, mime_type): # Use ETL service for binary files (PDF, Office docs, etc.) 
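A trimmed, standalone sketch of the dispatch these hunks implement (the extension sets are abbreviated here, and `is_binary_file` is an illustrative re-derivation of the patch's `_is_binary_file`, not its exact code):

```python
from pathlib import Path

# Abbreviated stand-ins for the full extension sets defined in the patch.
BINARY_FILE_EXTENSIONS = {".pdf", ".docx", ".xlsx", ".png", ".zip"}
TEXT_FILE_EXTENSIONS = {".txt", ".md", ".json", ".csv"}


def is_binary_file(file_name: str, mime_type: str) -> bool:
    extension = Path(file_name).suffix.lower()
    if extension in BINARY_FILE_EXTENSIONS:
        return True
    if extension in TEXT_FILE_EXTENSIONS:
        return False
    # Fall back to the MIME type when the extension is unknown.
    if mime_type.startswith(("image/", "audio/", "video/", "application/pdf")):
        return True
    return False  # unknown types default to text, matching the hunk above


assert is_binary_file("report.pdf", "application/pdf")
assert not is_binary_file("notes.md", "text/markdown")
assert is_binary_file("photo", "image/jpeg")
```

The same extension-first, MIME-fallback ordering is what lets `_process_file_content` route Office documents to the ETL service while plain text is decoded directly.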
@@ -173,24 +231,26 @@ async def _process_file_content( try: # Get file extension extension = Path(file_name).suffix or ".bin" - + # Write to temp file - with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file: + with tempfile.NamedTemporaryFile( + delete=False, suffix=extension + ) as tmp_file: tmp_file.write(content) temp_file_path = tmp_file.name - + # Use the configured ETL service to extract text extracted_text = await _extract_text_with_etl( temp_file_path, file_name, task_logger, log_entry ) - + if extracted_text: return extracted_text else: # Fallback if extraction fails logger.warning(f"Could not extract text from binary file {file_name}") return f"# {file_name}\n\n[Binary file - text extraction failed]\n\n**File ID:** {file_id}\n**Type:** {mime_type}\n" - + except Exception as e: error_msg = f"Error processing binary file {file_name}: {e!s}" logger.error(error_msg) @@ -214,7 +274,7 @@ async def _process_file_content( return content.decode(encoding) except UnicodeDecodeError: continue - + # If all encodings fail, treat as binary error_msg = f"Could not decode text file {file_name} with any encoding" logger.warning(error_msg) @@ -230,27 +290,27 @@ async def _extract_text_with_etl( ) -> str | None: """ Extract text from a file using the configured ETL service. - + Args: file_path: Path to the file file_name: Name of the file task_logger: Task logging service log_entry: Log entry for tracking - + Returns: Extracted text as markdown, or None if extraction fails """ import warnings from logging import ERROR, getLogger - + etl_service = config.ETL_SERVICE - + try: if etl_service == "UNSTRUCTURED": from langchain_unstructured import UnstructuredLoader from app.utils.document_converters import convert_document_to_markdown - + loader = UnstructuredLoader( file_path, mode="elements", @@ -260,57 +320,67 @@ async def _extract_text_with_etl( include_metadata=False, strategy="auto", ) - + docs = await loader.aload() if docs: return await convert_document_to_markdown(docs) return None - + elif etl_service == "LLAMACLOUD": - from app.tasks.document_processors.file_processors import parse_with_llamacloud_retry - + from app.tasks.document_processors.file_processors import ( + parse_with_llamacloud_retry, + ) + # Estimate pages (rough estimate based on file size) file_size = os.path.getsize(file_path) estimated_pages = max(1, file_size // (80 * 1024)) - + result = await parse_with_llamacloud_retry( file_path=file_path, estimated_pages=estimated_pages, task_logger=task_logger, log_entry=log_entry, ) - - markdown_documents = await result.aget_markdown_documents(split_by_page=False) + + markdown_documents = await result.aget_markdown_documents( + split_by_page=False + ) if markdown_documents: return markdown_documents[0].text return None - + elif etl_service == "DOCLING": from app.services.docling_service import create_docling_service - + docling_service = create_docling_service() - + # Suppress pdfminer warnings pdfminer_logger = getLogger("pdfminer") original_level = pdfminer_logger.level - + with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer") - warnings.filterwarnings("ignore", message=".*Cannot set gray non-stroke color.*") + warnings.filterwarnings( + "ignore", category=UserWarning, module="pdfminer" + ) + warnings.filterwarnings( + "ignore", message=".*Cannot set gray non-stroke color.*" + ) warnings.filterwarnings("ignore", message=".*invalid float value.*") - + pdfminer_logger.setLevel(ERROR) - + try: - result = await 
docling_service.process_document(file_path, file_name) + result = await docling_service.process_document( + file_path, file_name + ) finally: pdfminer_logger.setLevel(original_level) - + return result.get("content") else: logger.warning(f"Unknown ETL service: {etl_service}") return None - + except Exception as e: logger.error(f"ETL extraction failed for {file_name}: {e!s}") return None @@ -367,9 +437,11 @@ async def index_composio_connector( # Get connector by id - accept any Composio connector type # We'll check the actual type after loading connector = await get_connector_by_id( - session, connector_id, None # Don't filter by type, we'll validate after + session, + connector_id, + None, # Don't filter by type, we'll validate after ) - + # Validate it's a Composio connector if connector and connector.connector_type not in [ SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, @@ -392,7 +464,9 @@ async def index_composio_connector( # Get toolkit ID from config toolkit_id = connector.config.get("toolkit_id") if not toolkit_id: - error_msg = f"Composio connector {connector_id} has no toolkit_id configured" + error_msg = ( + f"Composio connector {connector_id} has no toolkit_id configured" + ) await task_logger.log_task_failure( log_entry, error_msg, {"error_type": "MissingToolkitId"} ) @@ -488,7 +562,7 @@ async def _index_composio_google_drive( max_items: int = 1000, ) -> tuple[int, str]: """Index Google Drive files via Composio. - + Supports folder/file selection via connector config: - selected_folders: List of {id, name} for folders to index - selected_files: List of {id, name} for individual files to index @@ -502,14 +576,18 @@ async def _index_composio_google_drive( selected_folders = connector_config.get("selected_folders", []) selected_files = connector_config.get("selected_files", []) indexing_options = connector_config.get("indexing_options", {}) - + max_files_per_folder = indexing_options.get("max_files_per_folder", 100) include_subfolders = indexing_options.get("include_subfolders", True) await task_logger.log_task_progress( log_entry, f"Fetching Google Drive files via Composio for connector {connector_id}", - {"stage": "fetching_files", "selected_folders": len(selected_folders), "selected_files": len(selected_files)}, + { + "stage": "fetching_files", + "selected_folders": len(selected_folders), + "selected_files": len(selected_files), + }, ) all_files = [] @@ -520,34 +598,42 @@ async def _index_composio_google_drive( for folder in selected_folders: folder_id = folder.get("id") folder_name = folder.get("name", "Unknown") - + if not folder_id: continue - + # Handle special case for "root" folder actual_folder_id = None if folder_id == "root" else folder_id - + logger.info(f"Fetching files from folder: {folder_name} ({folder_id})") - + # Fetch files from this folder folder_files = [] page_token = None - + while len(folder_files) < max_files_per_folder: - files, next_token, error = await composio_connector.list_drive_files( + ( + files, + next_token, + error, + ) = await composio_connector.list_drive_files( folder_id=actual_folder_id, page_token=page_token, page_size=min(100, max_files_per_folder - len(folder_files)), ) if error: - logger.warning(f"Failed to fetch files from folder {folder_name}: {error}") + logger.warning( + f"Failed to fetch files from folder {folder_name}: {error}" + ) break # Process files for file_info in files: - mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "") - + mime_type = file_info.get("mimeType", "") or file_info.get( + 
"mime_type", "" + ) + # If it's a folder and include_subfolders is enabled, recursively fetch if mime_type == "application/vnd.google-apps.folder": if include_subfolders: @@ -565,7 +651,7 @@ async def _index_composio_google_drive( if not next_token: break page_token = next_token - + all_files.extend(folder_files[:max_files_per_folder]) logger.info(f"Found {len(folder_files)} files in folder {folder_name}") @@ -573,16 +659,18 @@ async def _index_composio_google_drive( for selected_file in selected_files: file_id = selected_file.get("id") file_name = selected_file.get("name", "Unknown") - + if not file_id: continue - + # Add file info (we'll fetch content later during indexing) - all_files.append({ - "id": file_id, - "name": file_name, - "mimeType": "", # Will be determined later - }) + all_files.append( + { + "id": file_id, + "name": file_name, + "mimeType": "", # Will be determined later + } + ) else: # No selection specified - fetch all files (original behavior) page_token = None @@ -613,7 +701,10 @@ async def _index_composio_google_drive( # CRITICAL: Update timestamp even when no files found so Electric SQL syncs and UI shows indexed status await update_connector_last_indexed(session, connector, update_last_indexed) await session.commit() - return 0, None # Return None (not error) when no items found - this is success with 0 items + return ( + 0, + None, + ) # Return None (not error) when no items found - this is success with 0 items logger.info(f"Found {len(all_files)} Google Drive files to index via Composio") @@ -625,8 +716,14 @@ async def _index_composio_google_drive( try: # Handle both standard Google API and potential Composio variations file_id = file_info.get("id", "") or file_info.get("fileId", "") - file_name = file_info.get("name", "") or file_info.get("fileName", "") or "Untitled" - mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "") + file_name = ( + file_info.get("name", "") + or file_info.get("fileName", "") + or "Untitled" + ) + mime_type = file_info.get("mimeType", "") or file_info.get( + "mime_type", "" + ) if not file_id: documents_skipped += 1 @@ -648,12 +745,15 @@ async def _index_composio_google_drive( ) # Get file content - content, content_error = await composio_connector.get_drive_file_content( - file_id - ) + ( + content, + content_error, + ) = await composio_connector.get_drive_file_content(file_id) if content_error or not content: - logger.warning(f"Could not get content for file {file_name}: {content_error}") + logger.warning( + f"Could not get content for file {file_name}: {content_error}" + ) # Use metadata as content fallback markdown_content = f"# {file_name}\n\n" markdown_content += f"**File ID:** {file_id}\n" @@ -700,12 +800,19 @@ async def _index_composio_google_drive( "mime_type": mime_type, "document_type": "Google Drive File (Composio)", } - summary_content, summary_embedding = await generate_document_summary( + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( markdown_content, user_llm, document_metadata ) else: - summary_content = f"Google Drive File: {file_name}\n\nType: {mime_type}" - summary_embedding = config.embedding_model_instance.embed(summary_content) + summary_content = ( + f"Google Drive File: {file_name}\n\nType: {mime_type}" + ) + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) chunks = await create_document_chunks(markdown_content) @@ -724,8 +831,8 @@ async def _index_composio_google_drive( existing_document.updated_at = get_current_timestamp() 
documents_indexed += 1 - - # Batch commit every 10 documents + + # Batch commit every 10 documents if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} Google Drive files processed so far" @@ -745,12 +852,19 @@ async def _index_composio_google_drive( "mime_type": mime_type, "document_type": "Google Drive File (Composio)", } - summary_content, summary_embedding = await generate_document_summary( + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( markdown_content, user_llm, document_metadata ) else: - summary_content = f"Google Drive File: {file_name}\n\nType: {mime_type}" - summary_embedding = config.embedding_model_instance.embed(summary_content) + summary_content = ( + f"Google Drive File: {file_name}\n\nType: {mime_type}" + ) + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) chunks = await create_document_chunks(markdown_content) @@ -776,7 +890,7 @@ async def _index_composio_google_drive( session.add(document) documents_indexed += 1 - # Batch commit every 10 documents + # Batch commit every 10 documents if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} Google Drive files processed so far" @@ -784,7 +898,9 @@ async def _index_composio_google_drive( await session.commit() except Exception as e: - error_msg = f"Error processing Drive file {file_name or 'unknown'}: {e!s}" + error_msg = ( + f"Error processing Drive file {file_name or 'unknown'}: {e!s}" + ) logger.error(error_msg, exc_info=True) processing_errors.append(error_msg) documents_skipped += 1 @@ -848,7 +964,7 @@ async def _fetch_folder_files_recursively( ) -> list[dict[str, Any]]: """ Recursively fetch files from a Google Drive folder via Composio. - + Args: composio_connector: The Composio connector instance folder_id: Google Drive folder ID @@ -856,20 +972,20 @@ async def _fetch_folder_files_recursively( current_count: Current number of files already fetched depth: Current recursion depth max_depth: Maximum recursion depth to prevent infinite loops - + Returns: List of file info dictionaries """ if depth >= max_depth: logger.warning(f"Max recursion depth reached for folder {folder_id}") return [] - + if current_count >= max_files: return [] - + all_files = [] page_token = None - + try: while len(all_files) + current_count < max_files: files, next_token, error = await composio_connector.list_drive_files( @@ -877,14 +993,18 @@ async def _fetch_folder_files_recursively( page_token=page_token, page_size=min(100, max_files - len(all_files) - current_count), ) - + if error: - logger.warning(f"Error fetching files from subfolder {folder_id}: {error}") + logger.warning( + f"Error fetching files from subfolder {folder_id}: {error}" + ) break - + for file_info in files: - mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "") - + mime_type = file_info.get("mimeType", "") or file_info.get( + "mime_type", "" + ) + if mime_type == "application/vnd.google-apps.folder": # Recursively fetch from subfolders subfolder_files = await _fetch_folder_files_recursively( @@ -898,16 +1018,16 @@ async def _fetch_folder_files_recursively( all_files.extend(subfolder_files) else: all_files.append(file_info) - + if len(all_files) + current_count >= max_files: break - + if not next_token: break page_token = next_token - - return all_files[:max_files - current_count] - + + return all_files[: max_files - current_count] + except Exception as e: logger.error(f"Error in recursive folder fetch: {e!s}") return 
all_files @@ -924,10 +1044,10 @@ async def _process_gmail_message_batch( ) -> tuple[int, int]: """ Process a batch of Gmail messages and index them. - + Args: total_documents_indexed: Running total of documents indexed so far (for batch commits). - + Returns: Tuple of (documents_indexed, documents_skipped) """ @@ -965,7 +1085,9 @@ async def _process_gmail_message_batch( date_str = value # Format to markdown using the full message data - markdown_content = composio_connector.format_gmail_message_to_markdown(message) + markdown_content = composio_connector.format_gmail_message_to_markdown( + message + ) # Check for empty content (defensive parsing per Composio best practices) if not markdown_content.strip(): @@ -1008,12 +1130,19 @@ async def _process_gmail_message_batch( "sender": sender, "document_type": "Gmail Message (Composio)", } - summary_content, summary_embedding = await generate_document_summary( + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( markdown_content, user_llm, document_metadata ) else: - summary_content = f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" - summary_embedding = config.embedding_model_instance.embed(summary_content) + summary_content = ( + f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" + ) + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) chunks = await create_document_chunks(markdown_content) @@ -1035,8 +1164,8 @@ async def _process_gmail_message_batch( existing_document.updated_at = get_current_timestamp() documents_indexed += 1 - - # Batch commit every 10 documents + + # Batch commit every 10 documents current_total = total_documents_indexed + documents_indexed if current_total % 10 == 0: logger.info( @@ -1062,8 +1191,12 @@ async def _process_gmail_message_batch( markdown_content, user_llm, document_metadata ) else: - summary_content = f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" - summary_embedding = config.embedding_model_instance.embed(summary_content) + summary_content = ( + f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" + ) + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) chunks = await create_document_chunks(markdown_content) @@ -1092,7 +1225,7 @@ async def _process_gmail_message_batch( session.add(document) documents_indexed += 1 - # Batch commit every 10 documents + # Batch commit every 10 documents current_total = total_documents_indexed + documents_indexed if current_total % 10 == 0: logger.info( @@ -1107,7 +1240,9 @@ async def _process_gmail_message_batch( try: await session.rollback() except Exception as rollback_error: - logger.error(f"Error during rollback: {rollback_error!s}", exc_info=True) + logger.error( + f"Error during rollback: {rollback_error!s}", exc_info=True + ) continue return documents_indexed, documents_skipped @@ -1169,7 +1304,9 @@ async def _index_composio_gmail( current_batch_size = min(batch_size, remaining) # Use result_size_estimate if available, otherwise fall back to max_items - estimated_total = result_size_estimate if result_size_estimate is not None else max_items + estimated_total = ( + result_size_estimate if result_size_estimate is not None else max_items + ) # Cap estimated_total at max_items to avoid showing misleading progress estimated_total = min(estimated_total, max_items) @@ -1187,7 +1324,12 @@ async def _index_composio_gmail( ) # Fetch batch of messages - messages, next_token, result_size_estimate_batch, error = await composio_connector.list_gmail_messages( + ( + 
messages, + next_token, + result_size_estimate_batch, + error, + ) = await composio_connector.list_gmail_messages( query=query, max_results=current_batch_size, page_token=page_token, @@ -1206,13 +1348,17 @@ async def _index_composio_gmail( # Update result_size_estimate from first response (Gmail provides this estimate) if result_size_estimate is None and result_size_estimate_batch is not None: result_size_estimate = result_size_estimate_batch - logger.info(f"Gmail API estimated {result_size_estimate} total messages for query: '{query}'") + logger.info( + f"Gmail API estimated {result_size_estimate} total messages for query: '{query}'" + ) total_messages_fetched += len(messages) # Recalculate estimated_total after potentially updating result_size_estimate - estimated_total = result_size_estimate if result_size_estimate is not None else max_items + estimated_total = ( + result_size_estimate if result_size_estimate is not None else max_items + ) estimated_total = min(estimated_total, max_items) - + logger.info( f"Fetched batch of {len(messages)} Gmail messages " f"(total: {total_messages_fetched}/{estimated_total})" @@ -1357,7 +1503,10 @@ async def _index_composio_google_calendar( # CRITICAL: Update timestamp even when no events found so Electric SQL syncs and UI shows indexed status await update_connector_last_indexed(session, connector, update_last_indexed) await session.commit() - return 0, None # Return None (not error) when no items found - this is success with 0 items + return ( + 0, + None, + ) # Return None (not error) when no items found - this is success with 0 items logger.info(f"Found {len(events)} Google Calendar events to index via Composio") @@ -1368,14 +1517,18 @@ async def _index_composio_google_calendar( try: # Handle both standard Google API and potential Composio variations event_id = event.get("id", "") or event.get("eventId", "") - summary = event.get("summary", "") or event.get("title", "") or "No Title" + summary = ( + event.get("summary", "") or event.get("title", "") or "No Title" + ) if not event_id: documents_skipped += 1 continue # Format to markdown - markdown_content = composio_connector.format_calendar_event_to_markdown(event) + markdown_content = composio_connector.format_calendar_event_to_markdown( + event + ) # Generate unique identifier document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"]) @@ -1413,14 +1566,19 @@ async def _index_composio_google_calendar( "start_time": start_time, "document_type": "Google Calendar Event (Composio)", } - summary_content, summary_embedding = await generate_document_summary( + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( markdown_content, user_llm, document_metadata ) else: summary_content = f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}" if location: summary_content += f"\nLocation: {location}" - summary_embedding = config.embedding_model_instance.embed(summary_content) + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) chunks = await create_document_chunks(markdown_content) @@ -1441,8 +1599,8 @@ async def _index_composio_google_calendar( existing_document.updated_at = get_current_timestamp() documents_indexed += 1 - - # Batch commit every 10 documents + + # Batch commit every 10 documents if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} Google Calendar events processed so far" @@ -1462,21 +1620,30 @@ async def _index_composio_google_calendar( "start_time": start_time, "document_type": 
"Google Calendar Event (Composio)", } - summary_content, summary_embedding = await generate_document_summary( + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( markdown_content, user_llm, document_metadata ) else: - summary_content = f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}" + summary_content = ( + f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}" + ) if location: summary_content += f"\nLocation: {location}" - summary_embedding = config.embedding_model_instance.embed(summary_content) + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) chunks = await create_document_chunks(markdown_content) document = Document( search_space_id=search_space_id, title=f"Calendar: {summary}", - document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"]), + document_type=DocumentType( + TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"] + ), document_metadata={ "event_id": event_id, "summary": summary, @@ -1497,7 +1664,7 @@ async def _index_composio_google_calendar( session.add(document) documents_indexed += 1 - # Batch commit every 10 documents + # Batch commit every 10 documents if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} Google Calendar events processed so far" @@ -1535,5 +1702,7 @@ async def _index_composio_google_calendar( return documents_indexed, None except Exception as e: - logger.error(f"Failed to index Google Calendar via Composio: {e!s}", exc_info=True) + logger.error( + f"Failed to index Google Calendar via Composio: {e!s}", exc_info=True + ) return 0, f"Failed to index Google Calendar via Composio: {e!s}" diff --git a/surfsense_backend/app/tasks/connector_indexers/__init__.py b/surfsense_backend/app/tasks/connector_indexers/__init__.py index 35b5fde4c..8f25e6fdd 100644 --- a/surfsense_backend/app/tasks/connector_indexers/__init__.py +++ b/surfsense_backend/app/tasks/connector_indexers/__init__.py @@ -26,6 +26,7 @@ Available indexers: # Calendar and scheduling from .airtable_indexer import index_airtable_records from .bookstack_indexer import index_bookstack_pages + # Note: composio_indexer is imported directly in connector_tasks.py to avoid circular imports from .clickup_indexer import index_clickup_tasks from .confluence_indexer import index_confluence_pages diff --git a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py index f16ee0156..4a8df4918 100644 --- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py @@ -128,7 +128,9 @@ async def index_github_repos( if github_pat: logger.info("Using GitHub PAT for authentication (private repos supported)") else: - logger.info("No GitHub PAT provided - only public repositories can be indexed") + logger.info( + "No GitHub PAT provided - only public repositories can be indexed" + ) # 3. Initialize GitHub connector with gitingest backend await task_logger.log_task_progress( @@ -308,9 +310,7 @@ async def _process_repository_digest( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: - logger.info( - f"Repository {repo_full_name} unchanged. Skipping." - ) + logger.info(f"Repository {repo_full_name} unchanged. 
Skipping.") return 0 else: logger.info( @@ -341,7 +341,7 @@ async def _process_repository_digest( summary_content = ( f"# Repository: {repo_full_name}\n\n" f"## File Structure\n\n{digest.tree}\n\n" - f"## File Contents (truncated)\n\n{digest.content[:MAX_DIGEST_CHARS - len(digest.tree) - 200]}..." + f"## File Contents (truncated)\n\n{digest.content[: MAX_DIGEST_CHARS - len(digest.tree) - 200]}..." ) summary_text, summary_embedding = await generate_document_summary( @@ -362,9 +362,7 @@ async def _process_repository_digest( # This preserves file-level granularity in search chunks_data = await create_document_chunks(digest.content) except Exception as chunk_err: - logger.error( - f"Failed to chunk repository {repo_full_name}: {chunk_err}" - ) + logger.error(f"Failed to chunk repository {repo_full_name}: {chunk_err}") # Fall back to a simpler chunking approach chunks_data = await _simple_chunk_content(digest.content) diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx index 255d0cef4..fdff956e5 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx @@ -211,7 +211,9 @@ export const ComposioConfig: FC = ({ connector, onConfigCha ); } if (selectedFiles.length > 0) { - parts.push(`${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}`); + parts.push( + `${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}` + ); } return parts.length > 0 ? `(${parts.join(" ")})` : ""; })()} @@ -338,7 +340,9 @@ export const ComposioConfig: FC = ({ connector, onConfigCha handleIndexingOptionChange("include_subfolders", checked)} + onCheckedChange={(checked) => + handleIndexingOptionChange("include_subfolders", checked) + } /> diff --git a/surfsense_web/components/assistant-ui/connector-popup/constants/connector-popup.schemas.ts b/surfsense_web/components/assistant-ui/connector-popup/constants/connector-popup.schemas.ts index c7e77f666..5a0a8e8c8 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/constants/connector-popup.schemas.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/constants/connector-popup.schemas.ts @@ -7,7 +7,9 @@ import { searchSourceConnectorTypeEnum } from "@/contracts/types/connector.types export const connectorPopupQueryParamsSchema = z.object({ modal: z.enum(["connectors"]).optional(), tab: z.enum(["all", "active"]).optional(), - view: z.enum(["configure", "edit", "connect", "youtube", "accounts", "mcp-list", "composio"]).optional(), + view: z + .enum(["configure", "edit", "connect", "youtube", "accounts", "mcp-list", "composio"]) + .optional(), connector: z.string().optional(), connectorId: z.string().optional(), connectorType: z.string().optional(), diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index 1be8a7983..b30337de3 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -26,7 +26,11 @@ import { import { cacheKeys } from "@/lib/query-client/cache-keys"; import { queryClient } from "@/lib/query-client/client"; import type { 
IndexingConfigState } from "../constants/connector-constants"; -import { COMPOSIO_CONNECTORS, OAUTH_CONNECTORS, OTHER_CONNECTORS } from "../constants/connector-constants"; +import { + COMPOSIO_CONNECTORS, + OAUTH_CONNECTORS, + OTHER_CONNECTORS, +} from "../constants/connector-constants"; import { dateRangeSchema, frequencyMinutesSchema, @@ -83,7 +87,6 @@ export const useConnectorDialog = () => { // MCP list view state (for managing multiple MCP connectors) const [viewingMCPList, setViewingMCPList] = useState(false); - // Track if we came from accounts list when entering edit mode const [cameFromAccountsList, setCameFromAccountsList] = useState<{ connectorType: string; @@ -164,16 +167,14 @@ export const useConnectorDialog = () => { // Handle accounts view if (params.view === "accounts" && params.connectorType) { // Update state if not set, or if connectorType has changed - const needsUpdate = !viewingAccountsType || - viewingAccountsType.connectorType !== params.connectorType; - + const needsUpdate = + !viewingAccountsType || viewingAccountsType.connectorType !== params.connectorType; + if (needsUpdate) { // Check both OAUTH_CONNECTORS and COMPOSIO_CONNECTORS - const oauthConnector = OAUTH_CONNECTORS.find( - (c) => c.connectorType === params.connectorType - ) || COMPOSIO_CONNECTORS.find( - (c) => c.connectorType === params.connectorType - ); + const oauthConnector = + OAUTH_CONNECTORS.find((c) => c.connectorType === params.connectorType) || + COMPOSIO_CONNECTORS.find((c) => c.connectorType === params.connectorType); if (oauthConnector) { setViewingAccountsType({ connectorType: oauthConnector.connectorType, @@ -395,11 +396,8 @@ export const useConnectorDialog = () => { // Check if authEndpoint already has query parameters const separator = connector.authEndpoint.includes("?") ? 
"&" : "?"; const url = `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}${connector.authEndpoint}${separator}space_id=${searchSpaceId}`; - - const response = await authenticatedFetch( - url, - { method: "GET" } - ); + + const response = await authenticatedFetch(url, { method: "GET" }); if (!response.ok) { throw new Error(`Failed to initiate ${connector.title} OAuth`); diff --git a/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx b/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx index ffe879d5d..6b38a37d2 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx @@ -4,7 +4,12 @@ import type { FC } from "react"; import { EnumConnectorName } from "@/contracts/enums/connector"; import type { SearchSourceConnector } from "@/contracts/types/connector.types"; import { ConnectorCard } from "../components/connector-card"; -import { CRAWLERS, OAUTH_CONNECTORS, OTHER_CONNECTORS, COMPOSIO_CONNECTORS } from "../constants/connector-constants"; +import { + CRAWLERS, + OAUTH_CONNECTORS, + OTHER_CONNECTORS, + COMPOSIO_CONNECTORS, +} from "../constants/connector-constants"; import { getDocumentCountForConnector } from "../utils/connector-document-mapping"; /** @@ -28,7 +33,9 @@ interface AllConnectorsTabProps { allConnectors: SearchSourceConnector[] | undefined; documentTypeCounts?: Record; indexingConnectorIds?: Set; - onConnectOAuth: (connector: (typeof OAUTH_CONNECTORS)[number] | (typeof COMPOSIO_CONNECTORS)[number]) => void; + onConnectOAuth: ( + connector: (typeof OAUTH_CONNECTORS)[number] | (typeof COMPOSIO_CONNECTORS)[number] + ) => void; onConnectNonOAuth?: (connectorType: string) => void; onCreateWebcrawler?: () => void; onCreateYouTubeCrawler?: () => void; @@ -82,7 +89,9 @@ export const AllConnectorsTab: FC = ({ {filteredComposio.length > 0 && (
-							[heading markup elided] Managed OAuth (Composio)
+							[heading markup elided]
+								Managed OAuth (Composio)
+							[/heading markup elided]
{filteredComposio.map((connector) => { @@ -99,7 +108,6 @@ export const AllConnectorsTab: FC = ({ const accountCount = typeConnectors.length; - const documentCount = getDocumentCountForConnector( connector.connectorType, documentTypeCounts @@ -154,7 +162,6 @@ export const AllConnectorsTab: FC = ({ const accountCount = typeConnectors.length; - const documentCount = getDocumentCountForConnector( connector.connectorType, documentTypeCounts diff --git a/surfsense_web/components/connectors/composio-drive-folder-tree.tsx b/surfsense_web/components/connectors/composio-drive-folder-tree.tsx index 72c36edd5..76ae218cb 100644 --- a/surfsense_web/components/connectors/composio-drive-folder-tree.tsx +++ b/surfsense_web/components/connectors/composio-drive-folder-tree.tsx @@ -362,4 +362,3 @@ export function ComposioDriveFolderTree({
); } - diff --git a/surfsense_web/hooks/use-composio-drive-folders.ts b/surfsense_web/hooks/use-composio-drive-folders.ts index af8da1a81..31e516286 100644 --- a/surfsense_web/hooks/use-composio-drive-folders.ts +++ b/surfsense_web/hooks/use-composio-drive-folders.ts @@ -26,4 +26,3 @@ export function useComposioDriveFolders({ retry: 2, }); } - From 6a41b0f6080f9d050a662fa8d0909f282c1c83d4 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 23 Jan 2026 14:12:57 +0530 Subject: [PATCH 08/28] feat: enhance Composio and Google Drive connector functionality - Added support for extracting connected account IDs from query parameters in the Composio callback, accommodating both camelCase and snake_case formats. - Improved logging for received query parameters in the Composio callback to enhance debugging. - Updated Google Drive folder listing logs to remove unnecessary emoji for consistency. - Expanded the connector dialog to include Composio Google Drive as a recognized connector type, improving user interface clarity. --- .../app/routes/composio_routes.py | 21 ++++++++++--------- .../google_drive_add_connector_route.py | 2 +- .../hooks/use-connector-dialog.ts | 7 +++++-- .../layout/ui/sidebar/InboxSidebar.tsx | 3 +++ 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/surfsense_backend/app/routes/composio_routes.py b/surfsense_backend/app/routes/composio_routes.py index dec9beb02..5af332760 100644 --- a/surfsense_backend/app/routes/composio_routes.py +++ b/surfsense_backend/app/routes/composio_routes.py @@ -14,7 +14,7 @@ Endpoints: import logging from uuid import UUID -from fastapi import APIRouter, Depends, HTTPException, Query +from fastapi import APIRouter, Depends, HTTPException, Query, Request from fastapi.responses import RedirectResponse from pydantic import ValidationError from sqlalchemy.exc import IntegrityError @@ -170,9 +170,8 @@ async def initiate_composio_auth( @router.get("/auth/composio/connector/callback") async def composio_callback( + request: Request, state: str | None = None, - connectedAccountId: str | None = None, # Composio sends camelCase - connected_account_id: str | None = None, # Fallback snake_case error: str | None = None, session: AsyncSession = Depends(get_async_session), ): @@ -239,14 +238,16 @@ async def composio_callback( # Initialize Composio service service = ComposioService() - entity_id = f"surfsense_{user_id}" - # Use camelCase param if provided (Composio's format), fallback to snake_case - final_connected_account_id = connectedAccountId or connected_account_id + # Extract connected_account_id from query params (accepts both camelCase and snake_case) + query_params = request.query_params + final_connected_account_id = query_params.get( + "connectedAccountId" + ) or query_params.get("connected_account_id") - # DEBUG: Log all query parameters received + # DEBUG: Log query parameter received logger.info( - f"DEBUG: Callback received - connectedAccountId: {connectedAccountId}, connected_account_id: {connected_account_id}, using: {final_connected_account_id}" + f"DEBUG: Callback received - connectedAccountId: {query_params.get('connectedAccountId')}, connected_account_id: {query_params.get('connected_account_id')}, using: {final_connected_account_id}" ) # If we still don't have a connected_account_id, warn but continue @@ -448,7 +449,7 @@ async def list_composio_drive_folders( entity_id = f"surfsense_{user.id}" # Fetch files/folders from Composio Google Drive - files, next_token, error = await 
service.get_drive_files( + files, _next_token, error = await service.get_drive_files( connected_account_id=composio_connected_account_id, entity_id=entity_id, folder_id=parent_id, @@ -502,7 +503,7 @@ async def list_composio_drive_folders( file_count = len(files_list) logger.info( - f"✅ Listed {len(items)} total items ({folder_count} folders, {file_count} files) for Composio connector {connector_id}" + f"Listed {len(items)} total items ({folder_count} folders, {file_count} files) for Composio connector {connector_id}" + (f" in folder {parent_id}" if parent_id else " in ROOT") ) diff --git a/surfsense_backend/app/routes/google_drive_add_connector_route.py b/surfsense_backend/app/routes/google_drive_add_connector_route.py index e15aed762..6b4159d29 100644 --- a/surfsense_backend/app/routes/google_drive_add_connector_route.py +++ b/surfsense_backend/app/routes/google_drive_add_connector_route.py @@ -402,7 +402,7 @@ async def list_google_drive_folders( file_count = len(items) - folder_count logger.info( - f"✅ Listed {len(items)} total items ({folder_count} folders, {file_count} files) for connector {connector_id}" + f"Listed {len(items)} total items ({folder_count} folders, {file_count} files) for connector {connector_id}" + (f" in folder {parent_id}" if parent_id else " in ROOT") ) diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index b30337de3..2923ab823 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -1182,8 +1182,11 @@ export const useConnectorDialog = () => { if (!editingConnector.is_indexable) { // Non-indexable connectors (like Tavily API) don't need re-indexing indexingDescription = "Settings saved."; - } else if (editingConnector.connector_type === "GOOGLE_DRIVE_CONNECTOR") { - // Google Drive uses folder selection from config, not date ranges + } else if ( + editingConnector.connector_type === "GOOGLE_DRIVE_CONNECTOR" || + editingConnector.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" + ) { + // Google Drive (both regular and Composio) uses folder selection from config, not date ranges const selectedFolders = (connectorConfig || editingConnector.config)?.selected_folders as | Array<{ id: string; name: string }> | undefined; diff --git a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx index a3fd3ea14..4dee8888a 100644 --- a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx @@ -79,6 +79,9 @@ function getConnectorTypeDisplayName(connectorType: string): string { GOOGLE_CALENDAR_CONNECTOR: "Google Calendar", GOOGLE_GMAIL_CONNECTOR: "Gmail", GOOGLE_DRIVE_CONNECTOR: "Google Drive", + COMPOSIO_GOOGLE_DRIVE_CONNECTOR: "Composio Google Drive", + COMPOSIO_GMAIL_CONNECTOR: "Composio Gmail", + COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: "Composio Google Calendar", LINEAR_CONNECTOR: "Linear", NOTION_CONNECTOR: "Notion", SLACK_CONNECTOR: "Slack", From 9c5c925fcaf60e2ad7c8a76a6bd66e7e5a227904 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 23 Jan 2026 14:50:06 +0530 Subject: [PATCH 09/28] feat: update Obsidian connector UI and icon --- .../components/obsidian-connect-form.tsx | 4 +- surfsense_web/lib/connectors/utils.ts | 1 + 
surfsense_web/public/connectors/obsidian.svg | 56 +++++++++++++++---- 3 files changed, 48 insertions(+), 13 deletions(-) diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/obsidian-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/obsidian-connect-form.tsx index 064e10e2f..94839b03b 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/obsidian-connect-form.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/obsidian-connect-form.tsx @@ -1,7 +1,7 @@ "use client"; import { zodResolver } from "@hookform/resolvers/zod"; -import { FolderOpen, Info } from "lucide-react"; +import { Info } from "lucide-react"; import type { FC } from "react"; import { useRef, useState } from "react"; import { useForm } from "react-hook-form"; @@ -109,7 +109,7 @@ export const ObsidianConnectForm: FC = ({ onSubmit, isSubmitti return (
-					[FolderOpen icon markup elided]
+					[Info icon markup elided]
 					Self-Hosted Only
 					[remaining JSX context elided]
diff --git a/surfsense_web/lib/connectors/utils.ts b/surfsense_web/lib/connectors/utils.ts
index a85b912ed..34721a6aa 100644
--- a/surfsense_web/lib/connectors/utils.ts
+++ b/surfsense_web/lib/connectors/utils.ts
@@ -21,6 +21,7 @@ export const getConnectorTypeDisplay = (type: string): string => {
 		ELASTICSEARCH_CONNECTOR: "Elasticsearch",
 		WEBCRAWLER_CONNECTOR: "Web Pages",
 		CIRCLEBACK_CONNECTOR: "Circleback",
+		OBSIDIAN_CONNECTOR: "Obsidian",
 	};
 	return typeMap[type] || type;
 };
diff --git a/surfsense_web/public/connectors/obsidian.svg b/surfsense_web/public/connectors/obsidian.svg
index 9fe15c4a3..b5afd5724 100644
--- a/surfsense_web/public/connectors/obsidian.svg
+++ b/surfsense_web/public/connectors/obsidian.svg
@@ -1,12 +1,46 @@
[SVG path markup elided in this capture: the 12-line icon source is replaced by a 46-line redrawn Obsidian icon]

From 29382070aaa2583a18a0b5dcb0c078d13ebb62f2 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Fri, 23 Jan 2026 18:37:09 +0530
Subject: [PATCH 10/28] feat: enhance Composio connector functionality with Google Drive delta sync support

- Added methods to retrieve the starting page token and list changes in Google Drive, enabling delta sync capabilities.
- Updated Composio service to handle file download directory configuration.
- Modified indexing tasks to support delta sync, improving efficiency by processing only changed files.
- Adjusted date handling in connector tasks to allow optional start and end dates.
- Improved error handling and logging throughout the Composio indexing process.
---
 .../app/connectors/composio_connector.py      |  49 +
 .../routes/search_source_connectors_routes.py |   4 +-
 .../app/services/composio_service.py          | 333 +++++--
 .../app/tasks/celery_tasks/connector_tasks.py |   8 +-
 .../celery_tasks/schedule_checker_task.py     |   5 +
 .../app/tasks/composio_indexer.py             | 909 +++++++++++------
 .../(manage)/components/RowActions.tsx        |   2 +-
 .../assistant-ui/connector-popup.tsx          |  13 +-
 .../views/connector-edit-view.tsx             |   2 +-
 .../views/indexing-configuration-view.tsx     |  11 +-
 .../hooks/use-connector-dialog.ts             |   5 +-
 .../tabs/active-connectors-tab.tsx            |   7 +-
 .../components/settings/llm-role-manager.tsx  |   2 +-
 .../settings/model-config-manager.tsx         |  18 +-
 .../settings/prompt-config-manager.tsx        |   2 +-
 surfsense_web/messages/en.json                |   6 +-
 16 files changed, 905 insertions(+), 471 deletions(-)

diff --git a/surfsense_backend/app/connectors/composio_connector.py b/surfsense_backend/app/connectors/composio_connector.py
index b49988887..8cb91355d 100644
--- a/surfsense_backend/app/connectors/composio_connector.py
+++ b/surfsense_backend/app/connectors/composio_connector.py
@@ -146,6 +146,55 @@ class ComposioConnector:
             file_id=file_id,
         )
 
+    async def get_drive_start_page_token(self) -> tuple[str | None, str | None]:
+        """
+        Get the starting page token for Google Drive change tracking.
+
+        Returns:
+            Tuple of (start_page_token, error message).
+ """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_drive_start_page_token( + connected_account_id=connected_account_id, + entity_id=entity_id, + ) + + async def list_drive_changes( + self, + page_token: str | None = None, + page_size: int = 100, + include_removed: bool = True, + ) -> tuple[list[dict[str, Any]], str | None, str | None]: + """ + List changes in Google Drive since the given page token. + + Args: + page_token: Page token from previous sync (optional). + page_size: Number of changes per page. + include_removed: Whether to include removed items. + + Returns: + Tuple of (changes list, new_start_page_token, error message). + """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return [], None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.list_drive_changes( + connected_account_id=connected_account_id, + entity_id=entity_id, + page_token=page_token, + page_size=page_size, + include_removed=include_removed, + ) + # ===== Gmail Methods ===== async def list_gmail_messages( diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index db1b884e0..82f452c61 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -2288,8 +2288,8 @@ async def run_composio_indexing( connector_id: int, search_space_id: int, user_id: str, - start_date: str, - end_date: str, + start_date: str | None, + end_date: str | None, ): """ Run Composio connector indexing with real-time notifications. diff --git a/surfsense_backend/app/services/composio_service.py b/surfsense_backend/app/services/composio_service.py index 0d6189cd9..3810f03a4 100644 --- a/surfsense_backend/app/services/composio_service.py +++ b/surfsense_backend/app/services/composio_service.py @@ -57,17 +57,30 @@ TOOLKIT_TO_DOCUMENT_TYPE = { class ComposioService: """Service for interacting with Composio API.""" - def __init__(self, api_key: str | None = None): + # Default download directory for files from Composio + DEFAULT_DOWNLOAD_DIR = "/tmp/composio_downloads" + + def __init__(self, api_key: str | None = None, file_download_dir: str | None = None): """ Initialize the Composio service. Args: api_key: Composio API key. If not provided, uses config.COMPOSIO_API_KEY. + file_download_dir: Directory for downloaded files. Defaults to /tmp/composio_downloads. """ + import os + self.api_key = api_key or config.COMPOSIO_API_KEY if not self.api_key: raise ValueError("COMPOSIO_API_KEY is required but not configured") - self.client = Composio(api_key=self.api_key) + + # Set up download directory + self.file_download_dir = file_download_dir or self.DEFAULT_DOWNLOAD_DIR + os.makedirs(self.file_download_dir, exist_ok=True) + + # Initialize Composio client with download directory + # Per docs: file_download_dir configures where files are downloaded + self.client = Composio(api_key=self.api_key, file_download_dir=self.file_download_dir) @staticmethod def is_enabled() -> bool: @@ -465,6 +478,10 @@ class ComposioService: """ Download file content from Google Drive via Composio. 
+ Per Composio docs: When tools return files, they are automatically downloaded + to a local directory, and the local file path is provided in the response. + Response includes: file_path, file_name, size fields. + Args: connected_account_id: Composio connected account ID. entity_id: The entity/user ID that owns the connected account. @@ -473,11 +490,13 @@ class ComposioService: Returns: Tuple of (file content bytes, error message). """ + from pathlib import Path + try: result = await self.execute_tool( connected_account_id=connected_account_id, tool_name="GOOGLEDRIVE_DOWNLOAD_FILE", - params={"file_id": file_id}, # snake_case + params={"file_id": file_id}, entity_id=entity_id, ) @@ -485,100 +504,234 @@ class ComposioService: return None, result.get("error", "Unknown error") data = result.get("data") - - # Composio GOOGLEDRIVE_DOWNLOAD_FILE returns a dict with file info - # The actual content is in "downloaded_file_content" field - if isinstance(data, dict): - # Try known Composio response fields in order of preference - content = None - - # Primary field from GOOGLEDRIVE_DOWNLOAD_FILE - if "downloaded_file_content" in data: - content = data["downloaded_file_content"] - # downloaded_file_content might itself be a dict with the actual content inside - if isinstance(content, dict): - # Try to extract actual content from nested dict - # Note: Composio nests downloaded_file_content inside another downloaded_file_content - actual_content = ( - content.get("downloaded_file_content") - or content.get("content") - or content.get("data") - or content.get("file_content") - or content.get("body") - or content.get("text") - ) - if actual_content is not None: - content = actual_content - else: - # Log structure for debugging - logger.warning( - f"downloaded_file_content is dict with keys: {list(content.keys())}" - ) - return ( - None, - f"Cannot extract content from downloaded_file_content. Keys: {list(content.keys())}", - ) - # Fallback fields for compatibility - elif "content" in data: - content = data["content"] - elif "file_content" in data: - content = data["file_content"] - elif "data" in data: - content = data["data"] - - if content is None: - # Log available keys for debugging - logger.warning(f"Composio response dict keys: {list(data.keys())}") - return ( - None, - f"No file content found in Composio response. 
Available keys: {list(data.keys())}", - ) - - # Convert content to bytes - if isinstance(content, str): - # Check if it's base64 encoded - import base64 - - try: - # Try to decode as base64 first - content = base64.b64decode(content) - except Exception: - # If not base64, encode as UTF-8 - content = content.encode("utf-8") - elif isinstance(content, bytes): - pass # Already bytes - elif isinstance(content, dict): - # Still a dict after all extraction attempts - log structure - logger.warning( - f"Content still dict after extraction: {list(content.keys())}" - ) - return ( - None, - f"Unexpected nested content structure: {list(content.keys())}", - ) - else: - return ( - None, - f"Unexpected content type in Composio response: {type(content).__name__}", - ) - - return content, None - elif isinstance(data, str): - return data.encode("utf-8"), None - elif isinstance(data, bytes): - return data, None - elif data is None: + if not data: return None, "No data returned from Composio" - else: - return ( - None, - f"Unexpected data type from Composio: {type(data).__name__}", + + # Per Composio docs, response includes file_path where file was downloaded + # Response structure: {data: {...}, error: ..., successful: ...} + # The actual file info is nested inside data["data"] + file_path = None + + if isinstance(data, dict): + # Handle nested response structure: data contains {data, error, successful} + # The actual file info is in data["data"] + inner_data = data + if "data" in data and isinstance(data["data"], dict): + inner_data = data["data"] + logger.debug(f"Found nested data structure. Inner keys: {list(inner_data.keys())}") + elif "successful" in data and "data" in data: + # Standard Composio response wrapper + inner_data = data["data"] if data["data"] else data + + # Try documented fields: file_path, downloaded_file_content, path, uri + file_path = ( + inner_data.get("file_path") or + inner_data.get("downloaded_file_content") or + inner_data.get("path") or + inner_data.get("uri") ) + + # Handle nested dict case where downloaded_file_content contains the path + if isinstance(file_path, dict): + file_path = ( + file_path.get("file_path") or + file_path.get("downloaded_file_content") or + file_path.get("path") or + file_path.get("uri") + ) + + # If still no path, check if inner_data itself has the nested structure + if not file_path and isinstance(inner_data, dict): + for key in ["downloaded_file_content", "file_path", "path", "uri"]: + if key in inner_data: + val = inner_data[key] + if isinstance(val, str): + file_path = val + break + elif isinstance(val, dict): + # One more level of nesting + file_path = ( + val.get("file_path") or + val.get("downloaded_file_content") or + val.get("path") or + val.get("uri") + ) + if file_path: + break + + logger.debug(f"Composio response keys: {list(data.keys())}, inner keys: {list(inner_data.keys()) if isinstance(inner_data, dict) else 'N/A'}, extracted path: {file_path}") + elif isinstance(data, str): + # Direct string response (could be path or content) + file_path = data + elif isinstance(data, bytes): + # Direct bytes response + return data, None + + # Read file from the path + if file_path and isinstance(file_path, str): + path_obj = Path(file_path) + + # Check if it's a valid file path (absolute or in .composio directory) + if path_obj.is_absolute() or '.composio' in str(path_obj): + try: + if path_obj.exists(): + content = path_obj.read_bytes() + logger.info(f"Successfully read {len(content)} bytes from Composio file: {file_path}") + return content, None + 
else: + logger.warning(f"File path from Composio does not exist: {file_path}") + return None, f"File not found at path: {file_path}" + except Exception as e: + logger.error(f"Failed to read file from Composio path {file_path}: {e!s}") + return None, f"Failed to read file: {e!s}" + else: + # Not a file path - might be base64 encoded content + try: + import base64 + content = base64.b64decode(file_path) + return content, None + except Exception: + # Not base64, return as UTF-8 bytes + return file_path.encode("utf-8"), None + + # If we got here, couldn't extract file path + if isinstance(data, dict): + # Log full structure for debugging + inner_data = data.get("data", {}) + logger.warning( + f"Could not extract file path from Composio response. " + f"Top keys: {list(data.keys())}, " + f"Inner data keys: {list(inner_data.keys()) if isinstance(inner_data, dict) else type(inner_data).__name__}, " + f"Full inner data: {inner_data}" + ) + return None, f"No file path in Composio response. Keys: {list(data.keys())}, inner: {list(inner_data.keys()) if isinstance(inner_data, dict) else 'N/A'}" + + return None, f"Unexpected data type from Composio: {type(data).__name__}" except Exception as e: logger.error(f"Failed to get Drive file content: {e!s}") return None, str(e) + async def get_drive_start_page_token( + self, connected_account_id: str, entity_id: str + ) -> tuple[str | None, str | None]: + """ + Get the starting page token for Google Drive change tracking. + + This token represents the current state and is used for future delta syncs. + Per Composio docs: Use GOOGLEDRIVE_GET_CHANGES_START_PAGE_TOKEN to get initial token. + + Args: + connected_account_id: Composio connected account ID. + entity_id: The entity/user ID that owns the connected account. + + Returns: + Tuple of (start_page_token, error message). + """ + try: + result = await self.execute_tool( + connected_account_id=connected_account_id, + tool_name="GOOGLEDRIVE_GET_CHANGES_START_PAGE_TOKEN", + params={}, + entity_id=entity_id, + ) + + if not result.get("success"): + return None, result.get("error", "Unknown error") + + data = result.get("data", {}) + # Handle nested response: {data: {startPageToken: ...}, successful: ...} + if isinstance(data, dict): + inner_data = data.get("data", data) + token = ( + inner_data.get("startPageToken") or + inner_data.get("start_page_token") or + data.get("startPageToken") or + data.get("start_page_token") + ) + if token: + logger.info(f"Got Drive start page token: {token}") + return token, None + + logger.warning(f"Could not extract start page token from response: {data}") + return None, "No start page token in response" + + except Exception as e: + logger.error(f"Failed to get Drive start page token: {e!s}") + return None, str(e) + + async def list_drive_changes( + self, + connected_account_id: str, + entity_id: str, + page_token: str | None = None, + page_size: int = 100, + include_removed: bool = True, + ) -> tuple[list[dict[str, Any]], str | None, str | None]: + """ + List changes in Google Drive since the given page token. + + Per Composio docs: GOOGLEDRIVE_LIST_CHANGES tracks modifications to files/folders. + If pageToken is not provided, it auto-fetches the current start page token. + Response includes nextPageToken for pagination and newStartPageToken for future syncs. + + Args: + connected_account_id: Composio connected account ID. + entity_id: The entity/user ID that owns the connected account. + page_token: Page token from previous sync (optional - will auto-fetch if not provided). 
+ page_size: Number of changes per page. + include_removed: Whether to include removed items in the response. + + Returns: + Tuple of (changes list, new_start_page_token, error message). + """ + try: + params = { + "pageSize": min(page_size, 100), + "includeRemoved": include_removed, + } + if page_token: + params["pageToken"] = page_token + + result = await self.execute_tool( + connected_account_id=connected_account_id, + tool_name="GOOGLEDRIVE_LIST_CHANGES", + params=params, + entity_id=entity_id, + ) + + if not result.get("success"): + return [], None, result.get("error", "Unknown error") + + data = result.get("data", {}) + + # Handle nested response structure + changes = [] + new_start_token = None + + if isinstance(data, dict): + inner_data = data.get("data", data) + changes = inner_data.get("changes", []) or data.get("changes", []) + + # Get the token for next sync + # newStartPageToken is returned when all changes have been fetched + # nextPageToken is for pagination within the current fetch + new_start_token = ( + inner_data.get("newStartPageToken") or + inner_data.get("new_start_page_token") or + inner_data.get("nextPageToken") or + inner_data.get("next_page_token") or + data.get("newStartPageToken") or + data.get("nextPageToken") + ) + + logger.info(f"Got {len(changes)} Drive changes, new token: {new_start_token[:20] if new_start_token else 'None'}...") + return changes, new_start_token, None + + except Exception as e: + logger.error(f"Failed to list Drive changes: {e!s}") + return [], None, str(e) + # ===== Gmail specific methods ===== async def get_gmail_messages( diff --git a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py index 81cafaa2c..d0710d246 100644 --- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py @@ -810,8 +810,8 @@ def index_composio_connector_task( connector_id: int, search_space_id: int, user_id: str, - start_date: str, - end_date: str, + start_date: str | None, + end_date: str | None, ): """Celery task to index Composio connector content (Google Drive, Gmail, Calendar via Composio).""" import asyncio @@ -833,8 +833,8 @@ async def _index_composio_connector( connector_id: int, search_space_id: int, user_id: str, - start_date: str, - end_date: str, + start_date: str | None, + end_date: str | None, ): """Index Composio connector content with new session and real-time notifications.""" # Import from routes to use the notification-wrapped version diff --git a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py index 21855f73f..bf80cbe78 100644 --- a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py +++ b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py @@ -66,6 +66,7 @@ async def _check_and_trigger_schedules(): from app.tasks.celery_tasks.connector_tasks import ( index_airtable_records_task, index_clickup_tasks_task, + index_composio_connector_task, index_confluence_pages_task, index_crawled_urls_task, index_discord_messages_task, @@ -98,6 +99,10 @@ async def _check_and_trigger_schedules(): SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task, SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task, SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR: index_google_drive_files_task, + # Composio connector types + 
SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: index_composio_connector_task, + SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR: index_composio_connector_task, + SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: index_composio_connector_task, } # Trigger indexing for each due connector diff --git a/surfsense_backend/app/tasks/composio_indexer.py b/surfsense_backend/app/tasks/composio_indexer.py index e5c8b701e..3eed8470e 100644 --- a/surfsense_backend/app/tasks/composio_indexer.py +++ b/surfsense_backend/app/tasks/composio_indexer.py @@ -561,8 +561,12 @@ async def _index_composio_google_drive( update_last_indexed: bool = True, max_items: int = 1000, ) -> tuple[int, str]: - """Index Google Drive files via Composio. + """Index Google Drive files via Composio with delta sync support. + Delta Sync Flow: + 1. First sync: Full scan + get initial page token + 2. Subsequent syncs: Use LIST_CHANGES to process only changed files + Supports folder/file selection via connector config: - selected_folders: List of {id, name} for folders to index - selected_files: List of {id, name} for individual files to index @@ -576,354 +580,88 @@ async def _index_composio_google_drive( selected_folders = connector_config.get("selected_folders", []) selected_files = connector_config.get("selected_files", []) indexing_options = connector_config.get("indexing_options", {}) + + # Check for stored page token for delta sync + stored_page_token = connector_config.get("drive_page_token") + use_delta_sync = stored_page_token and connector.last_indexed_at max_files_per_folder = indexing_options.get("max_files_per_folder", 100) include_subfolders = indexing_options.get("include_subfolders", True) - await task_logger.log_task_progress( - log_entry, - f"Fetching Google Drive files via Composio for connector {connector_id}", - { - "stage": "fetching_files", - "selected_folders": len(selected_folders), - "selected_files": len(selected_files), - }, - ) - - all_files = [] - - # If specific folders/files are selected, fetch from those - if selected_folders or selected_files: - # Fetch files from selected folders - for folder in selected_folders: - folder_id = folder.get("id") - folder_name = folder.get("name", "Unknown") - - if not folder_id: - continue - - # Handle special case for "root" folder - actual_folder_id = None if folder_id == "root" else folder_id - - logger.info(f"Fetching files from folder: {folder_name} ({folder_id})") - - # Fetch files from this folder - folder_files = [] - page_token = None - - while len(folder_files) < max_files_per_folder: - ( - files, - next_token, - error, - ) = await composio_connector.list_drive_files( - folder_id=actual_folder_id, - page_token=page_token, - page_size=min(100, max_files_per_folder - len(folder_files)), - ) - - if error: - logger.warning( - f"Failed to fetch files from folder {folder_name}: {error}" - ) - break - - # Process files - for file_info in files: - mime_type = file_info.get("mimeType", "") or file_info.get( - "mime_type", "" - ) - - # If it's a folder and include_subfolders is enabled, recursively fetch - if mime_type == "application/vnd.google-apps.folder": - if include_subfolders: - # Add subfolder files recursively - subfolder_files = await _fetch_folder_files_recursively( - composio_connector, - file_info.get("id"), - max_files=max_files_per_folder, - current_count=len(folder_files), - ) - folder_files.extend(subfolder_files) - else: - folder_files.append(file_info) - - if not next_token: - break - page_token = next_token - - 
all_files.extend(folder_files[:max_files_per_folder]) - logger.info(f"Found {len(folder_files)} files in folder {folder_name}") - - # Add specifically selected files - for selected_file in selected_files: - file_id = selected_file.get("id") - file_name = selected_file.get("name", "Unknown") - - if not file_id: - continue - - # Add file info (we'll fetch content later during indexing) - all_files.append( - { - "id": file_id, - "name": file_name, - "mimeType": "", # Will be determined later - } - ) - else: - # No selection specified - fetch all files (original behavior) - page_token = None - - while len(all_files) < max_items: - files, next_token, error = await composio_connector.list_drive_files( - page_token=page_token, - page_size=min(100, max_items - len(all_files)), - ) - - if error: - await task_logger.log_task_failure( - log_entry, f"Failed to fetch Drive files: {error}", {} - ) - return 0, f"Failed to fetch Drive files: {error}" - - all_files.extend(files) - - if not next_token: - break - page_token = next_token - - if not all_files: - success_msg = "No Google Drive files found" - await task_logger.log_task_success( - log_entry, success_msg, {"files_count": 0} + # Route to delta sync or full scan + if use_delta_sync: + logger.info(f"Using delta sync for Composio Google Drive connector {connector_id}") + await task_logger.log_task_progress( + log_entry, + f"Starting delta sync for Google Drive via Composio (connector {connector_id})", + {"stage": "delta_sync", "token": stored_page_token[:20] + "..."}, + ) + + documents_indexed, documents_skipped, processing_errors = await _index_composio_drive_delta_sync( + session=session, + composio_connector=composio_connector, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + page_token=stored_page_token, + max_items=max_items, + task_logger=task_logger, + log_entry=log_entry, + ) + else: + logger.info(f"Using full scan for Composio Google Drive connector {connector_id} (first sync or no token)") + await task_logger.log_task_progress( + log_entry, + f"Fetching Google Drive files via Composio for connector {connector_id}", + { + "stage": "full_scan", + "selected_folders": len(selected_folders), + "selected_files": len(selected_files), + }, + ) + + documents_indexed, documents_skipped, processing_errors = await _index_composio_drive_full_scan( + session=session, + composio_connector=composio_connector, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + selected_folders=selected_folders, + selected_files=selected_files, + max_files_per_folder=max_files_per_folder, + include_subfolders=include_subfolders, + max_items=max_items, + task_logger=task_logger, + log_entry=log_entry, ) - # CRITICAL: Update timestamp even when no files found so Electric SQL syncs and UI shows indexed status - await update_connector_last_indexed(session, connector, update_last_indexed) - await session.commit() - return ( - 0, - None, - ) # Return None (not error) when no items found - this is success with 0 items - logger.info(f"Found {len(all_files)} Google Drive files to index via Composio") + # Get new page token for next sync (always update after successful sync) + new_token, token_error = await composio_connector.get_drive_start_page_token() + if new_token and not token_error: + from sqlalchemy.orm.attributes import flag_modified + + # Refresh connector to avoid stale state + await session.refresh(connector) + + if not connector.config: + connector.config = {} + connector.config["drive_page_token"] = 
new_token + flag_modified(connector, "config") + logger.info(f"Updated drive_page_token for connector {connector_id}") + elif token_error: + logger.warning(f"Failed to get new page token: {token_error}") - documents_indexed = 0 - documents_skipped = 0 - processing_errors = [] - - for file_info in all_files: - try: - # Handle both standard Google API and potential Composio variations - file_id = file_info.get("id", "") or file_info.get("fileId", "") - file_name = ( - file_info.get("name", "") - or file_info.get("fileName", "") - or "Untitled" - ) - mime_type = file_info.get("mimeType", "") or file_info.get( - "mime_type", "" - ) - - if not file_id: - documents_skipped += 1 - continue - - # Skip folders - if mime_type == "application/vnd.google-apps.folder": - continue - - # Generate unique identifier hash - document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]) - unique_identifier_hash = generate_unique_identifier_hash( - document_type, f"drive_{file_id}", search_space_id - ) - - # Check if document exists - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - # Get file content - ( - content, - content_error, - ) = await composio_connector.get_drive_file_content(file_id) - - if content_error or not content: - logger.warning( - f"Could not get content for file {file_name}: {content_error}" - ) - # Use metadata as content fallback - markdown_content = f"# {file_name}\n\n" - markdown_content += f"**File ID:** {file_id}\n" - markdown_content += f"**Type:** {mime_type}\n" - elif isinstance(content, dict): - # Safety check: if content is still a dict, log error and use fallback - error_msg = f"Unexpected dict content format for file {file_name}: {list(content.keys())}" - logger.error(error_msg) - processing_errors.append(error_msg) - markdown_content = f"# {file_name}\n\n" - markdown_content += f"**File ID:** {file_id}\n" - markdown_content += f"**Type:** {mime_type}\n" - else: - # Process content based on file type - markdown_content = await _process_file_content( - content=content, - file_name=file_name, - file_id=file_id, - mime_type=mime_type, - search_space_id=search_space_id, - user_id=user_id, - session=session, - task_logger=task_logger, - log_entry=log_entry, - processing_errors=processing_errors, - ) - - content_hash = generate_content_hash(markdown_content, search_space_id) - - if existing_document: - if existing_document.content_hash == content_hash: - documents_skipped += 1 - continue - - # Update existing document - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "file_id": file_id, - "file_name": file_name, - "mime_type": mime_type, - "document_type": "Google Drive File (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Google Drive File: {file_name}\n\nType: {mime_type}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - existing_document.title = f"Drive: {file_name}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "file_id": file_id, - "file_name": file_name, - "mime_type": mime_type, - "connector_id": connector_id, - "source": "composio", - } - 
existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - - # Batch commit every 10 documents - if documents_indexed % 10 == 0: - logger.info( - f"Committing batch: {documents_indexed} Google Drive files processed so far" - ) - await session.commit() - continue - - # Create new document - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "file_id": file_id, - "file_name": file_name, - "mime_type": mime_type, - "document_type": "Google Drive File (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Google Drive File: {file_name}\n\nType: {mime_type}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - document = Document( - search_space_id=search_space_id, - title=f"Drive: {file_name}", - document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]), - document_metadata={ - "file_id": file_id, - "file_name": file_name, - "mime_type": mime_type, - "connector_id": connector_id, - "toolkit_id": "googledrive", - "source": "composio", - }, - content=summary_content, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - ) - session.add(document) - documents_indexed += 1 - - # Batch commit every 10 documents - if documents_indexed % 10 == 0: - logger.info( - f"Committing batch: {documents_indexed} Google Drive files processed so far" - ) - await session.commit() - - except Exception as e: - error_msg = ( - f"Error processing Drive file {file_name or 'unknown'}: {e!s}" - ) - logger.error(error_msg, exc_info=True) - processing_errors.append(error_msg) - documents_skipped += 1 - continue - - # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs - # This ensures the UI shows "Last indexed" instead of "Never indexed" + # CRITICAL: Always update timestamp so Electric SQL syncs and UI shows indexed status await update_connector_last_indexed(session, connector, update_last_indexed) - # Final commit to ensure all documents are persisted (safety net) - # This matches the pattern used in non-Composio Gmail indexer - logger.info( - f"Final commit: Total {documents_indexed} Google Drive files processed" - ) + # Final commit + logger.info(f"Final commit: Total {documents_indexed} Google Drive files processed") await session.commit() - logger.info( - "Successfully committed all Composio Google Drive document changes to database" - ) + logger.info("Successfully committed all Composio Google Drive document changes to database") - # If there were processing errors, return them so notification can show them + # Handle processing errors error_message = None if processing_errors: - # Combine all errors into a single message if len(processing_errors) == 1: error_message = processing_errors[0] else: @@ -934,6 +672,7 @@ async def _index_composio_google_drive( { "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "sync_type": "delta" if use_delta_sync else "full", "errors": processing_errors, }, ) @@ -944,6 +683,7 @@ async def _index_composio_google_drive( { "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "sync_type": "delta" if use_delta_sync 
else "full", }, ) @@ -954,6 +694,469 @@ async def _index_composio_google_drive( return 0, f"Failed to index Google Drive via Composio: {e!s}" +async def _index_composio_drive_delta_sync( + session: AsyncSession, + composio_connector: ComposioConnector, + connector_id: int, + search_space_id: int, + user_id: str, + page_token: str, + max_items: int, + task_logger: TaskLoggingService, + log_entry, +) -> tuple[int, int, list[str]]: + """Index Google Drive files using delta sync (only changed files). + + Uses GOOGLEDRIVE_LIST_CHANGES to fetch only files that changed since last sync. + Handles: new files, modified files, and deleted files. + """ + documents_indexed = 0 + documents_skipped = 0 + processing_errors = [] + + # Fetch all changes with pagination + all_changes = [] + current_token = page_token + + while len(all_changes) < max_items: + changes, next_token, error = await composio_connector.list_drive_changes( + page_token=current_token, + page_size=100, + include_removed=True, + ) + + if error: + logger.error(f"Error fetching Drive changes: {error}") + processing_errors.append(f"Failed to fetch changes: {error}") + break + + all_changes.extend(changes) + + if not next_token or next_token == current_token: + break + current_token = next_token + + if not all_changes: + logger.info("No changes detected since last sync") + return 0, 0, [] + + logger.info(f"Processing {len(all_changes)} changes from delta sync") + + for change in all_changes[:max_items]: + try: + # Handle removed files + is_removed = change.get("removed", False) + file_info = change.get("file", {}) + file_id = change.get("fileId") or file_info.get("id", "") + + if not file_id: + documents_skipped += 1 + continue + + # Check if file was trashed or removed + if is_removed or file_info.get("trashed", False): + # Remove document from database + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]) + unique_identifier_hash = generate_unique_identifier_hash( + document_type, f"drive_{file_id}", search_space_id + ) + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + if existing_document: + await session.delete(existing_document) + documents_indexed += 1 + logger.info(f"Deleted document for removed/trashed file: {file_id}") + continue + + # Process changed file + file_name = file_info.get("name", "") or "Untitled" + mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "") + + # Skip folders + if mime_type == "application/vnd.google-apps.folder": + continue + + # Process the file + indexed, skipped, errors = await _process_single_drive_file( + session=session, + composio_connector=composio_connector, + file_id=file_id, + file_name=file_name, + mime_type=mime_type, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + task_logger=task_logger, + log_entry=log_entry, + ) + + documents_indexed += indexed + documents_skipped += skipped + processing_errors.extend(errors) + + # Batch commit every 10 documents + if documents_indexed > 0 and documents_indexed % 10 == 0: + await session.commit() + logger.info(f"Committed batch: {documents_indexed} changes processed") + + except Exception as e: + error_msg = f"Error processing change for file {file_id}: {e!s}" + logger.error(error_msg, exc_info=True) + processing_errors.append(error_msg) + documents_skipped += 1 + + logger.info(f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped") + return documents_indexed, documents_skipped, processing_errors + + 
+async def _index_composio_drive_full_scan( + session: AsyncSession, + composio_connector: ComposioConnector, + connector_id: int, + search_space_id: int, + user_id: str, + selected_folders: list[dict], + selected_files: list[dict], + max_files_per_folder: int, + include_subfolders: bool, + max_items: int, + task_logger: TaskLoggingService, + log_entry, +) -> tuple[int, int, list[str]]: + """Index Google Drive files using full scan (first sync or when no delta token).""" + documents_indexed = 0 + documents_skipped = 0 + processing_errors = [] + + all_files = [] + + # If specific folders/files are selected, fetch from those + if selected_folders or selected_files: + # Fetch files from selected folders + for folder in selected_folders: + folder_id = folder.get("id") + folder_name = folder.get("name", "Unknown") + + if not folder_id: + continue + + # Handle special case for "root" folder + actual_folder_id = None if folder_id == "root" else folder_id + + logger.info(f"Fetching files from folder: {folder_name} ({folder_id})") + + # Fetch files from this folder + folder_files = [] + page_token = None + + while len(folder_files) < max_files_per_folder: + ( + files, + next_token, + error, + ) = await composio_connector.list_drive_files( + folder_id=actual_folder_id, + page_token=page_token, + page_size=min(100, max_files_per_folder - len(folder_files)), + ) + + if error: + logger.warning( + f"Failed to fetch files from folder {folder_name}: {error}" + ) + break + + # Process files + for file_info in files: + mime_type = file_info.get("mimeType", "") or file_info.get( + "mime_type", "" + ) + + # If it's a folder and include_subfolders is enabled, recursively fetch + if mime_type == "application/vnd.google-apps.folder": + if include_subfolders: + # Add subfolder files recursively + subfolder_files = await _fetch_folder_files_recursively( + composio_connector, + file_info.get("id"), + max_files=max_files_per_folder, + current_count=len(folder_files), + ) + folder_files.extend(subfolder_files) + else: + folder_files.append(file_info) + + if not next_token: + break + page_token = next_token + + all_files.extend(folder_files[:max_files_per_folder]) + logger.info(f"Found {len(folder_files)} files in folder {folder_name}") + + # Add specifically selected files + for selected_file in selected_files: + file_id = selected_file.get("id") + file_name = selected_file.get("name", "Unknown") + + if not file_id: + continue + + # Add file info (we'll fetch content later during indexing) + all_files.append( + { + "id": file_id, + "name": file_name, + "mimeType": "", # Will be determined later + } + ) + else: + # No selection specified - fetch all files (original behavior) + page_token = None + + while len(all_files) < max_items: + files, next_token, error = await composio_connector.list_drive_files( + page_token=page_token, + page_size=min(100, max_items - len(all_files)), + ) + + if error: + return 0, 0, [f"Failed to fetch Drive files: {error}"] + + all_files.extend(files) + + if not next_token: + break + page_token = next_token + + if not all_files: + logger.info("No Google Drive files found") + return 0, 0, [] + + logger.info(f"Found {len(all_files)} Google Drive files to index via Composio (full scan)") + + for file_info in all_files: + try: + # Handle both standard Google API and potential Composio variations + file_id = file_info.get("id", "") or file_info.get("fileId", "") + file_name = ( + file_info.get("name", "") + or file_info.get("fileName", "") + or "Untitled" + ) + mime_type = 
file_info.get("mimeType", "") or file_info.get( + "mime_type", "" + ) + + if not file_id: + documents_skipped += 1 + continue + + # Skip folders + if mime_type == "application/vnd.google-apps.folder": + continue + + # Process the file + indexed, skipped, errors = await _process_single_drive_file( + session=session, + composio_connector=composio_connector, + file_id=file_id, + file_name=file_name, + mime_type=mime_type, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + task_logger=task_logger, + log_entry=log_entry, + ) + + documents_indexed += indexed + documents_skipped += skipped + processing_errors.extend(errors) + + # Batch commit every 10 documents + if documents_indexed > 0 and documents_indexed % 10 == 0: + logger.info(f"Committing batch: {documents_indexed} Google Drive files processed so far") + await session.commit() + + except Exception as e: + error_msg = f"Error processing Drive file {file_name or 'unknown'}: {e!s}" + logger.error(error_msg, exc_info=True) + processing_errors.append(error_msg) + documents_skipped += 1 + + logger.info(f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped") + return documents_indexed, documents_skipped, processing_errors + + +async def _process_single_drive_file( + session: AsyncSession, + composio_connector: ComposioConnector, + file_id: str, + file_name: str, + mime_type: str, + connector_id: int, + search_space_id: int, + user_id: str, + task_logger: TaskLoggingService, + log_entry, +) -> tuple[int, int, list[str]]: + """Process a single Google Drive file for indexing. + + Returns: + Tuple of (documents_indexed, documents_skipped, processing_errors) + """ + processing_errors = [] + + # Generate unique identifier hash + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]) + unique_identifier_hash = generate_unique_identifier_hash( + document_type, f"drive_{file_id}", search_space_id + ) + + # Check if document exists + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + # Get file content + content, content_error = await composio_connector.get_drive_file_content(file_id) + + if content_error or not content: + logger.warning( + f"Could not get content for file {file_name}: {content_error}" + ) + # Use metadata as content fallback + markdown_content = f"# {file_name}\n\n" + markdown_content += f"**File ID:** {file_id}\n" + markdown_content += f"**Type:** {mime_type}\n" + elif isinstance(content, dict): + # Safety check: if content is still a dict, log error and use fallback + error_msg = f"Unexpected dict content format for file {file_name}: {list(content.keys())}" + logger.error(error_msg) + processing_errors.append(error_msg) + markdown_content = f"# {file_name}\n\n" + markdown_content += f"**File ID:** {file_id}\n" + markdown_content += f"**Type:** {mime_type}\n" + else: + # Process content based on file type + markdown_content = await _process_file_content( + content=content, + file_name=file_name, + file_id=file_id, + mime_type=mime_type, + search_space_id=search_space_id, + user_id=user_id, + session=session, + task_logger=task_logger, + log_entry=log_entry, + processing_errors=processing_errors, + ) + + content_hash = generate_content_hash(markdown_content, search_space_id) + + if existing_document: + if existing_document.content_hash == content_hash: + return 0, 1, processing_errors # Skipped + + # Update existing document + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + 
+ if user_llm: + document_metadata = { + "file_id": file_id, + "file_name": file_name, + "mime_type": mime_type, + "document_type": "Google Drive File (Composio)", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = ( + f"Google Drive File: {file_name}\n\nType: {mime_type}" + ) + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(markdown_content) + + existing_document.title = f"Drive: {file_name}" + existing_document.content = summary_content + existing_document.content_hash = content_hash + existing_document.embedding = summary_embedding + existing_document.document_metadata = { + "file_id": file_id, + "file_name": file_name, + "FILE_NAME": file_name, # For compatibility + "mime_type": mime_type, + "connector_id": connector_id, + "source": "composio", + } + existing_document.chunks = chunks + existing_document.updated_at = get_current_timestamp() + + return 1, 0, processing_errors # Indexed + + # Create new document + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata = { + "file_id": file_id, + "file_name": file_name, + "mime_type": mime_type, + "document_type": "Google Drive File (Composio)", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = ( + f"Google Drive File: {file_name}\n\nType: {mime_type}" + ) + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(markdown_content) + + document = Document( + search_space_id=search_space_id, + title=f"Drive: {file_name}", + document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]), + document_metadata={ + "file_id": file_id, + "file_name": file_name, + "FILE_NAME": file_name, # For compatibility + "mime_type": mime_type, + "connector_id": connector_id, + "toolkit_id": "googledrive", + "source": "composio", + }, + content=summary_content, + content_hash=content_hash, + unique_identifier_hash=unique_identifier_hash, + embedding=summary_embedding, + chunks=chunks, + updated_at=get_current_timestamp(), + ) + session.add(document) + + return 1, 0, processing_errors # Indexed + + async def _fetch_folder_files_recursively( composio_connector: ComposioConnector, folder_id: str, @@ -1271,11 +1474,18 @@ async def _index_composio_gmail( if end_date == "undefined" or end_date == "": end_date = None - # Calculate date range with defaults (uses last_indexed_at or 365 days back) - # This ensures indexing works even when user doesn't specify dates - start_date_str, end_date_str = calculate_date_range( - connector, start_date, end_date, default_days_back=365 - ) + # Use provided dates directly if both are provided, otherwise calculate from last_indexed_at + # This ensures user-selected dates are respected (matching non-Composio Gmail connector behavior) + if start_date is not None and end_date is not None: + # User provided both dates - use them directly + start_date_str = start_date + end_date_str = end_date + else: + # Calculate date range with defaults (uses last_indexed_at or 365 days back) + # This ensures indexing works even when user doesn't specify dates + start_date_str, end_date_str = calculate_date_range( + connector, start_date, end_date, default_days_back=365 + ) # Build query with date range 
query_parts = [] @@ -1468,11 +1678,18 @@ async def _index_composio_google_calendar( if end_date == "undefined" or end_date == "": end_date = None - # Calculate date range with defaults (uses last_indexed_at or 365 days back) - # This ensures indexing works even when user doesn't specify dates - start_date_str, end_date_str = calculate_date_range( - connector, start_date, end_date, default_days_back=365 - ) + # Use provided dates directly if both are provided, otherwise calculate from last_indexed_at + # This ensures user-selected dates are respected (matching non-Composio Calendar connector behavior) + if start_date is not None and end_date is not None: + # User provided both dates - use them directly + start_date_str = start_date + end_date_str = end_date + else: + # Calculate date range with defaults (uses last_indexed_at or 365 days back) + # This ensures indexing works even when user doesn't specify dates + start_date_str, end_date_str = calculate_date_range( + connector, start_date, end_date, default_days_back=365 + ) # Build time range for API call time_min = f"{start_date_str}T00:00:00Z" diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx index d277a84ee..d9a894e5a 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx @@ -209,7 +209,7 @@ export function RowActions({ disabled={isDeleting} className="bg-destructive text-destructive-foreground hover:bg-destructive/90" > - {isDeleting ? "Deleting..." : "Delete"} + {isDeleting ? "Deleting" : "Delete"} diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index a1108f7c8..045c3c586 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -16,7 +16,7 @@ import { ConnectorDialogHeader } from "./connector-popup/components/connector-di import { ConnectorConnectView } from "./connector-popup/connector-configs/views/connector-connect-view"; import { ConnectorEditView } from "./connector-popup/connector-configs/views/connector-edit-view"; import { IndexingConfigurationView } from "./connector-popup/connector-configs/views/indexing-configuration-view"; -import { OAUTH_CONNECTORS } from "./connector-popup/constants/connector-constants"; +import { COMPOSIO_CONNECTORS, OAUTH_CONNECTORS } from "./connector-popup/constants/connector-constants"; import { useConnectorDialog } from "./connector-popup/hooks/use-connector-dialog"; import { useIndexingConnectors } from "./connector-popup/hooks/use-indexing-connectors"; import { ActiveConnectorsTab } from "./connector-popup/tabs/active-connectors-tab"; @@ -196,9 +196,14 @@ export const ConnectorIndicator: FC = () => { onBack={handleBackFromAccountsList} onManage={handleStartEdit} onAddAccount={() => { - const oauthConnector = OAUTH_CONNECTORS.find( - (c) => c.connectorType === viewingAccountsType.connectorType - ); + // Check both OAUTH_CONNECTORS and COMPOSIO_CONNECTORS + const oauthConnector = + OAUTH_CONNECTORS.find( + (c) => c.connectorType === viewingAccountsType.connectorType + ) || + COMPOSIO_CONNECTORS.find( + (c) => c.connectorType === viewingAccountsType.connectorType + ); if (oauthConnector) { handleConnectOAuth(oauthConnector); } diff --git 
a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx index 71258a519..234898922 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx @@ -341,7 +341,7 @@ export const ConnectorEditView: FC = ({ {isSaving ? ( <> - Saving... + Saving ) : ( "Save Changes" diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx index ea489aec8..68fc688c3 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx @@ -9,7 +9,11 @@ import { getConnectorTypeDisplay } from "@/lib/connectors/utils"; import { cn } from "@/lib/utils"; import { DateRangeSelector } from "../../components/date-range-selector"; import { PeriodicSyncConfig } from "../../components/periodic-sync-config"; -import { type IndexingConfigState, OAUTH_CONNECTORS } from "../../constants/connector-constants"; +import { + COMPOSIO_CONNECTORS, + type IndexingConfigState, + OAUTH_CONNECTORS, +} from "../../constants/connector-constants"; import { getConnectorDisplayName } from "../../tabs/all-connectors-tab"; import { getConnectorConfigComponent } from "../index"; @@ -91,7 +95,10 @@ export const IndexingConfigurationView: FC = ({ }; }, [checkScrollState]); - const authConnector = OAUTH_CONNECTORS.find((c) => c.connectorType === connector?.connector_type); + // Check both OAUTH_CONNECTORS and COMPOSIO_CONNECTORS + const authConnector = + OAUTH_CONNECTORS.find((c) => c.connectorType === connector?.connector_type) || + COMPOSIO_CONNECTORS.find((c) => c.connectorType === connector?.connector_type); return (
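The OAUTH_CONNECTORS.find(...) || COMPOSIO_CONNECTORS.find(...) fallback recurs across the popup, the indexing view, the dialog hook, and the active-connectors tab. A shared helper would keep those call sites in sync; a minimal sketch, assuming both constant arrays expose the same id/connectorType shape (findAuthConnector is a hypothetical name, not part of this patch series):

    import {
        COMPOSIO_CONNECTORS,
        OAUTH_CONNECTORS,
    } from "../constants/connector-constants";

    // Resolve a connector definition from either registry, checking OAuth first.
    type AuthConnector =
        | (typeof OAUTH_CONNECTORS)[number]
        | (typeof COMPOSIO_CONNECTORS)[number];

    export function findAuthConnector(
        predicate: (c: AuthConnector) => boolean
    ): AuthConnector | undefined {
        return OAUTH_CONNECTORS.find(predicate) ?? COMPOSIO_CONNECTORS.find(predicate);
    }

    // Example usage, mirroring the call sites above:
    // findAuthConnector((c) => c.connectorType === connector?.connector_type);
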
diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index 2923ab823..a2b1168bd 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -191,7 +191,10 @@ export const useConnectorDialog = () => { // Handle configure view (for page refresh support) if (params.view === "configure" && params.connector && !indexingConfig && allConnectors) { - const oauthConnector = OAUTH_CONNECTORS.find((c) => c.id === params.connector); + // Check both OAUTH_CONNECTORS and COMPOSIO_CONNECTORS + const oauthConnector = + OAUTH_CONNECTORS.find((c) => c.id === params.connector) || + COMPOSIO_CONNECTORS.find((c) => c.id === params.connector); if (oauthConnector) { let existingConnector: SearchSourceConnector | undefined; if (params.connectorId) { diff --git a/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx b/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx index a518d63a6..e45888bb1 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx @@ -13,7 +13,7 @@ import type { SearchSourceConnector } from "@/contracts/types/connector.types"; import type { LogActiveTask, LogSummary } from "@/contracts/types/log.types"; import { connectorsApiService } from "@/lib/apis/connectors-api.service"; import { cn } from "@/lib/utils"; -import { OAUTH_CONNECTORS } from "../constants/connector-constants"; +import { COMPOSIO_CONNECTORS, OAUTH_CONNECTORS } from "../constants/connector-constants"; import { getDocumentCountForConnector } from "../utils/connector-document-mapping"; interface ActiveConnectorsTabProps { @@ -113,7 +113,10 @@ export const ActiveConnectorsTab: FC = ({ // Get display info for OAuth connector type const getOAuthConnectorTypeInfo = (connectorType: string) => { - const oauthConnector = OAUTH_CONNECTORS.find((c) => c.connectorType === connectorType); + // Check both OAUTH_CONNECTORS and COMPOSIO_CONNECTORS + const oauthConnector = + OAUTH_CONNECTORS.find((c) => c.connectorType === connectorType) || + COMPOSIO_CONNECTORS.find((c) => c.connectorType === connectorType); return { title: oauthConnector?.title || diff --git a/surfsense_web/components/settings/llm-role-manager.tsx b/surfsense_web/components/settings/llm-role-manager.tsx index ba4c4970c..c41a2d3bf 100644 --- a/surfsense_web/components/settings/llm-role-manager.tsx +++ b/surfsense_web/components/settings/llm-role-manager.tsx @@ -398,7 +398,7 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) { className="flex items-center gap-2 text-xs md:text-sm h-9 md:h-10" > - {isSaving ? "Saving..." : "Save Changes"} + {isSaving ? "Saving" : "Save Changes"}
diff --git a/surfsense_web/messages/en.json b/surfsense_web/messages/en.json index 94e44c8ec..8ca382669 100644 --- a/surfsense_web/messages/en.json +++ b/surfsense_web/messages/en.json @@ -157,7 +157,7 @@ "delete_note": "Delete Note", "delete_note_confirm": "Are you sure you want to delete", "action_cannot_undone": "This action cannot be undone.", - "deleting": "Deleting...", + "deleting": "Deleting", "surfsense_dashboard": "SurfSense Dashboard", "welcome_message": "Welcome to your SurfSense dashboard.", "your_search_spaces": "Your Search Spaces", @@ -498,7 +498,7 @@ "base": "Base", "all_roles_assigned": "All roles are assigned and ready to use! Your LLM configuration is complete.", "save_changes": "Save Changes", - "saving": "Saving...", + "saving": "Saving", "reset": "Reset", "status": "Status", "status_ready": "Ready", @@ -548,7 +548,7 @@ "log_deleted_error": "Failed to delete log", "confirm_delete_log_title": "Are you sure?", "confirm_delete_log_desc": "This action cannot be undone. This will permanently delete the log entry.", - "deleting": "Deleting..." + "deleting": "Deleting" }, "onboard": { "welcome_title": "Welcome to SurfSense", From 8d8f69545ee869242fe27fb2f4d4512429cdb240 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 23 Jan 2026 18:57:10 +0530 Subject: [PATCH 11/28] feat: improve Google Calendar and Gmail connectors with enhanced error handling - Added user-friendly re-authentication messages for expired or revoked tokens in both Google Calendar and Gmail connectors. - Updated error handling in indexing tasks to log specific authentication errors and provide clearer feedback to users. - Enhanced the connector UI to handle indexing failures more effectively, improving overall user experience. --- .../connectors/google_calendar_connector.py | 14 +++++++ .../app/connectors/google_gmail_connector.py | 14 +++++++ .../google_calendar_indexer.py | 15 +++++-- .../google_gmail_indexer.py | 14 ++++++- .../assistant-ui/connector-popup.tsx | 18 ++++++-- .../views/connector-edit-view.tsx | 2 +- .../hooks/use-connector-dialog.ts | 6 ++- .../hooks/use-indexing-connectors.ts | 42 ++++++++++++++++++- 8 files changed, 113 insertions(+), 12 deletions(-) diff --git a/surfsense_backend/app/connectors/google_calendar_connector.py b/surfsense_backend/app/connectors/google_calendar_connector.py index 6d389ddd5..ac60b02a8 100644 --- a/surfsense_backend/app/connectors/google_calendar_connector.py +++ b/surfsense_backend/app/connectors/google_calendar_connector.py @@ -142,6 +142,12 @@ class GoogleCalendarConnector: flag_modified(connector, "config") await self._session.commit() except Exception as e: + error_str = str(e) + # Check if this is an invalid_grant error (token expired/revoked) + if "invalid_grant" in error_str.lower() or "token has been expired or revoked" in error_str.lower(): + raise Exception( + "Google Calendar authentication failed. Please re-authenticate." 
+ ) from e raise Exception( f"Failed to refresh Google OAuth credentials: {e!s}" ) from e @@ -165,6 +171,10 @@ class GoogleCalendarConnector: self.service = build("calendar", "v3", credentials=credentials) return self.service except Exception as e: + error_str = str(e) + # If the error already contains a user-friendly re-authentication message, preserve it + if "re-authenticate" in error_str.lower() or "expired or been revoked" in error_str.lower() or "authentication failed" in error_str.lower(): + raise Exception(error_str) from e raise Exception(f"Failed to create Google Calendar service: {e!s}") from e async def get_calendars(self) -> tuple[list[dict[str, Any]], str | None]: @@ -271,6 +281,10 @@ class GoogleCalendarConnector: return events, None except Exception as e: + error_str = str(e) + # If the error already contains a user-friendly re-authentication message, preserve it + if "re-authenticate" in error_str.lower() or "expired or been revoked" in error_str.lower() or "authentication failed" in error_str.lower(): + return [], error_str return [], f"Error fetching events: {e!s}" def format_event_to_markdown(self, event: dict[str, Any]) -> str: diff --git a/surfsense_backend/app/connectors/google_gmail_connector.py b/surfsense_backend/app/connectors/google_gmail_connector.py index 10008ad73..8c0e4690e 100644 --- a/surfsense_backend/app/connectors/google_gmail_connector.py +++ b/surfsense_backend/app/connectors/google_gmail_connector.py @@ -141,6 +141,12 @@ class GoogleGmailConnector: flag_modified(connector, "config") await self._session.commit() except Exception as e: + error_str = str(e) + # Check if this is an invalid_grant error (token expired/revoked) + if "invalid_grant" in error_str.lower() or "token has been expired or revoked" in error_str.lower(): + raise Exception( + "Gmail authentication failed. Please re-authenticate." 
+ ) from e raise Exception( f"Failed to refresh Google OAuth credentials: {e!s}" ) from e @@ -164,6 +170,10 @@ class GoogleGmailConnector: self.service = build("gmail", "v1", credentials=credentials) return self.service except Exception as e: + error_str = str(e) + # If the error already contains a user-friendly re-authentication message, preserve it + if "re-authenticate" in error_str.lower() or "expired or been revoked" in error_str.lower() or "authentication failed" in error_str.lower(): + raise Exception(error_str) from e raise Exception(f"Failed to create Gmail service: {e!s}") from e async def get_user_profile(self) -> tuple[dict[str, Any], str | None]: @@ -225,6 +235,10 @@ class GoogleGmailConnector: return messages, None except Exception as e: + error_str = str(e) + # If the error already contains a user-friendly re-authentication message, preserve it + if "re-authenticate" in error_str.lower() or "expired or been revoked" in error_str.lower() or "authentication failed" in error_str.lower(): + return [], error_str return [], f"Error fetching messages list: {e!s}" async def get_message_details( diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py index b8c0e564d..09bb8de4b 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py @@ -246,13 +246,20 @@ async def index_google_calendar_events( ) return 0, None else: + # Check if this is an authentication error that requires re-authentication + error_message = error + error_type = "APIError" + if "re-authenticate" in error.lower() or "expired or been revoked" in error.lower() or "authentication failed" in error.lower(): + error_message = "Google Calendar authentication failed. Please re-authenticate." + error_type = "AuthenticationError" + await task_logger.log_task_failure( log_entry, - f"Failed to get Google Calendar events: {error}", - "API Error", - {"error_type": "APIError"}, + error_message, + error, + {"error_type": error_type}, ) - return 0, f"Failed to get Google Calendar events: {error}" + return 0, error_message logger.info(f"Retrieved {len(events)} events from Google Calendar API") diff --git a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py index e10297057..6a3057437 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py @@ -170,10 +170,20 @@ async def index_google_gmail_messages( ) if error: + # Check if this is an authentication error that requires re-authentication + error_message = error + error_type = "APIError" + if "re-authenticate" in error.lower() or "expired or been revoked" in error.lower() or "authentication failed" in error.lower(): + error_message = "Gmail authentication failed. Please re-authenticate." 
+ error_type = "AuthenticationError" + await task_logger.log_task_failure( - log_entry, f"Failed to fetch messages: {error}", {} + log_entry, + error_message, + error, + {"error_type": error_type} ) - return 0, f"Failed to fetch Gmail messages: {error}" + return 0, error_message if not messages: success_msg = "No Google gmail messages found in the specified date range" diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index 045c3c586..a04e2a9fd 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -5,12 +5,14 @@ import { Cable, Loader2 } from "lucide-react"; import { useSearchParams } from "next/navigation"; import type { FC } from "react"; import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms"; +import { currentUserAtom } from "@/atoms/user/user-query.atoms"; import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button"; import { Dialog, DialogContent } from "@/components/ui/dialog"; import { Tabs, TabsContent } from "@/components/ui/tabs"; import type { SearchSourceConnector } from "@/contracts/types/connector.types"; import { useConnectorsElectric } from "@/hooks/use-connectors-electric"; import { useDocumentsElectric } from "@/hooks/use-documents-electric"; +import { useInbox } from "@/hooks/use-inbox"; import { cn } from "@/lib/utils"; import { ConnectorDialogHeader } from "./connector-popup/components/connector-dialog-header"; import { ConnectorConnectView } from "./connector-popup/connector-configs/views/connector-connect-view"; @@ -27,10 +29,18 @@ import { YouTubeCrawlerView } from "./connector-popup/views/youtube-crawler-view export const ConnectorIndicator: FC = () => { const searchSpaceId = useAtomValue(activeSearchSpaceIdAtom); const searchParams = useSearchParams(); + const { data: currentUser } = useAtomValue(currentUserAtom); // Fetch document type counts using Electric SQL + PGlite for real-time updates const { documentTypeCounts, loading: documentTypesLoading } = useDocumentsElectric(searchSpaceId); + // Fetch notifications to detect indexing failures + const { inboxItems = [] } = useInbox( + currentUser?.id ?? null, + searchSpaceId ? Number(searchSpaceId) : null, + "connector_indexing" + ); + // Check if YouTube view is active const isYouTubeView = searchParams.get("view") === "youtube"; @@ -116,8 +126,10 @@ export const ConnectorIndicator: FC = () => { }; // Track indexing state locally - clears automatically when Electric SQL detects last_indexed_at changed - const { indexingConnectorIds, startIndexing } = useIndexingConnectors( - connectors as SearchSourceConnector[] + // Also clears when failed notifications are detected + const { indexingConnectorIds, startIndexing, stopIndexing } = useIndexingConnectors( + connectors as SearchSourceConnector[], + inboxItems ); const isLoading = connectorsLoading || documentTypesLoading; @@ -246,7 +258,7 @@ export const ConnectorIndicator: FC = () => { editingConnector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" ? 
() => { startIndexing(editingConnector.id); - handleQuickIndexConnector(editingConnector.id, editingConnector.connector_type); + handleQuickIndexConnector(editingConnector.id, editingConnector.connector_type, stopIndexing); } : undefined } diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx index 234898922..fbdffed7a 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx @@ -97,7 +97,7 @@ export const ConnectorEditView: FC = ({ }; }, [checkScrollState]); - // Reset local quick indexing state when indexing completes + // Reset local quick indexing state when indexing completes or fails useEffect(() => { if (!isIndexing) { setIsQuickIndexing(false); diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index a2b1168bd..f505d8f83 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -1375,7 +1375,7 @@ export const useConnectorDialog = () => { // Handle quick index (index without date picker, uses backend defaults) const handleQuickIndexConnector = useCallback( - async (connectorId: number, connectorType?: string) => { + async (connectorId: number, connectorType?: string, stopIndexing?: (id: number) => void) => { if (!searchSpaceId) return; // Track quick index clicked event @@ -1401,6 +1401,10 @@ export const useConnectorDialog = () => { } catch (error) { console.error("Error indexing connector content:", error); toast.error(error instanceof Error ? error.message : "Failed to start indexing"); + // Stop indexing state on error + if (stopIndexing) { + stopIndexing(connectorId); + } } }, [searchSpaceId, indexConnector] diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts index 2ac8d340a..e82a8eb29 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts @@ -2,6 +2,8 @@ import { useCallback, useEffect, useRef, useState } from "react"; import type { SearchSourceConnector } from "@/contracts/types/connector.types"; +import type { InboxItem } from "@/contracts/types/inbox.types"; +import { isConnectorIndexingMetadata } from "@/contracts/types/inbox.types"; /** * Hook to track which connectors are currently indexing using local state. @@ -9,10 +11,14 @@ import type { SearchSourceConnector } from "@/contracts/types/connector.types"; * This provides a better UX than polling by: * 1. Setting indexing state immediately when user triggers indexing (optimistic) * 2. Clearing indexing state when Electric SQL detects last_indexed_at changed + * 3. Clearing indexing state when a failed notification is detected * * The actual `last_indexed_at` value comes from Electric SQL/PGlite, not local state. 
 */
-export function useIndexingConnectors(connectors: SearchSourceConnector[]) {
+export function useIndexingConnectors(
+ connectors: SearchSourceConnector[],
+ inboxItems?: InboxItem[]
+) {
 // Set of connector IDs that are currently indexing
 const [indexingConnectorIds, setIndexingConnectorIds] = useState<Set<number>>(new Set());
 
@@ -48,6 +54,40 @@ export function useIndexingConnectors(connectors: SearchSourceConnector[]) {
 }
 }, [connectors, indexingConnectorIds]);
 
+ // Detect failed notifications and stop indexing state
+ useEffect(() => {
+ if (!inboxItems || inboxItems.length === 0) return;
+
+ const newIndexingIds = new Set(indexingConnectorIds);
+ let hasChanges = false;
+
+ for (const item of inboxItems) {
+ // Only check connector_indexing notifications
+ if (item.type !== "connector_indexing") continue;
+
+ // Check if this notification indicates a failure
+ const metadata = isConnectorIndexingMetadata(item.metadata)
+ ? item.metadata
+ : null;
+ if (!metadata) continue;
+
+ // Check if status is "failed" or if there's an error_message
+ const isFailed =
+ metadata.status === "failed" ||
+ (metadata.error_message && metadata.error_message.trim().length > 0);
+
+ // If failed and connector is in indexing state, clear it
+ if (isFailed && indexingConnectorIds.has(metadata.connector_id)) {
+ newIndexingIds.delete(metadata.connector_id);
+ hasChanges = true;
+ }
+ }
+
+ if (hasChanges) {
+ setIndexingConnectorIds(newIndexingIds);
+ }
+ }, [inboxItems, indexingConnectorIds]);
+
 // Add a connector to the indexing set (called when indexing starts)
 const startIndexing = useCallback((connectorId: number) => {
 setIndexingConnectorIds((prev) => {

From 1343fabeee9cfe7d101e031abb00f3dbd29ad631 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Fri, 23 Jan 2026 19:56:19 +0530
Subject: [PATCH 12/28] feat: refactor composio connectors for modularity

---
 .../app/connectors/composio_connector.py | 366 +---
 .../connectors/composio_gmail_connector.py | 614 ++++++
 .../composio_google_calendar_connector.py | 453 ++++
 .../composio_google_drive_connector.py | 1162 +++++++++++
 .../app/routes/composio_routes.py | 19 +-
 .../app/services/composio_service.py | 21 +
 .../app/tasks/composio_indexer.py | 1819 +----------------
 .../components/composio-connector-card.tsx | 78 -
 .../components/composio-calendar-config.tsx | 220 ++
 .../components/composio-config.tsx | 353 ----
 .../components/composio-drive-config.tsx | 313 +++
 .../components/composio-gmail-config.tsx | 174 ++
 .../connector-configs/index.tsx | 8 +-
 .../views/connector-edit-view.tsx | 4 +-
 .../views/indexing-configuration-view.tsx | 20 +-
 .../hooks/use-connector-dialog.ts | 113 +-
 surfsense_web/lib/connectors/utils.ts | 3 +
 17 files changed, 3128 insertions(+), 2612 deletions(-)
 create mode 100644 surfsense_backend/app/connectors/composio_gmail_connector.py
 create mode 100644 surfsense_backend/app/connectors/composio_google_calendar_connector.py
 create mode 100644 surfsense_backend/app/connectors/composio_google_drive_connector.py
 delete mode 100644 surfsense_web/components/assistant-ui/connector-popup/components/composio-connector-card.tsx
 create mode 100644 surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx
 delete mode 100644 surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx
 create mode 100644 surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx 
create mode 100644 surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx diff --git a/surfsense_backend/app/connectors/composio_connector.py b/surfsense_backend/app/connectors/composio_connector.py index 8cb91355d..301296378 100644 --- a/surfsense_backend/app/connectors/composio_connector.py +++ b/surfsense_backend/app/connectors/composio_connector.py @@ -1,7 +1,7 @@ """ -Composio Connector Module. +Composio Connector Base Module. -Provides a unified interface for interacting with various services via Composio, +Provides a base class for interacting with various services via Composio, primarily used during indexing operations. """ @@ -19,10 +19,10 @@ logger = logging.getLogger(__name__) class ComposioConnector: """ - Generic Composio connector for data retrieval. + Base Composio connector for data retrieval. Wraps the ComposioService to provide toolkit-specific data access - for indexing operations. + for indexing operations. Subclasses implement toolkit-specific methods. """ def __init__( @@ -89,354 +89,12 @@ class ComposioConnector: toolkit_id = await self.get_toolkit_id() return toolkit_id in INDEXABLE_TOOLKITS - # ===== Google Drive Methods ===== + @property + def session(self) -> AsyncSession: + """Get the database session.""" + return self._session - async def list_drive_files( - self, - folder_id: str | None = None, - page_token: str | None = None, - page_size: int = 100, - ) -> tuple[list[dict[str, Any]], str | None, str | None]: - """ - List files from Google Drive via Composio. - - Args: - folder_id: Optional folder ID to list contents of. - page_token: Pagination token. - page_size: Number of files per page. - - Returns: - Tuple of (files list, next_page_token, error message). - """ - connected_account_id = await self.get_connected_account_id() - if not connected_account_id: - return [], None, "No connected account ID found" - - entity_id = await self.get_entity_id() - service = await self._get_service() - return await service.get_drive_files( - connected_account_id=connected_account_id, - entity_id=entity_id, - folder_id=folder_id, - page_token=page_token, - page_size=page_size, - ) - - async def get_drive_file_content( - self, file_id: str - ) -> tuple[bytes | None, str | None]: - """ - Download file content from Google Drive via Composio. - - Args: - file_id: Google Drive file ID. - - Returns: - Tuple of (file content bytes, error message). - """ - connected_account_id = await self.get_connected_account_id() - if not connected_account_id: - return None, "No connected account ID found" - - entity_id = await self.get_entity_id() - service = await self._get_service() - return await service.get_drive_file_content( - connected_account_id=connected_account_id, - entity_id=entity_id, - file_id=file_id, - ) - - async def get_drive_start_page_token(self) -> tuple[str | None, str | None]: - """ - Get the starting page token for Google Drive change tracking. - - Returns: - Tuple of (start_page_token, error message). 
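The slimmed-down base class above now carries only identity and session plumbing, with everything toolkit-shaped moving into subclasses. A minimal sketch of the resulting shape, using hypothetical stand-in names (BaseConnector, GmailConnector) and a stubbed service call so it runs standalone:

    import asyncio
    from typing import Any


    class BaseConnector:
        """Stand-in for ComposioConnector: session and connector identity only."""

        def __init__(self, session: Any, connector_id: int) -> None:
            self._session = session
            self._connector_id = connector_id

        @property
        def connector_id(self) -> int:
            return self._connector_id


    class GmailConnector(BaseConnector):
        """Toolkit-specific subclass: adds Gmail-only data access on top of the base."""

        async def list_messages(self, query: str = "") -> list[dict[str, Any]]:
            # The real method resolves the connected account and delegates to
            # ComposioService; stubbed here so the sketch is runnable.
            return [{"messageId": "m1", "query": query}]


    print(asyncio.run(GmailConnector(session=None, connector_id=1).list_messages()))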
- """ - connected_account_id = await self.get_connected_account_id() - if not connected_account_id: - return None, "No connected account ID found" - - entity_id = await self.get_entity_id() - service = await self._get_service() - return await service.get_drive_start_page_token( - connected_account_id=connected_account_id, - entity_id=entity_id, - ) - - async def list_drive_changes( - self, - page_token: str | None = None, - page_size: int = 100, - include_removed: bool = True, - ) -> tuple[list[dict[str, Any]], str | None, str | None]: - """ - List changes in Google Drive since the given page token. - - Args: - page_token: Page token from previous sync (optional). - page_size: Number of changes per page. - include_removed: Whether to include removed items. - - Returns: - Tuple of (changes list, new_start_page_token, error message). - """ - connected_account_id = await self.get_connected_account_id() - if not connected_account_id: - return [], None, "No connected account ID found" - - entity_id = await self.get_entity_id() - service = await self._get_service() - return await service.list_drive_changes( - connected_account_id=connected_account_id, - entity_id=entity_id, - page_token=page_token, - page_size=page_size, - include_removed=include_removed, - ) - - # ===== Gmail Methods ===== - - async def list_gmail_messages( - self, - query: str = "", - max_results: int = 50, - page_token: str | None = None, - ) -> tuple[list[dict[str, Any]], str | None, int | None, str | None]: - """ - List Gmail messages via Composio with pagination support. - - Args: - query: Gmail search query. - max_results: Maximum number of messages per page (default: 50). - page_token: Optional pagination token for next page. - - Returns: - Tuple of (messages list, next_page_token, result_size_estimate, error message). - """ - connected_account_id = await self.get_connected_account_id() - if not connected_account_id: - return [], None, None, "No connected account ID found" - - entity_id = await self.get_entity_id() - service = await self._get_service() - return await service.get_gmail_messages( - connected_account_id=connected_account_id, - entity_id=entity_id, - query=query, - max_results=max_results, - page_token=page_token, - ) - - async def get_gmail_message_detail( - self, message_id: str - ) -> tuple[dict[str, Any] | None, str | None]: - """ - Get full details of a Gmail message via Composio. - - Args: - message_id: Gmail message ID. - - Returns: - Tuple of (message details, error message). - """ - connected_account_id = await self.get_connected_account_id() - if not connected_account_id: - return None, "No connected account ID found" - - entity_id = await self.get_entity_id() - service = await self._get_service() - return await service.get_gmail_message_detail( - connected_account_id=connected_account_id, - entity_id=entity_id, - message_id=message_id, - ) - - # ===== Google Calendar Methods ===== - - async def list_calendar_events( - self, - time_min: str | None = None, - time_max: str | None = None, - max_results: int = 250, - ) -> tuple[list[dict[str, Any]], str | None]: - """ - List Google Calendar events via Composio. - - Args: - time_min: Start time (RFC3339 format). - time_max: End time (RFC3339 format). - max_results: Maximum number of events. - - Returns: - Tuple of (events list, error message). 
- """ - connected_account_id = await self.get_connected_account_id() - if not connected_account_id: - return [], "No connected account ID found" - - entity_id = await self.get_entity_id() - service = await self._get_service() - return await service.get_calendar_events( - connected_account_id=connected_account_id, - entity_id=entity_id, - time_min=time_min, - time_max=time_max, - max_results=max_results, - ) - - # ===== Utility Methods ===== - - def format_gmail_message_to_markdown(self, message: dict[str, Any]) -> str: - """ - Format a Gmail message to markdown. - - Args: - message: Message object from Composio's GMAIL_FETCH_EMAILS response. - Composio structure: messageId, messageText, messageTimestamp, - payload.headers, labelIds, attachmentList - - Returns: - Formatted markdown string. - """ - try: - # Composio uses 'messageId' (camelCase) - message_id = message.get("messageId", "") or message.get("id", "") - label_ids = message.get("labelIds", []) - - # Extract headers from payload - payload = message.get("payload", {}) - headers = payload.get("headers", []) - - # Parse headers into a dict - header_dict = {} - for header in headers: - name = header.get("name", "").lower() - value = header.get("value", "") - header_dict[name] = value - - # Extract key information - subject = header_dict.get("subject", "No Subject") - from_email = header_dict.get("from", "Unknown Sender") - to_email = header_dict.get("to", "Unknown Recipient") - # Composio provides messageTimestamp directly - date_str = message.get("messageTimestamp", "") or header_dict.get( - "date", "Unknown Date" - ) - - # Build markdown content - markdown_content = f"# {subject}\n\n" - markdown_content += f"**From:** {from_email}\n" - markdown_content += f"**To:** {to_email}\n" - markdown_content += f"**Date:** {date_str}\n" - - if label_ids: - markdown_content += f"**Labels:** {', '.join(label_ids)}\n" - - markdown_content += "\n---\n\n" - - # Composio provides full message text in 'messageText' - message_text = message.get("messageText", "") - if message_text: - markdown_content += f"## Content\n\n{message_text}\n\n" - else: - # Fallback to snippet if no messageText - snippet = message.get("snippet", "") - if snippet: - markdown_content += f"## Preview\n\n{snippet}\n\n" - - # Add attachment info if present - attachments = message.get("attachmentList", []) - if attachments: - markdown_content += "## Attachments\n\n" - for att in attachments: - att_name = att.get("filename", att.get("name", "Unknown")) - markdown_content += f"- {att_name}\n" - markdown_content += "\n" - - # Add message metadata - markdown_content += "## Message Details\n\n" - markdown_content += f"- **Message ID:** {message_id}\n" - - return markdown_content - - except Exception as e: - return f"Error formatting message to markdown: {e!s}" - - def format_calendar_event_to_markdown(self, event: dict[str, Any]) -> str: - """ - Format a Google Calendar event to markdown. - - Args: - event: Event object from Google Calendar API. - - Returns: - Formatted markdown string. 
- """ - from datetime import datetime - - try: - # Extract basic event information - summary = event.get("summary", "No Title") - description = event.get("description", "") - location = event.get("location", "") - - # Extract start and end times - start = event.get("start", {}) - end = event.get("end", {}) - - start_time = start.get("dateTime") or start.get("date", "") - end_time = end.get("dateTime") or end.get("date", "") - - # Format times for display - def format_time(time_str: str) -> str: - if not time_str: - return "Unknown" - try: - if "T" in time_str: - dt = datetime.fromisoformat(time_str.replace("Z", "+00:00")) - return dt.strftime("%Y-%m-%d %H:%M") - return time_str - except Exception: - return time_str - - start_formatted = format_time(start_time) - end_formatted = format_time(end_time) - - # Extract attendees - attendees = event.get("attendees", []) - attendee_list = [] - for attendee in attendees: - email = attendee.get("email", "") - display_name = attendee.get("displayName", email) - response_status = attendee.get("responseStatus", "") - attendee_list.append(f"- {display_name} ({response_status})") - - # Build markdown content - markdown_content = f"# {summary}\n\n" - markdown_content += f"**Start:** {start_formatted}\n" - markdown_content += f"**End:** {end_formatted}\n" - - if location: - markdown_content += f"**Location:** {location}\n" - - markdown_content += "\n" - - if description: - markdown_content += f"## Description\n\n{description}\n\n" - - if attendee_list: - markdown_content += "## Attendees\n\n" - markdown_content += "\n".join(attendee_list) - markdown_content += "\n\n" - - # Add event metadata - markdown_content += "## Event Details\n\n" - markdown_content += f"- **Event ID:** {event.get('id', 'Unknown')}\n" - markdown_content += f"- **Created:** {event.get('created', 'Unknown')}\n" - markdown_content += f"- **Updated:** {event.get('updated', 'Unknown')}\n" - - return markdown_content - - except Exception as e: - return f"Error formatting event to markdown: {e!s}" + @property + def connector_id(self) -> int: + """Get the connector ID.""" + return self._connector_id diff --git a/surfsense_backend/app/connectors/composio_gmail_connector.py b/surfsense_backend/app/connectors/composio_gmail_connector.py new file mode 100644 index 000000000..5a9645a66 --- /dev/null +++ b/surfsense_backend/app/connectors/composio_gmail_connector.py @@ -0,0 +1,614 @@ +""" +Composio Gmail Connector Module. + +Provides Gmail specific methods for data retrieval and indexing via Composio. 
+""" + +import logging +from datetime import UTC, datetime +from typing import Any + +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select +from sqlalchemy.orm import selectinload + +from app.config import config +from app.connectors.composio_connector import ComposioConnector +from app.db import Document, DocumentType +from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE +from app.services.llm_service import get_user_long_context_llm +from app.services.task_logging_service import TaskLoggingService +from app.tasks.connector_indexers.base import calculate_date_range +from app.utils.document_converters import ( + create_document_chunks, + generate_content_hash, + generate_document_summary, + generate_unique_identifier_hash, +) + +logger = logging.getLogger(__name__) + + +def get_current_timestamp() -> datetime: + """Get the current timestamp with timezone for updated_at field.""" + return datetime.now(UTC) + + +async def check_document_by_unique_identifier( + session: AsyncSession, unique_identifier_hash: str +) -> Document | None: + """Check if a document with the given unique identifier hash already exists.""" + existing_doc_result = await session.execute( + select(Document) + .options(selectinload(Document.chunks)) + .where(Document.unique_identifier_hash == unique_identifier_hash) + ) + return existing_doc_result.scalars().first() + + +async def update_connector_last_indexed( + session: AsyncSession, + connector, + update_last_indexed: bool = True, +) -> None: + """Update the last_indexed_at timestamp for a connector.""" + if update_last_indexed: + connector.last_indexed_at = datetime.now(UTC) + logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") + + +class ComposioGmailConnector(ComposioConnector): + """ + Gmail specific Composio connector. + + Provides methods for listing messages, getting message details, and formatting + Gmail messages from Gmail via Composio. + """ + + async def list_gmail_messages( + self, + query: str = "", + max_results: int = 50, + page_token: str | None = None, + ) -> tuple[list[dict[str, Any]], str | None, int | None, str | None]: + """ + List Gmail messages via Composio with pagination support. + + Args: + query: Gmail search query. + max_results: Maximum number of messages per page (default: 50). + page_token: Optional pagination token for next page. + + Returns: + Tuple of (messages list, next_page_token, result_size_estimate, error message). + """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return [], None, None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_gmail_messages( + connected_account_id=connected_account_id, + entity_id=entity_id, + query=query, + max_results=max_results, + page_token=page_token, + ) + + async def get_gmail_message_detail( + self, message_id: str + ) -> tuple[dict[str, Any] | None, str | None]: + """ + Get full details of a Gmail message via Composio. + + Args: + message_id: Gmail message ID. + + Returns: + Tuple of (message details, error message). 
+ """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_gmail_message_detail( + connected_account_id=connected_account_id, + entity_id=entity_id, + message_id=message_id, + ) + + def format_gmail_message_to_markdown(self, message: dict[str, Any]) -> str: + """ + Format a Gmail message to markdown. + + Args: + message: Message object from Composio's GMAIL_FETCH_EMAILS response. + Composio structure: messageId, messageText, messageTimestamp, + payload.headers, labelIds, attachmentList + + Returns: + Formatted markdown string. + """ + try: + # Composio uses 'messageId' (camelCase) + message_id = message.get("messageId", "") or message.get("id", "") + label_ids = message.get("labelIds", []) + + # Extract headers from payload + payload = message.get("payload", {}) + headers = payload.get("headers", []) + + # Parse headers into a dict + header_dict = {} + for header in headers: + name = header.get("name", "").lower() + value = header.get("value", "") + header_dict[name] = value + + # Extract key information + subject = header_dict.get("subject", "No Subject") + from_email = header_dict.get("from", "Unknown Sender") + to_email = header_dict.get("to", "Unknown Recipient") + # Composio provides messageTimestamp directly + date_str = message.get("messageTimestamp", "") or header_dict.get( + "date", "Unknown Date" + ) + + # Build markdown content + markdown_content = f"# {subject}\n\n" + markdown_content += f"**From:** {from_email}\n" + markdown_content += f"**To:** {to_email}\n" + markdown_content += f"**Date:** {date_str}\n" + + if label_ids: + markdown_content += f"**Labels:** {', '.join(label_ids)}\n" + + markdown_content += "\n---\n\n" + + # Composio provides full message text in 'messageText' + message_text = message.get("messageText", "") + if message_text: + markdown_content += f"## Content\n\n{message_text}\n\n" + else: + # Fallback to snippet if no messageText + snippet = message.get("snippet", "") + if snippet: + markdown_content += f"## Preview\n\n{snippet}\n\n" + + # Add attachment info if present + attachments = message.get("attachmentList", []) + if attachments: + markdown_content += "## Attachments\n\n" + for att in attachments: + att_name = att.get("filename", att.get("name", "Unknown")) + markdown_content += f"- {att_name}\n" + markdown_content += "\n" + + # Add message metadata + markdown_content += "## Message Details\n\n" + markdown_content += f"- **Message ID:** {message_id}\n" + + return markdown_content + + except Exception as e: + return f"Error formatting message to markdown: {e!s}" + + +# ============ Indexer Functions ============ + + +async def _process_gmail_message_batch( + session: AsyncSession, + messages: list[dict[str, Any]], + composio_connector: ComposioGmailConnector, + connector_id: int, + search_space_id: int, + user_id: str, + total_documents_indexed: int = 0, +) -> tuple[int, int]: + """ + Process a batch of Gmail messages and index them. + + Args: + total_documents_indexed: Running total of documents indexed so far (for batch commits). 
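The batch processor below commits every 10 documents so a crash loses at most one batch, and total_documents_indexed is threaded in so the modulo counts across calls rather than per batch. The cadence in isolation, with a stubbed index_one standing in for the per-message work:

    from typing import Any


    async def index_one(session: Any, item: Any) -> None:
        """Stub: the real work builds or updates a Document row."""


    async def process_batch(session: Any, items: list, total_so_far: int) -> int:
        """Index items, committing once every 10 documents counted across batches."""
        indexed = 0
        for item in items:
            await index_one(session, item)
            indexed += 1
            if (total_so_far + indexed) % 10 == 0:
                await session.commit()  # checkpoint progress incrementally
        return indexed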
+ + Returns: + Tuple of (documents_indexed, documents_skipped) + """ + documents_indexed = 0 + documents_skipped = 0 + + for message in messages: + try: + # Composio uses 'messageId' (camelCase), not 'id' + message_id = message.get("messageId", "") or message.get("id", "") + if not message_id: + documents_skipped += 1 + continue + + # Composio's GMAIL_FETCH_EMAILS already returns full message content + # No need for a separate detail API call + + # Extract message info from Composio response + # Composio structure: messageId, messageText, messageTimestamp, payload.headers, labelIds + payload = message.get("payload", {}) + headers = payload.get("headers", []) + + subject = "No Subject" + sender = "Unknown Sender" + date_str = message.get("messageTimestamp", "Unknown Date") + + for header in headers: + name = header.get("name", "").lower() + value = header.get("value", "") + if name == "subject": + subject = value + elif name == "from": + sender = value + elif name == "date": + date_str = value + + # Format to markdown using the full message data + markdown_content = composio_connector.format_gmail_message_to_markdown( + message + ) + + # Check for empty content (defensive parsing per Composio best practices) + if not markdown_content.strip(): + logger.warning(f"Skipping Gmail message with no content: {subject}") + documents_skipped += 1 + continue + + # Generate unique identifier + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]) + unique_identifier_hash = generate_unique_identifier_hash( + document_type, f"gmail_{message_id}", search_space_id + ) + + content_hash = generate_content_hash(markdown_content, search_space_id) + + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + # Get label IDs from Composio response + label_ids = message.get("labelIds", []) + # Extract thread_id if available (for consistency with non-Composio implementation) + thread_id = message.get("threadId", "") or message.get("thread_id", "") + + if existing_document: + if existing_document.content_hash == content_hash: + documents_skipped += 1 + continue + + # Update existing + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata = { + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "document_type": "Gmail Message (Composio)", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = ( + f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" + ) + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(markdown_content) + + existing_document.title = f"Gmail: {subject}" + existing_document.content = summary_content + existing_document.content_hash = content_hash + existing_document.embedding = summary_embedding + existing_document.document_metadata = { + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "date": date_str, + "labels": label_ids, + "connector_id": connector_id, + "source": "composio", + } + existing_document.chunks = chunks + existing_document.updated_at = get_current_timestamp() + + documents_indexed += 1 + + # Batch commit every 10 documents + current_total = total_documents_indexed + documents_indexed + if current_total % 10 == 0: + logger.info( + f"Committing batch: {current_total} Gmail messages 
processed so far" + ) + await session.commit() + continue + + # Create new document + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata = { + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "document_type": "Gmail Message (Composio)", + } + summary_content, summary_embedding = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = ( + f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" + ) + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(markdown_content) + + document = Document( + search_space_id=search_space_id, + title=f"Gmail: {subject}", + document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]), + document_metadata={ + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "date": date_str, + "labels": label_ids, + "connector_id": connector_id, + "toolkit_id": "gmail", + "source": "composio", + }, + content=summary_content, + content_hash=content_hash, + unique_identifier_hash=unique_identifier_hash, + embedding=summary_embedding, + chunks=chunks, + updated_at=get_current_timestamp(), + ) + session.add(document) + documents_indexed += 1 + + # Batch commit every 10 documents + current_total = total_documents_indexed + documents_indexed + if current_total % 10 == 0: + logger.info( + f"Committing batch: {current_total} Gmail messages processed so far" + ) + await session.commit() + + except Exception as e: + logger.error(f"Error processing Gmail message: {e!s}", exc_info=True) + documents_skipped += 1 + # Rollback on error to avoid partial state (per Composio best practices) + try: + await session.rollback() + except Exception as rollback_error: + logger.error( + f"Error during rollback: {rollback_error!s}", exc_info=True + ) + continue + + return documents_indexed, documents_skipped + + +async def index_composio_gmail( + session: AsyncSession, + connector, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str | None, + end_date: str | None, + task_logger: TaskLoggingService, + log_entry, + update_last_indexed: bool = True, + max_items: int = 1000, +) -> tuple[int, str]: + """Index Gmail messages via Composio with pagination and incremental processing.""" + try: + composio_connector = ComposioGmailConnector(session, connector_id) + + # Normalize date values - handle "undefined" strings from frontend + if start_date == "undefined" or start_date == "": + start_date = None + if end_date == "undefined" or end_date == "": + end_date = None + + # Use provided dates directly if both are provided, otherwise calculate from last_indexed_at + # This ensures user-selected dates are respected (matching non-Composio Gmail connector behavior) + if start_date is not None and end_date is not None: + # User provided both dates - use them directly + start_date_str = start_date + end_date_str = end_date + else: + # Calculate date range with defaults (uses last_indexed_at or 365 days back) + # This ensures indexing works even when user doesn't specify dates + start_date_str, end_date_str = calculate_date_range( + connector, start_date, end_date, default_days_back=365 + ) + + # Build query with date range + query_parts = [] + if start_date_str: + query_parts.append(f"after:{start_date_str.replace('-', '/')}") + if end_date_str: + query_parts.append(f"before:{end_date_str.replace('-', 
'/')}") + query = " ".join(query_parts) if query_parts else "" + + logger.info( + f"Gmail query for connector {connector_id}: '{query}' " + f"(start_date={start_date_str}, end_date={end_date_str})" + ) + + # Use smaller batch size to avoid 413 payload too large errors + batch_size = 50 + page_token = None + total_documents_indexed = 0 + total_documents_skipped = 0 + total_messages_fetched = 0 + result_size_estimate = None # Will be set from first API response + + while total_messages_fetched < max_items: + # Calculate how many messages to fetch in this batch + remaining = max_items - total_messages_fetched + current_batch_size = min(batch_size, remaining) + + # Use result_size_estimate if available, otherwise fall back to max_items + estimated_total = ( + result_size_estimate if result_size_estimate is not None else max_items + ) + # Cap estimated_total at max_items to avoid showing misleading progress + estimated_total = min(estimated_total, max_items) + + await task_logger.log_task_progress( + log_entry, + f"Fetching Gmail messages batch via Composio for connector {connector_id} " + f"({total_messages_fetched}/{estimated_total} fetched, {total_documents_indexed} indexed)", + { + "stage": "fetching_messages", + "batch_size": current_batch_size, + "total_fetched": total_messages_fetched, + "total_indexed": total_documents_indexed, + "estimated_total": estimated_total, + }, + ) + + # Fetch batch of messages + ( + messages, + next_token, + result_size_estimate_batch, + error, + ) = await composio_connector.list_gmail_messages( + query=query, + max_results=current_batch_size, + page_token=page_token, + ) + + if error: + await task_logger.log_task_failure( + log_entry, f"Failed to fetch Gmail messages: {error}", {} + ) + return 0, f"Failed to fetch Gmail messages: {error}" + + if not messages: + # No more messages available + break + + # Update result_size_estimate from first response (Gmail provides this estimate) + if result_size_estimate is None and result_size_estimate_batch is not None: + result_size_estimate = result_size_estimate_batch + logger.info( + f"Gmail API estimated {result_size_estimate} total messages for query: '{query}'" + ) + + total_messages_fetched += len(messages) + # Recalculate estimated_total after potentially updating result_size_estimate + estimated_total = ( + result_size_estimate if result_size_estimate is not None else max_items + ) + estimated_total = min(estimated_total, max_items) + + logger.info( + f"Fetched batch of {len(messages)} Gmail messages " + f"(total: {total_messages_fetched}/{estimated_total})" + ) + + # Process batch incrementally + batch_indexed, batch_skipped = await _process_gmail_message_batch( + session=session, + messages=messages, + composio_connector=composio_connector, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + total_documents_indexed=total_documents_indexed, + ) + + total_documents_indexed += batch_indexed + total_documents_skipped += batch_skipped + + logger.info( + f"Processed batch: {batch_indexed} indexed, {batch_skipped} skipped " + f"(total: {total_documents_indexed} indexed, {total_documents_skipped} skipped)" + ) + + # Batch commits happen in _process_gmail_message_batch every 10 documents + # This ensures progress is saved incrementally, preventing data loss on crashes + + # Check if we should continue + if not next_token: + # No more pages available + break + + if len(messages) < current_batch_size: + # Last page had fewer items than requested, we're done + break + + # Continue with 
next page + page_token = next_token + + if total_messages_fetched == 0: + success_msg = "No Gmail messages found in the specified date range" + await task_logger.log_task_success( + log_entry, success_msg, {"messages_count": 0} + ) + # CRITICAL: Update timestamp even when no messages found so Electric SQL syncs and UI shows indexed status + await update_connector_last_indexed(session, connector, update_last_indexed) + await session.commit() + return 0, None # Return None (not error) when no items found + + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) + + # Final commit to ensure all documents are persisted (safety net) + # This matches the pattern used in non-Composio Gmail indexer + logger.info( + f"Final commit: Total {total_documents_indexed} Gmail messages processed" + ) + await session.commit() + logger.info( + "Successfully committed all Composio Gmail document changes to database" + ) + + await task_logger.log_task_success( + log_entry, + f"Successfully completed Gmail indexing via Composio for connector {connector_id}", + { + "documents_indexed": total_documents_indexed, + "documents_skipped": total_documents_skipped, + "messages_fetched": total_messages_fetched, + }, + ) + + return total_documents_indexed, None + + except Exception as e: + logger.error(f"Failed to index Gmail via Composio: {e!s}", exc_info=True) + return 0, f"Failed to index Gmail via Composio: {e!s}" + diff --git a/surfsense_backend/app/connectors/composio_google_calendar_connector.py b/surfsense_backend/app/connectors/composio_google_calendar_connector.py new file mode 100644 index 000000000..ab8bde53c --- /dev/null +++ b/surfsense_backend/app/connectors/composio_google_calendar_connector.py @@ -0,0 +1,453 @@ +""" +Composio Google Calendar Connector Module. + +Provides Google Calendar specific methods for data retrieval and indexing via Composio. 
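The calendar indexer in this module reuses the same upsert discipline as the Gmail one: a unique_identifier_hash fixes a document's identity, while a content_hash detects whether it changed. A sketch of that decision, assuming the real helpers in app.utils.document_converters behave like the SHA-256 recipes below (their exact recipe may differ):

    import hashlib


    def identifier_hash(document_type: str, natural_key: str, search_space_id: int) -> str:
        """Stable identity: one calendar event maps to one document per search space."""
        raw = f"{document_type}:{natural_key}:{search_space_id}"
        return hashlib.sha256(raw.encode()).hexdigest()


    def content_hash(markdown: str, search_space_id: int) -> str:
        """Change detection: differs whenever the rendered markdown differs."""
        return hashlib.sha256(f"{search_space_id}:{markdown}".encode()).hexdigest()


    # Upsert decision used by the indexers:
    #   existing document, same content_hash      -> skip
    #   existing document, different content_hash -> update in place
    #   no existing document                      -> insert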
+""" + +import logging +from datetime import UTC, datetime +from typing import Any + +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select +from sqlalchemy.orm import selectinload + +from app.config import config +from app.connectors.composio_connector import ComposioConnector +from app.db import Document, DocumentType +from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE +from app.services.llm_service import get_user_long_context_llm +from app.services.task_logging_service import TaskLoggingService +from app.tasks.connector_indexers.base import calculate_date_range +from app.utils.document_converters import ( + create_document_chunks, + generate_content_hash, + generate_document_summary, + generate_unique_identifier_hash, +) + +logger = logging.getLogger(__name__) + + +def get_current_timestamp() -> datetime: + """Get the current timestamp with timezone for updated_at field.""" + return datetime.now(UTC) + + +async def check_document_by_unique_identifier( + session: AsyncSession, unique_identifier_hash: str +) -> Document | None: + """Check if a document with the given unique identifier hash already exists.""" + existing_doc_result = await session.execute( + select(Document) + .options(selectinload(Document.chunks)) + .where(Document.unique_identifier_hash == unique_identifier_hash) + ) + return existing_doc_result.scalars().first() + + +async def update_connector_last_indexed( + session: AsyncSession, + connector, + update_last_indexed: bool = True, +) -> None: + """Update the last_indexed_at timestamp for a connector.""" + if update_last_indexed: + connector.last_indexed_at = datetime.now(UTC) + logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") + + +class ComposioGoogleCalendarConnector(ComposioConnector): + """ + Google Calendar specific Composio connector. + + Provides methods for listing calendar events and formatting them from + Google Calendar via Composio. + """ + + async def list_calendar_events( + self, + time_min: str | None = None, + time_max: str | None = None, + max_results: int = 250, + ) -> tuple[list[dict[str, Any]], str | None]: + """ + List Google Calendar events via Composio. + + Args: + time_min: Start time (RFC3339 format). + time_max: End time (RFC3339 format). + max_results: Maximum number of events. + + Returns: + Tuple of (events list, error message). + """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return [], "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_calendar_events( + connected_account_id=connected_account_id, + entity_id=entity_id, + time_min=time_min, + time_max=time_max, + max_results=max_results, + ) + + def format_calendar_event_to_markdown(self, event: dict[str, Any]) -> str: + """ + Format a Google Calendar event to markdown. + + Args: + event: Event object from Google Calendar API. + + Returns: + Formatted markdown string. 
+ """ + try: + # Extract basic event information + summary = event.get("summary", "No Title") + description = event.get("description", "") + location = event.get("location", "") + + # Extract start and end times + start = event.get("start", {}) + end = event.get("end", {}) + + start_time = start.get("dateTime") or start.get("date", "") + end_time = end.get("dateTime") or end.get("date", "") + + # Format times for display + def format_time(time_str: str) -> str: + if not time_str: + return "Unknown" + try: + if "T" in time_str: + dt = datetime.fromisoformat(time_str.replace("Z", "+00:00")) + return dt.strftime("%Y-%m-%d %H:%M") + return time_str + except Exception: + return time_str + + start_formatted = format_time(start_time) + end_formatted = format_time(end_time) + + # Extract attendees + attendees = event.get("attendees", []) + attendee_list = [] + for attendee in attendees: + email = attendee.get("email", "") + display_name = attendee.get("displayName", email) + response_status = attendee.get("responseStatus", "") + attendee_list.append(f"- {display_name} ({response_status})") + + # Build markdown content + markdown_content = f"# {summary}\n\n" + markdown_content += f"**Start:** {start_formatted}\n" + markdown_content += f"**End:** {end_formatted}\n" + + if location: + markdown_content += f"**Location:** {location}\n" + + markdown_content += "\n" + + if description: + markdown_content += f"## Description\n\n{description}\n\n" + + if attendee_list: + markdown_content += "## Attendees\n\n" + markdown_content += "\n".join(attendee_list) + markdown_content += "\n\n" + + # Add event metadata + markdown_content += "## Event Details\n\n" + markdown_content += f"- **Event ID:** {event.get('id', 'Unknown')}\n" + markdown_content += f"- **Created:** {event.get('created', 'Unknown')}\n" + markdown_content += f"- **Updated:** {event.get('updated', 'Unknown')}\n" + + return markdown_content + + except Exception as e: + return f"Error formatting event to markdown: {e!s}" + + +# ============ Indexer Functions ============ + + +async def index_composio_google_calendar( + session: AsyncSession, + connector, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str | None, + end_date: str | None, + task_logger: TaskLoggingService, + log_entry, + update_last_indexed: bool = True, + max_items: int = 2500, +) -> tuple[int, str]: + """Index Google Calendar events via Composio.""" + try: + composio_connector = ComposioGoogleCalendarConnector(session, connector_id) + + await task_logger.log_task_progress( + log_entry, + f"Fetching Google Calendar events via Composio for connector {connector_id}", + {"stage": "fetching_events"}, + ) + + # Normalize date values - handle "undefined" strings from frontend + if start_date == "undefined" or start_date == "": + start_date = None + if end_date == "undefined" or end_date == "": + end_date = None + + # Use provided dates directly if both are provided, otherwise calculate from last_indexed_at + # This ensures user-selected dates are respected (matching non-Composio Calendar connector behavior) + if start_date is not None and end_date is not None: + # User provided both dates - use them directly + start_date_str = start_date + end_date_str = end_date + else: + # Calculate date range with defaults (uses last_indexed_at or 365 days back) + # This ensures indexing works even when user doesn't specify dates + start_date_str, end_date_str = calculate_date_range( + connector, start_date, end_date, default_days_back=365 + ) + + # Build time range for API 
call + time_min = f"{start_date_str}T00:00:00Z" + time_max = f"{end_date_str}T23:59:59Z" + + logger.info( + f"Google Calendar query for connector {connector_id}: " + f"(start_date={start_date_str}, end_date={end_date_str})" + ) + + events, error = await composio_connector.list_calendar_events( + time_min=time_min, + time_max=time_max, + max_results=max_items, + ) + + if error: + await task_logger.log_task_failure( + log_entry, f"Failed to fetch Calendar events: {error}", {} + ) + return 0, f"Failed to fetch Calendar events: {error}" + + if not events: + success_msg = "No Google Calendar events found in the specified date range" + await task_logger.log_task_success( + log_entry, success_msg, {"events_count": 0} + ) + # CRITICAL: Update timestamp even when no events found so Electric SQL syncs and UI shows indexed status + await update_connector_last_indexed(session, connector, update_last_indexed) + await session.commit() + return ( + 0, + None, + ) # Return None (not error) when no items found - this is success with 0 items + + logger.info(f"Found {len(events)} Google Calendar events to index via Composio") + + documents_indexed = 0 + documents_skipped = 0 + + for event in events: + try: + # Handle both standard Google API and potential Composio variations + event_id = event.get("id", "") or event.get("eventId", "") + summary = ( + event.get("summary", "") or event.get("title", "") or "No Title" + ) + + if not event_id: + documents_skipped += 1 + continue + + # Format to markdown + markdown_content = composio_connector.format_calendar_event_to_markdown( + event + ) + + # Generate unique identifier + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"]) + unique_identifier_hash = generate_unique_identifier_hash( + document_type, f"calendar_{event_id}", search_space_id + ) + + content_hash = generate_content_hash(markdown_content, search_space_id) + + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + # Extract event times + start = event.get("start", {}) + end = event.get("end", {}) + start_time = start.get("dateTime") or start.get("date", "") + end_time = end.get("dateTime") or end.get("date", "") + location = event.get("location", "") + + if existing_document: + if existing_document.content_hash == content_hash: + documents_skipped += 1 + continue + + # Update existing + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata = { + "event_id": event_id, + "summary": summary, + "start_time": start_time, + "document_type": "Google Calendar Event (Composio)", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}" + if location: + summary_content += f"\nLocation: {location}" + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(markdown_content) + + existing_document.title = f"Calendar: {summary}" + existing_document.content = summary_content + existing_document.content_hash = content_hash + existing_document.embedding = summary_embedding + existing_document.document_metadata = { + "event_id": event_id, + "summary": summary, + "start_time": start_time, + "end_time": end_time, + "location": location, + "connector_id": connector_id, + "source": "composio", + } + existing_document.chunks = chunks + 
existing_document.updated_at = get_current_timestamp() + + documents_indexed += 1 + + # Batch commit every 10 documents + if documents_indexed % 10 == 0: + logger.info( + f"Committing batch: {documents_indexed} Google Calendar events processed so far" + ) + await session.commit() + continue + + # Create new document + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata = { + "event_id": event_id, + "summary": summary, + "start_time": start_time, + "document_type": "Google Calendar Event (Composio)", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = ( + f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}" + ) + if location: + summary_content += f"\nLocation: {location}" + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(markdown_content) + + document = Document( + search_space_id=search_space_id, + title=f"Calendar: {summary}", + document_type=DocumentType( + TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"] + ), + document_metadata={ + "event_id": event_id, + "summary": summary, + "start_time": start_time, + "end_time": end_time, + "location": location, + "connector_id": connector_id, + "toolkit_id": "googlecalendar", + "source": "composio", + }, + content=summary_content, + content_hash=content_hash, + unique_identifier_hash=unique_identifier_hash, + embedding=summary_embedding, + chunks=chunks, + updated_at=get_current_timestamp(), + ) + session.add(document) + documents_indexed += 1 + + # Batch commit every 10 documents + if documents_indexed % 10 == 0: + logger.info( + f"Committing batch: {documents_indexed} Google Calendar events processed so far" + ) + await session.commit() + + except Exception as e: + logger.error(f"Error processing Calendar event: {e!s}", exc_info=True) + documents_skipped += 1 + continue + + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) + + # Final commit to ensure all documents are persisted (safety net) + # This matches the pattern used in non-Composio Gmail indexer + logger.info( + f"Final commit: Total {documents_indexed} Google Calendar events processed" + ) + await session.commit() + logger.info( + "Successfully committed all Composio Google Calendar document changes to database" + ) + + await task_logger.log_task_success( + log_entry, + f"Successfully completed Google Calendar indexing via Composio for connector {connector_id}", + { + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + }, + ) + + return documents_indexed, None + + except Exception as e: + logger.error( + f"Failed to index Google Calendar via Composio: {e!s}", exc_info=True + ) + return 0, f"Failed to index Google Calendar via Composio: {e!s}" + diff --git a/surfsense_backend/app/connectors/composio_google_drive_connector.py b/surfsense_backend/app/connectors/composio_google_drive_connector.py new file mode 100644 index 000000000..e19436611 --- /dev/null +++ b/surfsense_backend/app/connectors/composio_google_drive_connector.py @@ -0,0 +1,1162 @@ +""" +Composio Google Drive Connector Module. + +Provides Google Drive specific methods for data retrieval and indexing via Composio. 
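Binary extraction in this module funnels through the configured ETL backend: config.ETL_SERVICE is one of UNSTRUCTURED, LLAMACLOUD, or DOCLING, with a logged warning for anything else (see _extract_text_with_etl further down). The routing shape as a runnable sketch; the handler table and stub are illustrative, not the module's actual structure:

    import asyncio
    import logging

    logger = logging.getLogger(__name__)


    async def _unstructured(path: str) -> str | None:
        """Stub: the real branch loads document elements and converts them to markdown."""
        return f"# extracted from {path}"


    HANDLERS = {"UNSTRUCTURED": _unstructured}  # LLAMACLOUD / DOCLING would slot in here


    async def extract_text(etl_service: str, path: str) -> str | None:
        handler = HANDLERS.get(etl_service)
        if handler is None:
            logger.warning("Unknown ETL service: %s", etl_service)
            return None
        return await handler(path)


    print(asyncio.run(extract_text("UNSTRUCTURED", "/tmp/report.pdf")))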
+""" + +import logging +import os +import tempfile +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm.attributes import flag_modified + +from app.config import config +from app.connectors.composio_connector import ComposioConnector +from app.db import Document, DocumentType, Log +from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE +from app.services.llm_service import get_user_long_context_llm +from app.services.task_logging_service import TaskLoggingService +from app.utils.document_converters import ( + create_document_chunks, + generate_content_hash, + generate_document_summary, + generate_unique_identifier_hash, +) + +logger = logging.getLogger(__name__) + + +# Binary file extensions that need file processor +BINARY_FILE_EXTENSIONS = { + ".pdf", + ".doc", + ".docx", + ".xls", + ".xlsx", + ".ppt", + ".pptx", + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".tiff", + ".webp", + ".zip", + ".tar", + ".gz", + ".rar", + ".7z", + ".mp3", + ".mp4", + ".wav", + ".avi", + ".mov", + ".exe", + ".dll", + ".so", + ".bin", +} + +# Text file extensions that can be decoded as UTF-8 +TEXT_FILE_EXTENSIONS = { + ".txt", + ".md", + ".markdown", + ".json", + ".xml", + ".html", + ".htm", + ".css", + ".js", + ".ts", + ".py", + ".java", + ".c", + ".cpp", + ".h", + ".yaml", + ".yml", + ".toml", + ".ini", + ".cfg", + ".conf", + ".sh", + ".bash", + ".zsh", + ".fish", + ".sql", + ".csv", + ".tsv", + ".rst", + ".tex", + ".log", +} + + +def get_current_timestamp() -> datetime: + """Get the current timestamp with timezone for updated_at field.""" + return datetime.now(UTC) + + +def _is_binary_file(file_name: str, mime_type: str) -> bool: + """Check if a file is binary based on extension or mime type.""" + extension = Path(file_name).suffix.lower() + + # Check extension first + if extension in BINARY_FILE_EXTENSIONS: + return True + if extension in TEXT_FILE_EXTENSIONS: + return False + + # Check mime type + if mime_type: + if mime_type.startswith(("image/", "audio/", "video/", "application/pdf")): + return True + if mime_type.startswith(("text/", "application/json", "application/xml")): + return False + # Office documents + if ( + "spreadsheet" in mime_type + or "document" in mime_type + or "presentation" in mime_type + ): + return True + + # Default to text for unknown types + return False + + +class ComposioGoogleDriveConnector(ComposioConnector): + """ + Google Drive specific Composio connector. + + Provides methods for listing files, downloading content, and tracking changes + from Google Drive via Composio. + """ + + async def list_drive_files( + self, + folder_id: str | None = None, + page_token: str | None = None, + page_size: int = 100, + ) -> tuple[list[dict[str, Any]], str | None, str | None]: + """ + List files from Google Drive via Composio. + + Args: + folder_id: Optional folder ID to list contents of. + page_token: Pagination token. + page_size: Number of files per page. + + Returns: + Tuple of (files list, next_page_token, error message). 
+ """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return [], None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_drive_files( + connected_account_id=connected_account_id, + entity_id=entity_id, + folder_id=folder_id, + page_token=page_token, + page_size=page_size, + ) + + async def get_drive_file_content( + self, file_id: str + ) -> tuple[bytes | None, str | None]: + """ + Download file content from Google Drive via Composio. + + Args: + file_id: Google Drive file ID. + + Returns: + Tuple of (file content bytes, error message). + """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_drive_file_content( + connected_account_id=connected_account_id, + entity_id=entity_id, + file_id=file_id, + ) + + async def get_drive_start_page_token(self) -> tuple[str | None, str | None]: + """ + Get the starting page token for Google Drive change tracking. + + Returns: + Tuple of (start_page_token, error message). + """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.get_drive_start_page_token( + connected_account_id=connected_account_id, + entity_id=entity_id, + ) + + async def list_drive_changes( + self, + page_token: str | None = None, + page_size: int = 100, + include_removed: bool = True, + ) -> tuple[list[dict[str, Any]], str | None, str | None]: + """ + List changes in Google Drive since the given page token. + + Args: + page_token: Page token from previous sync (optional). + page_size: Number of changes per page. + include_removed: Whether to include removed items. + + Returns: + Tuple of (changes list, new_start_page_token, error message). + """ + connected_account_id = await self.get_connected_account_id() + if not connected_account_id: + return [], None, "No connected account ID found" + + entity_id = await self.get_entity_id() + service = await self._get_service() + return await service.list_drive_changes( + connected_account_id=connected_account_id, + entity_id=entity_id, + page_token=page_token, + page_size=page_size, + include_removed=include_removed, + ) + + +# ============ File Processing Utilities ============ + + +async def _process_file_content( + content: bytes | str, + file_name: str, + file_id: str, + mime_type: str, + search_space_id: int, + user_id: str, + session: AsyncSession, + task_logger: TaskLoggingService, + log_entry: Log, + processing_errors: list[str], +) -> str: + """ + Process file content and return markdown text. + + For binary files (PDFs, images, etc.), uses Surfsense's ETL service. + For text files, decodes as UTF-8. 
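The two token methods above pair up for delta sync: get_drive_start_page_token captures a baseline after a full scan, and list_drive_changes replays only what changed since that token. A minimal round-trip sketch, assuming a connector instance; delta_sync is an illustrative helper:

    async def delta_sync(connector, stored_token: str | None) -> tuple[list[dict], str | None]:
        """Return changes since stored_token, plus the token to store for next time."""
        if stored_token is None:
            # First sync: nothing to replay yet; just capture the baseline token.
            token, error = await connector.get_drive_start_page_token()
            return [], (token if not error else None)
        changes, new_token, error = await connector.list_drive_changes(
            page_token=stored_token, page_size=100, include_removed=True
        )
        if error:
            return [], stored_token  # keep the old token and retry next run
        return changes, new_token or stored_token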
+ + Args: + content: File content as bytes or string + file_name: Name of the file + file_id: Google Drive file ID + mime_type: MIME type of the file + search_space_id: Search space ID + user_id: User ID + session: Database session + task_logger: Task logging service + log_entry: Log entry for tracking + processing_errors: List to append errors to + + Returns: + Markdown content string + """ + # Ensure content is bytes + if isinstance(content, str): + content = content.encode("utf-8") + + # Check if this is a binary file + if _is_binary_file(file_name, mime_type): + # Use ETL service for binary files (PDF, Office docs, etc.) + temp_file_path = None + try: + # Get file extension + extension = Path(file_name).suffix or ".bin" + + # Write to temp file + with tempfile.NamedTemporaryFile( + delete=False, suffix=extension + ) as tmp_file: + tmp_file.write(content) + temp_file_path = tmp_file.name + + # Use the configured ETL service to extract text + extracted_text = await _extract_text_with_etl( + temp_file_path, file_name, task_logger, log_entry + ) + + if extracted_text: + return extracted_text + else: + # Fallback if extraction fails + logger.warning(f"Could not extract text from binary file {file_name}") + return f"# {file_name}\n\n[Binary file - text extraction failed]\n\n**File ID:** {file_id}\n**Type:** {mime_type}\n" + + except Exception as e: + error_msg = f"Error processing binary file {file_name}: {e!s}" + logger.error(error_msg) + processing_errors.append(error_msg) + return f"# {file_name}\n\n[Binary file - processing error]\n\n**File ID:** {file_id}\n**Type:** {mime_type}\n" + finally: + # Cleanup temp file + if temp_file_path and os.path.exists(temp_file_path): + try: + os.unlink(temp_file_path) + except Exception as e: + logger.debug(f"Could not delete temp file {temp_file_path}: {e}") + else: + # Text file - try to decode as UTF-8 + try: + return content.decode("utf-8") + except UnicodeDecodeError: + # Try other encodings + for encoding in ["latin-1", "cp1252", "iso-8859-1"]: + try: + return content.decode(encoding) + except UnicodeDecodeError: + continue + + # If all encodings fail, treat as binary + error_msg = f"Could not decode text file {file_name} with any encoding" + logger.warning(error_msg) + processing_errors.append(error_msg) + return f"# {file_name}\n\n[File content could not be decoded]\n\n**File ID:** {file_id}\n**Type:** {mime_type}\n" + + +async def _extract_text_with_etl( + file_path: str, + file_name: str, + task_logger: TaskLoggingService, + log_entry: Log, +) -> str | None: + """ + Extract text from a file using the configured ETL service. 
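The text branch above tries UTF-8 first and then a short list of legacy encodings before giving up. The same chain in isolation; note that latin-1 accepts any byte sequence, so in practice the placeholder path is only reached if the list is changed:

    def decode_text(content: bytes) -> str | None:
        """Try UTF-8, then common legacy encodings; None means the caller emits a placeholder."""
        for encoding in ("utf-8", "latin-1", "cp1252", "iso-8859-1"):
            try:
                return content.decode(encoding)
            except UnicodeDecodeError:
                continue
        return None


    assert decode_text("café".encode("utf-8")) == "café"
    assert decode_text("café".encode("latin-1")) == "café"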
+ + Args: + file_path: Path to the file + file_name: Name of the file + task_logger: Task logging service + log_entry: Log entry for tracking + + Returns: + Extracted text as markdown, or None if extraction fails + """ + import warnings + from logging import ERROR, getLogger + + etl_service = config.ETL_SERVICE + + try: + if etl_service == "UNSTRUCTURED": + from langchain_unstructured import UnstructuredLoader + + from app.utils.document_converters import convert_document_to_markdown + + loader = UnstructuredLoader( + file_path, + mode="elements", + post_processors=[], + languages=["eng"], + include_orig_elements=False, + include_metadata=False, + strategy="auto", + ) + + docs = await loader.aload() + if docs: + return await convert_document_to_markdown(docs) + return None + + elif etl_service == "LLAMACLOUD": + from app.tasks.document_processors.file_processors import ( + parse_with_llamacloud_retry, + ) + + # Estimate pages (rough estimate based on file size) + file_size = os.path.getsize(file_path) + estimated_pages = max(1, file_size // (80 * 1024)) + + result = await parse_with_llamacloud_retry( + file_path=file_path, + estimated_pages=estimated_pages, + task_logger=task_logger, + log_entry=log_entry, + ) + + markdown_documents = await result.aget_markdown_documents( + split_by_page=False + ) + if markdown_documents: + return markdown_documents[0].text + return None + + elif etl_service == "DOCLING": + from app.services.docling_service import create_docling_service + + docling_service = create_docling_service() + + # Suppress pdfminer warnings + pdfminer_logger = getLogger("pdfminer") + original_level = pdfminer_logger.level + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", category=UserWarning, module="pdfminer" + ) + warnings.filterwarnings( + "ignore", message=".*Cannot set gray non-stroke color.*" + ) + warnings.filterwarnings("ignore", message=".*invalid float value.*") + + pdfminer_logger.setLevel(ERROR) + + try: + result = await docling_service.process_document( + file_path, file_name + ) + finally: + pdfminer_logger.setLevel(original_level) + + return result.get("content") + else: + logger.warning(f"Unknown ETL service: {etl_service}") + return None + + except Exception as e: + logger.error(f"ETL extraction failed for {file_name}: {e!s}") + return None + + +# ============ Indexer Functions ============ + + +async def check_document_by_unique_identifier( + session: AsyncSession, unique_identifier_hash: str +) -> Document | None: + """Check if a document with the given unique identifier hash already exists.""" + from sqlalchemy.orm import selectinload + from sqlalchemy.future import select + + existing_doc_result = await session.execute( + select(Document) + .options(selectinload(Document.chunks)) + .where(Document.unique_identifier_hash == unique_identifier_hash) + ) + return existing_doc_result.scalars().first() + + +async def update_connector_last_indexed( + session: AsyncSession, + connector, + update_last_indexed: bool = True, +) -> None: + """Update the last_indexed_at timestamp for a connector.""" + if update_last_indexed: + connector.last_indexed_at = datetime.now( + UTC + ) # Use UTC for timezone consistency + logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") + + +async def index_composio_google_drive( + session: AsyncSession, + connector, + connector_id: int, + search_space_id: int, + user_id: str, + task_logger: TaskLoggingService, + log_entry, + update_last_indexed: bool = True, + max_items: int = 1000, +) -> tuple[int, 
str]: + """Index Google Drive files via Composio with delta sync support. + + Delta Sync Flow: + 1. First sync: Full scan + get initial page token + 2. Subsequent syncs: Use LIST_CHANGES to process only changed files + + Supports folder/file selection via connector config: + - selected_folders: List of {id, name} for folders to index + - selected_files: List of {id, name} for individual files to index + - indexing_options: {max_files_per_folder, incremental_sync, include_subfolders} + """ + try: + composio_connector = ComposioGoogleDriveConnector(session, connector_id) + connector_config = await composio_connector.get_config() + + # Get folder/file selection configuration + selected_folders = connector_config.get("selected_folders", []) + selected_files = connector_config.get("selected_files", []) + indexing_options = connector_config.get("indexing_options", {}) + + # Check for stored page token for delta sync + stored_page_token = connector_config.get("drive_page_token") + use_delta_sync = stored_page_token and connector.last_indexed_at + + max_files_per_folder = indexing_options.get("max_files_per_folder", 100) + include_subfolders = indexing_options.get("include_subfolders", True) + + # Route to delta sync or full scan + if use_delta_sync: + logger.info(f"Using delta sync for Composio Google Drive connector {connector_id}") + await task_logger.log_task_progress( + log_entry, + f"Starting delta sync for Google Drive via Composio (connector {connector_id})", + {"stage": "delta_sync", "token": stored_page_token[:20] + "..."}, + ) + + documents_indexed, documents_skipped, processing_errors = await _index_composio_drive_delta_sync( + session=session, + composio_connector=composio_connector, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + page_token=stored_page_token, + max_items=max_items, + task_logger=task_logger, + log_entry=log_entry, + ) + else: + logger.info(f"Using full scan for Composio Google Drive connector {connector_id} (first sync or no token)") + await task_logger.log_task_progress( + log_entry, + f"Fetching Google Drive files via Composio for connector {connector_id}", + { + "stage": "full_scan", + "selected_folders": len(selected_folders), + "selected_files": len(selected_files), + }, + ) + + documents_indexed, documents_skipped, processing_errors = await _index_composio_drive_full_scan( + session=session, + composio_connector=composio_connector, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + selected_folders=selected_folders, + selected_files=selected_files, + max_files_per_folder=max_files_per_folder, + include_subfolders=include_subfolders, + max_items=max_items, + task_logger=task_logger, + log_entry=log_entry, + ) + + # Get new page token for next sync (always update after successful sync) + new_token, token_error = await composio_connector.get_drive_start_page_token() + if new_token and not token_error: + # Refresh connector to avoid stale state + await session.refresh(connector) + + if not connector.config: + connector.config = {} + connector.config["drive_page_token"] = new_token + flag_modified(connector, "config") + logger.info(f"Updated drive_page_token for connector {connector_id}") + elif token_error: + logger.warning(f"Failed to get new page token: {token_error}") + + # CRITICAL: Always update timestamp so Electric SQL syncs and UI shows indexed status + await update_connector_last_indexed(session, connector, update_last_indexed) + + # Final commit + logger.info(f"Final commit: Total 
{documents_indexed} Google Drive files processed") + await session.commit() + logger.info("Successfully committed all Composio Google Drive document changes to database") + + # Handle processing errors + error_message = None + if processing_errors: + if len(processing_errors) == 1: + error_message = processing_errors[0] + else: + error_message = f"Failed to process {len(processing_errors)} file(s). First error: {processing_errors[0]}" + await task_logger.log_task_failure( + log_entry, + f"Completed Google Drive indexing with {len(processing_errors)} error(s) for connector {connector_id}", + { + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + "sync_type": "delta" if use_delta_sync else "full", + "errors": processing_errors, + }, + ) + else: + await task_logger.log_task_success( + log_entry, + f"Successfully completed Google Drive indexing via Composio for connector {connector_id}", + { + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + "sync_type": "delta" if use_delta_sync else "full", + }, + ) + + return documents_indexed, error_message + + except Exception as e: + logger.error(f"Failed to index Google Drive via Composio: {e!s}", exc_info=True) + return 0, f"Failed to index Google Drive via Composio: {e!s}" + + +async def _index_composio_drive_delta_sync( + session: AsyncSession, + composio_connector: ComposioGoogleDriveConnector, + connector_id: int, + search_space_id: int, + user_id: str, + page_token: str, + max_items: int, + task_logger: TaskLoggingService, + log_entry, +) -> tuple[int, int, list[str]]: + """Index Google Drive files using delta sync (only changed files). + + Uses GOOGLEDRIVE_LIST_CHANGES to fetch only files that changed since last sync. + Handles: new files, modified files, and deleted files. 
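+
+    A change entry is assumed to look roughly like the Drive changes API
+    response (only the keys read below are shown):
+
+        {"fileId": "...", "removed": False,
+         "file": {"id": "...", "name": "...", "mimeType": "...", "trashed": False}}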
+ """ + documents_indexed = 0 + documents_skipped = 0 + processing_errors = [] + + # Fetch all changes with pagination + all_changes = [] + current_token = page_token + + while len(all_changes) < max_items: + changes, next_token, error = await composio_connector.list_drive_changes( + page_token=current_token, + page_size=100, + include_removed=True, + ) + + if error: + logger.error(f"Error fetching Drive changes: {error}") + processing_errors.append(f"Failed to fetch changes: {error}") + break + + all_changes.extend(changes) + + if not next_token or next_token == current_token: + break + current_token = next_token + + if not all_changes: + logger.info("No changes detected since last sync") + return 0, 0, [] + + logger.info(f"Processing {len(all_changes)} changes from delta sync") + + for change in all_changes[:max_items]: + try: + # Handle removed files + is_removed = change.get("removed", False) + file_info = change.get("file", {}) + file_id = change.get("fileId") or file_info.get("id", "") + + if not file_id: + documents_skipped += 1 + continue + + # Check if file was trashed or removed + if is_removed or file_info.get("trashed", False): + # Remove document from database + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]) + unique_identifier_hash = generate_unique_identifier_hash( + document_type, f"drive_{file_id}", search_space_id + ) + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + if existing_document: + await session.delete(existing_document) + documents_indexed += 1 + logger.info(f"Deleted document for removed/trashed file: {file_id}") + continue + + # Process changed file + file_name = file_info.get("name", "") or "Untitled" + mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "") + + # Skip folders + if mime_type == "application/vnd.google-apps.folder": + continue + + # Process the file + indexed, skipped, errors = await _process_single_drive_file( + session=session, + composio_connector=composio_connector, + file_id=file_id, + file_name=file_name, + mime_type=mime_type, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + task_logger=task_logger, + log_entry=log_entry, + ) + + documents_indexed += indexed + documents_skipped += skipped + processing_errors.extend(errors) + + # Batch commit every 10 documents + if documents_indexed > 0 and documents_indexed % 10 == 0: + await session.commit() + logger.info(f"Committed batch: {documents_indexed} changes processed") + + except Exception as e: + error_msg = f"Error processing change for file {file_id}: {e!s}" + logger.error(error_msg, exc_info=True) + processing_errors.append(error_msg) + documents_skipped += 1 + + logger.info(f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped") + return documents_indexed, documents_skipped, processing_errors + + +async def _index_composio_drive_full_scan( + session: AsyncSession, + composio_connector: ComposioGoogleDriveConnector, + connector_id: int, + search_space_id: int, + user_id: str, + selected_folders: list[dict], + selected_files: list[dict], + max_files_per_folder: int, + include_subfolders: bool, + max_items: int, + task_logger: TaskLoggingService, + log_entry, +) -> tuple[int, int, list[str]]: + """Index Google Drive files using full scan (first sync or when no delta token).""" + documents_indexed = 0 + documents_skipped = 0 + processing_errors = [] + + all_files = [] + + # If specific folders/files are selected, fetch from those + 
if selected_folders or selected_files: + # Fetch files from selected folders + for folder in selected_folders: + folder_id = folder.get("id") + folder_name = folder.get("name", "Unknown") + + if not folder_id: + continue + + # Handle special case for "root" folder + actual_folder_id = None if folder_id == "root" else folder_id + + logger.info(f"Fetching files from folder: {folder_name} ({folder_id})") + + # Fetch files from this folder + folder_files = [] + page_token = None + + while len(folder_files) < max_files_per_folder: + ( + files, + next_token, + error, + ) = await composio_connector.list_drive_files( + folder_id=actual_folder_id, + page_token=page_token, + page_size=min(100, max_files_per_folder - len(folder_files)), + ) + + if error: + logger.warning( + f"Failed to fetch files from folder {folder_name}: {error}" + ) + break + + # Process files + for file_info in files: + mime_type = file_info.get("mimeType", "") or file_info.get( + "mime_type", "" + ) + + # If it's a folder and include_subfolders is enabled, recursively fetch + if mime_type == "application/vnd.google-apps.folder": + if include_subfolders: + # Add subfolder files recursively + subfolder_files = await _fetch_folder_files_recursively( + composio_connector, + file_info.get("id"), + max_files=max_files_per_folder, + current_count=len(folder_files), + ) + folder_files.extend(subfolder_files) + else: + folder_files.append(file_info) + + if not next_token: + break + page_token = next_token + + all_files.extend(folder_files[:max_files_per_folder]) + logger.info(f"Found {len(folder_files)} files in folder {folder_name}") + + # Add specifically selected files + for selected_file in selected_files: + file_id = selected_file.get("id") + file_name = selected_file.get("name", "Unknown") + + if not file_id: + continue + + # Add file info (we'll fetch content later during indexing) + all_files.append( + { + "id": file_id, + "name": file_name, + "mimeType": "", # Will be determined later + } + ) + else: + # No selection specified - fetch all files (original behavior) + page_token = None + + while len(all_files) < max_items: + files, next_token, error = await composio_connector.list_drive_files( + page_token=page_token, + page_size=min(100, max_items - len(all_files)), + ) + + if error: + return 0, 0, [f"Failed to fetch Drive files: {error}"] + + all_files.extend(files) + + if not next_token: + break + page_token = next_token + + if not all_files: + logger.info("No Google Drive files found") + return 0, 0, [] + + logger.info(f"Found {len(all_files)} Google Drive files to index via Composio (full scan)") + + for file_info in all_files: + try: + # Handle both standard Google API and potential Composio variations + file_id = file_info.get("id", "") or file_info.get("fileId", "") + file_name = ( + file_info.get("name", "") + or file_info.get("fileName", "") + or "Untitled" + ) + mime_type = file_info.get("mimeType", "") or file_info.get( + "mime_type", "" + ) + + if not file_id: + documents_skipped += 1 + continue + + # Skip folders + if mime_type == "application/vnd.google-apps.folder": + continue + + # Process the file + indexed, skipped, errors = await _process_single_drive_file( + session=session, + composio_connector=composio_connector, + file_id=file_id, + file_name=file_name, + mime_type=mime_type, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + task_logger=task_logger, + log_entry=log_entry, + ) + + documents_indexed += indexed + documents_skipped += skipped + 
processing_errors.extend(errors) + + # Batch commit every 10 documents + if documents_indexed > 0 and documents_indexed % 10 == 0: + logger.info(f"Committing batch: {documents_indexed} Google Drive files processed so far") + await session.commit() + + except Exception as e: + error_msg = f"Error processing Drive file {file_name or 'unknown'}: {e!s}" + logger.error(error_msg, exc_info=True) + processing_errors.append(error_msg) + documents_skipped += 1 + + logger.info(f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped") + return documents_indexed, documents_skipped, processing_errors + + +async def _process_single_drive_file( + session: AsyncSession, + composio_connector: ComposioGoogleDriveConnector, + file_id: str, + file_name: str, + mime_type: str, + connector_id: int, + search_space_id: int, + user_id: str, + task_logger: TaskLoggingService, + log_entry, +) -> tuple[int, int, list[str]]: + """Process a single Google Drive file for indexing. + + Returns: + Tuple of (documents_indexed, documents_skipped, processing_errors) + """ + processing_errors = [] + + # Generate unique identifier hash + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]) + unique_identifier_hash = generate_unique_identifier_hash( + document_type, f"drive_{file_id}", search_space_id + ) + + # Check if document exists + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + # Get file content + content, content_error = await composio_connector.get_drive_file_content(file_id) + + if content_error or not content: + logger.warning( + f"Could not get content for file {file_name}: {content_error}" + ) + # Use metadata as content fallback + markdown_content = f"# {file_name}\n\n" + markdown_content += f"**File ID:** {file_id}\n" + markdown_content += f"**Type:** {mime_type}\n" + elif isinstance(content, dict): + # Safety check: if content is still a dict, log error and use fallback + error_msg = f"Unexpected dict content format for file {file_name}: {list(content.keys())}" + logger.error(error_msg) + processing_errors.append(error_msg) + markdown_content = f"# {file_name}\n\n" + markdown_content += f"**File ID:** {file_id}\n" + markdown_content += f"**Type:** {mime_type}\n" + else: + # Process content based on file type + markdown_content = await _process_file_content( + content=content, + file_name=file_name, + file_id=file_id, + mime_type=mime_type, + search_space_id=search_space_id, + user_id=user_id, + session=session, + task_logger=task_logger, + log_entry=log_entry, + processing_errors=processing_errors, + ) + + content_hash = generate_content_hash(markdown_content, search_space_id) + + if existing_document: + if existing_document.content_hash == content_hash: + return 0, 1, processing_errors # Skipped + + # Update existing document + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata = { + "file_id": file_id, + "file_name": file_name, + "mime_type": mime_type, + "document_type": "Google Drive File (Composio)", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = ( + f"Google Drive File: {file_name}\n\nType: {mime_type}" + ) + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(markdown_content) + + existing_document.title = f"Drive: {file_name}" + 
existing_document.content = summary_content + existing_document.content_hash = content_hash + existing_document.embedding = summary_embedding + existing_document.document_metadata = { + "file_id": file_id, + "file_name": file_name, + "FILE_NAME": file_name, # For compatibility + "mime_type": mime_type, + "connector_id": connector_id, + "source": "composio", + } + existing_document.chunks = chunks + existing_document.updated_at = get_current_timestamp() + + return 1, 0, processing_errors # Indexed + + # Create new document + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata = { + "file_id": file_id, + "file_name": file_name, + "mime_type": mime_type, + "document_type": "Google Drive File (Composio)", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + summary_content = ( + f"Google Drive File: {file_name}\n\nType: {mime_type}" + ) + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(markdown_content) + + document = Document( + search_space_id=search_space_id, + title=f"Drive: {file_name}", + document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]), + document_metadata={ + "file_id": file_id, + "file_name": file_name, + "FILE_NAME": file_name, # For compatibility + "mime_type": mime_type, + "connector_id": connector_id, + "toolkit_id": "googledrive", + "source": "composio", + }, + content=summary_content, + content_hash=content_hash, + unique_identifier_hash=unique_identifier_hash, + embedding=summary_embedding, + chunks=chunks, + updated_at=get_current_timestamp(), + ) + session.add(document) + + return 1, 0, processing_errors # Indexed + + +async def _fetch_folder_files_recursively( + composio_connector: ComposioGoogleDriveConnector, + folder_id: str, + max_files: int = 100, + current_count: int = 0, + depth: int = 0, + max_depth: int = 10, +) -> list[dict[str, Any]]: + """ + Recursively fetch files from a Google Drive folder via Composio. 
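+    Traversal is bounded by max_depth and by max_files (counted together with
+    current_count), so deeply nested folder trees cannot recurse indefinitely.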
+ + Args: + composio_connector: The Composio connector instance + folder_id: Google Drive folder ID + max_files: Maximum number of files to fetch + current_count: Current number of files already fetched + depth: Current recursion depth + max_depth: Maximum recursion depth to prevent infinite loops + + Returns: + List of file info dictionaries + """ + if depth >= max_depth: + logger.warning(f"Max recursion depth reached for folder {folder_id}") + return [] + + if current_count >= max_files: + return [] + + all_files = [] + page_token = None + + try: + while len(all_files) + current_count < max_files: + files, next_token, error = await composio_connector.list_drive_files( + folder_id=folder_id, + page_token=page_token, + page_size=min(100, max_files - len(all_files) - current_count), + ) + + if error: + logger.warning( + f"Error fetching files from subfolder {folder_id}: {error}" + ) + break + + for file_info in files: + mime_type = file_info.get("mimeType", "") or file_info.get( + "mime_type", "" + ) + + if mime_type == "application/vnd.google-apps.folder": + # Recursively fetch from subfolders + subfolder_files = await _fetch_folder_files_recursively( + composio_connector, + file_info.get("id"), + max_files=max_files, + current_count=current_count + len(all_files), + depth=depth + 1, + max_depth=max_depth, + ) + all_files.extend(subfolder_files) + else: + all_files.append(file_info) + + if len(all_files) + current_count >= max_files: + break + + if not next_token: + break + page_token = next_token + + return all_files[: max_files - current_count] + + except Exception as e: + logger.error(f"Error in recursive folder fetch: {e!s}") + return all_files + diff --git a/surfsense_backend/app/routes/composio_routes.py b/surfsense_backend/app/routes/composio_routes.py index 5af332760..5ad2266b7 100644 --- a/surfsense_backend/app/routes/composio_routes.py +++ b/surfsense_backend/app/routes/composio_routes.py @@ -46,6 +46,13 @@ logger = logging.getLogger(__name__) router = APIRouter() +# Map toolkit_id to frontend connector ID +TOOLKIT_TO_FRONTEND_CONNECTOR_ID = { + "googledrive": "composio-googledrive", + "gmail": "composio-gmail", + "googlecalendar": "composio-googlecalendar", +} + # Initialize security utilities _state_manager = None @@ -327,8 +334,12 @@ async def composio_callback( await session.commit() await session.refresh(existing_connector) + # Get the frontend connector ID based on toolkit_id + frontend_connector_id = TOOLKIT_TO_FRONTEND_CONNECTOR_ID.get( + toolkit_id, "composio-connector" + ) return RedirectResponse( - url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&success=true&connector=composio-connector&connectorId={existing_connector.id}" + url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&success=true&connector={frontend_connector_id}&connectorId={existing_connector.id}" ) try: @@ -358,8 +369,12 @@ async def composio_callback( f"Successfully created Composio connector {db_connector.id} for user {user_id}, toolkit {toolkit_id}" ) + # Get the frontend connector ID based on toolkit_id + frontend_connector_id = TOOLKIT_TO_FRONTEND_CONNECTOR_ID.get( + toolkit_id, "composio-connector" + ) return RedirectResponse( - url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&success=true&connector=composio-connector&connectorId={db_connector.id}" + 
url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&success=true&connector={frontend_connector_id}&connectorId={db_connector.id}" ) except IntegrityError as e: diff --git a/surfsense_backend/app/services/composio_service.py b/surfsense_backend/app/services/composio_service.py index 3810f03a4..3ea2d1bf2 100644 --- a/surfsense_backend/app/services/composio_service.py +++ b/surfsense_backend/app/services/composio_service.py @@ -53,6 +53,27 @@ TOOLKIT_TO_DOCUMENT_TYPE = { "googlecalendar": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", } +# Mapping of toolkit IDs to their indexer functions +# Format: toolkit_id -> (module_path, function_name, supports_date_filter) +# supports_date_filter: True if the indexer accepts start_date/end_date params +TOOLKIT_TO_INDEXER = { + "googledrive": ( + "app.connectors.composio_google_drive_connector", + "index_composio_google_drive", + False, # Google Drive doesn't use date filtering + ), + "gmail": ( + "app.connectors.composio_gmail_connector", + "index_composio_gmail", + True, # Gmail uses date filtering + ), + "googlecalendar": ( + "app.connectors.composio_google_calendar_connector", + "index_composio_google_calendar", + True, # Calendar uses date filtering + ), +} + class ComposioService: """Service for interacting with Composio API.""" diff --git a/surfsense_backend/app/tasks/composio_indexer.py b/surfsense_backend/app/tasks/composio_indexer.py index 3eed8470e..f97652114 100644 --- a/surfsense_backend/app/tasks/composio_indexer.py +++ b/surfsense_backend/app/tasks/composio_indexer.py @@ -2,65 +2,39 @@ Composio connector indexer. Routes indexing requests to toolkit-specific handlers (Google Drive, Gmail, Calendar). +Uses a registry pattern for clean, extensible connector routing. Note: This module is intentionally placed in app/tasks/ (not in connector_indexers/) to avoid circular import issues with the connector_indexers package. 
""" import logging -import os -import tempfile -from datetime import UTC, datetime -from pathlib import Path -from typing import Any +from importlib import import_module from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select -from sqlalchemy.orm import selectinload -from app.config import config -from app.connectors.composio_connector import ComposioConnector from app.db import ( - Document, - DocumentType, - Log, SearchSourceConnector, SearchSourceConnectorType, ) -from app.services.composio_service import INDEXABLE_TOOLKITS, TOOLKIT_TO_DOCUMENT_TYPE -from app.services.llm_service import get_user_long_context_llm +from app.services.composio_service import INDEXABLE_TOOLKITS, TOOLKIT_TO_INDEXER from app.services.task_logging_service import TaskLoggingService -from app.tasks.connector_indexers.base import calculate_date_range -from app.utils.document_converters import ( - create_document_chunks, - generate_content_hash, - generate_document_summary, - generate_unique_identifier_hash, -) # Set up logging logger = logging.getLogger(__name__) -# ============ Utility functions (copied from connector_indexers.base to avoid circular imports) ============ +# Valid Composio connector types +COMPOSIO_CONNECTOR_TYPES = { + SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, +} -def get_current_timestamp() -> datetime: - """Get the current timestamp with timezone for updated_at field.""" - return datetime.now(UTC) - - -async def check_document_by_unique_identifier( - session: AsyncSession, unique_identifier_hash: str -) -> Document | None: - """Check if a document with the given unique identifier hash already exists.""" - existing_doc_result = await session.execute( - select(Document) - .options(selectinload(Document.chunks)) - .where(Document.unique_identifier_hash == unique_identifier_hash) - ) - return existing_doc_result.scalars().first() +# ============ Utility functions ============ async def get_connector_by_id( @@ -78,312 +52,26 @@ async def get_connector_by_id( return result.scalars().first() -async def update_connector_last_indexed( - session: AsyncSession, - connector: SearchSourceConnector, - update_last_indexed: bool = True, -) -> None: - """Update the last_indexed_at timestamp for a connector.""" - if update_last_indexed: - connector.last_indexed_at = datetime.now( - UTC - ) # Use UTC for timezone consistency - logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") - - -# Binary file extensions that need file processor -BINARY_FILE_EXTENSIONS = { - ".pdf", - ".doc", - ".docx", - ".xls", - ".xlsx", - ".ppt", - ".pptx", - ".png", - ".jpg", - ".jpeg", - ".gif", - ".bmp", - ".tiff", - ".webp", - ".zip", - ".tar", - ".gz", - ".rar", - ".7z", - ".mp3", - ".mp4", - ".wav", - ".avi", - ".mov", - ".exe", - ".dll", - ".so", - ".bin", -} - -# Text file extensions that can be decoded as UTF-8 -TEXT_FILE_EXTENSIONS = { - ".txt", - ".md", - ".markdown", - ".json", - ".xml", - ".html", - ".htm", - ".css", - ".js", - ".ts", - ".py", - ".java", - ".c", - ".cpp", - ".h", - ".yaml", - ".yml", - ".toml", - ".ini", - ".cfg", - ".conf", - ".sh", - ".bash", - ".zsh", - ".fish", - ".sql", - ".csv", - ".tsv", - ".rst", - ".tex", - ".log", -} - - -def _is_binary_file(file_name: str, mime_type: str) -> bool: - """Check if a file is binary based on extension or mime type.""" - extension = 
Path(file_name).suffix.lower() - - # Check extension first - if extension in BINARY_FILE_EXTENSIONS: - return True - if extension in TEXT_FILE_EXTENSIONS: - return False - - # Check mime type - if mime_type: - if mime_type.startswith(("image/", "audio/", "video/", "application/pdf")): - return True - if mime_type.startswith(("text/", "application/json", "application/xml")): - return False - # Office documents - if ( - "spreadsheet" in mime_type - or "document" in mime_type - or "presentation" in mime_type - ): - return True - - # Default to text for unknown types - return False - - -async def _process_file_content( - content: bytes | str, - file_name: str, - file_id: str, - mime_type: str, - search_space_id: int, - user_id: str, - session: AsyncSession, - task_logger: TaskLoggingService, - log_entry: Log, - processing_errors: list[str], -) -> str: +def get_indexer_function(toolkit_id: str): """ - Process file content and return markdown text. - - For binary files (PDFs, images, etc.), uses Surfsense's ETL service. - For text files, decodes as UTF-8. + Dynamically import and return the indexer function for a toolkit. Args: - content: File content as bytes or string - file_name: Name of the file - file_id: Google Drive file ID - mime_type: MIME type of the file - search_space_id: Search space ID - user_id: User ID - session: Database session - task_logger: Task logging service - log_entry: Log entry for tracking - processing_errors: List to append errors to + toolkit_id: The toolkit ID (e.g., "googledrive", "gmail") Returns: - Markdown content string + Tuple of (indexer_function, supports_date_filter) + + Raises: + ValueError: If toolkit not found in registry """ - # Ensure content is bytes - if isinstance(content, str): - content = content.encode("utf-8") + if toolkit_id not in TOOLKIT_TO_INDEXER: + raise ValueError(f"No indexer registered for toolkit: {toolkit_id}") - # Check if this is a binary file - if _is_binary_file(file_name, mime_type): - # Use ETL service for binary files (PDF, Office docs, etc.) 
- temp_file_path = None - try: - # Get file extension - extension = Path(file_name).suffix or ".bin" - - # Write to temp file - with tempfile.NamedTemporaryFile( - delete=False, suffix=extension - ) as tmp_file: - tmp_file.write(content) - temp_file_path = tmp_file.name - - # Use the configured ETL service to extract text - extracted_text = await _extract_text_with_etl( - temp_file_path, file_name, task_logger, log_entry - ) - - if extracted_text: - return extracted_text - else: - # Fallback if extraction fails - logger.warning(f"Could not extract text from binary file {file_name}") - return f"# {file_name}\n\n[Binary file - text extraction failed]\n\n**File ID:** {file_id}\n**Type:** {mime_type}\n" - - except Exception as e: - error_msg = f"Error processing binary file {file_name}: {e!s}" - logger.error(error_msg) - processing_errors.append(error_msg) - return f"# {file_name}\n\n[Binary file - processing error]\n\n**File ID:** {file_id}\n**Type:** {mime_type}\n" - finally: - # Cleanup temp file - if temp_file_path and os.path.exists(temp_file_path): - try: - os.unlink(temp_file_path) - except Exception as e: - logger.debug(f"Could not delete temp file {temp_file_path}: {e}") - else: - # Text file - try to decode as UTF-8 - try: - return content.decode("utf-8") - except UnicodeDecodeError: - # Try other encodings - for encoding in ["latin-1", "cp1252", "iso-8859-1"]: - try: - return content.decode(encoding) - except UnicodeDecodeError: - continue - - # If all encodings fail, treat as binary - error_msg = f"Could not decode text file {file_name} with any encoding" - logger.warning(error_msg) - processing_errors.append(error_msg) - return f"# {file_name}\n\n[File content could not be decoded]\n\n**File ID:** {file_id}\n**Type:** {mime_type}\n" - - -async def _extract_text_with_etl( - file_path: str, - file_name: str, - task_logger: TaskLoggingService, - log_entry: Log, -) -> str | None: - """ - Extract text from a file using the configured ETL service. 
- - Args: - file_path: Path to the file - file_name: Name of the file - task_logger: Task logging service - log_entry: Log entry for tracking - - Returns: - Extracted text as markdown, or None if extraction fails - """ - import warnings - from logging import ERROR, getLogger - - etl_service = config.ETL_SERVICE - - try: - if etl_service == "UNSTRUCTURED": - from langchain_unstructured import UnstructuredLoader - - from app.utils.document_converters import convert_document_to_markdown - - loader = UnstructuredLoader( - file_path, - mode="elements", - post_processors=[], - languages=["eng"], - include_orig_elements=False, - include_metadata=False, - strategy="auto", - ) - - docs = await loader.aload() - if docs: - return await convert_document_to_markdown(docs) - return None - - elif etl_service == "LLAMACLOUD": - from app.tasks.document_processors.file_processors import ( - parse_with_llamacloud_retry, - ) - - # Estimate pages (rough estimate based on file size) - file_size = os.path.getsize(file_path) - estimated_pages = max(1, file_size // (80 * 1024)) - - result = await parse_with_llamacloud_retry( - file_path=file_path, - estimated_pages=estimated_pages, - task_logger=task_logger, - log_entry=log_entry, - ) - - markdown_documents = await result.aget_markdown_documents( - split_by_page=False - ) - if markdown_documents: - return markdown_documents[0].text - return None - - elif etl_service == "DOCLING": - from app.services.docling_service import create_docling_service - - docling_service = create_docling_service() - - # Suppress pdfminer warnings - pdfminer_logger = getLogger("pdfminer") - original_level = pdfminer_logger.level - - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", category=UserWarning, module="pdfminer" - ) - warnings.filterwarnings( - "ignore", message=".*Cannot set gray non-stroke color.*" - ) - warnings.filterwarnings("ignore", message=".*invalid float value.*") - - pdfminer_logger.setLevel(ERROR) - - try: - result = await docling_service.process_document( - file_path, file_name - ) - finally: - pdfminer_logger.setLevel(original_level) - - return result.get("content") - else: - logger.warning(f"Unknown ETL service: {etl_service}") - return None - - except Exception as e: - logger.error(f"ETL extraction failed for {file_name}: {e!s}") - return None + module_path, function_name, supports_date_filter = TOOLKIT_TO_INDEXER[toolkit_id] + module = import_module(module_path) + indexer_func = getattr(module, function_name) + return indexer_func, supports_date_filter # ============ Main indexer function ============ @@ -403,6 +91,7 @@ async def index_composio_connector( Index content from a Composio connector. Routes to toolkit-specific indexing based on the connector's toolkit_id. + Uses a registry pattern for clean, extensible connector routing. 
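+    Date filters (start_date/end_date) are forwarded only to indexers whose
+    registry entry marks date filtering as supported.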
Args: session: Database session @@ -435,19 +124,10 @@ async def index_composio_connector( try: # Get connector by id - accept any Composio connector type - # We'll check the actual type after loading - connector = await get_connector_by_id( - session, - connector_id, - None, # Don't filter by type, we'll validate after - ) + connector = await get_connector_by_id(session, connector_id, None) # Validate it's a Composio connector - if connector and connector.connector_type not in [ - SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, - SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR, - SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, - ]: + if connector and connector.connector_type not in COMPOSIO_CONNECTOR_TYPES: error_msg = f"Connector {connector_id} is not a Composio connector" await task_logger.log_task_failure( log_entry, error_msg, {"error_type": "InvalidConnectorType"} @@ -480,53 +160,35 @@ async def index_composio_connector( ) return 0, error_msg - # Route to toolkit-specific indexer - if toolkit_id == "googledrive": - return await _index_composio_google_drive( - session=session, - connector=connector, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - task_logger=task_logger, - log_entry=log_entry, - update_last_indexed=update_last_indexed, - max_items=max_items, - ) - elif toolkit_id == "gmail": - return await _index_composio_gmail( - session=session, - connector=connector, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - start_date=start_date, - end_date=end_date, - task_logger=task_logger, - log_entry=log_entry, - update_last_indexed=update_last_indexed, - max_items=max_items, - ) - elif toolkit_id == "googlecalendar": - return await _index_composio_google_calendar( - session=session, - connector=connector, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - start_date=start_date, - end_date=end_date, - task_logger=task_logger, - log_entry=log_entry, - update_last_indexed=update_last_indexed, - max_items=max_items, - ) - else: - error_msg = f"No indexer implemented for toolkit: {toolkit_id}" + # Get indexer function from registry + try: + indexer_func, supports_date_filter = get_indexer_function(toolkit_id) + except ValueError as e: await task_logger.log_task_failure( - log_entry, error_msg, {"error_type": "NoIndexerImplemented"} + log_entry, str(e), {"error_type": "NoIndexerImplemented"} ) - return 0, error_msg + return 0, str(e) + + # Build kwargs for the indexer function + kwargs = { + "session": session, + "connector": connector, + "connector_id": connector_id, + "search_space_id": search_space_id, + "user_id": user_id, + "task_logger": task_logger, + "log_entry": log_entry, + "update_last_indexed": update_last_indexed, + "max_items": max_items, + } + + # Add date params for toolkits that support them + if supports_date_filter: + kwargs["start_date"] = start_date + kwargs["end_date"] = end_date + + # Call the toolkit-specific indexer + return await indexer_func(**kwargs) except SQLAlchemyError as db_error: await session.rollback() @@ -548,1378 +210,3 @@ async def index_composio_connector( ) logger.error(f"Failed to index Composio connector: {e!s}", exc_info=True) return 0, f"Failed to index Composio connector: {e!s}" - - -async def _index_composio_google_drive( - session: AsyncSession, - connector, - connector_id: int, - search_space_id: int, - user_id: str, - task_logger: TaskLoggingService, - log_entry, - update_last_indexed: bool = True, - 
max_items: int = 1000, -) -> tuple[int, str]: - """Index Google Drive files via Composio with delta sync support. - - Delta Sync Flow: - 1. First sync: Full scan + get initial page token - 2. Subsequent syncs: Use LIST_CHANGES to process only changed files - - Supports folder/file selection via connector config: - - selected_folders: List of {id, name} for folders to index - - selected_files: List of {id, name} for individual files to index - - indexing_options: {max_files_per_folder, incremental_sync, include_subfolders} - """ - try: - composio_connector = ComposioConnector(session, connector_id) - connector_config = await composio_connector.get_config() - - # Get folder/file selection configuration - selected_folders = connector_config.get("selected_folders", []) - selected_files = connector_config.get("selected_files", []) - indexing_options = connector_config.get("indexing_options", {}) - - # Check for stored page token for delta sync - stored_page_token = connector_config.get("drive_page_token") - use_delta_sync = stored_page_token and connector.last_indexed_at - - max_files_per_folder = indexing_options.get("max_files_per_folder", 100) - include_subfolders = indexing_options.get("include_subfolders", True) - - # Route to delta sync or full scan - if use_delta_sync: - logger.info(f"Using delta sync for Composio Google Drive connector {connector_id}") - await task_logger.log_task_progress( - log_entry, - f"Starting delta sync for Google Drive via Composio (connector {connector_id})", - {"stage": "delta_sync", "token": stored_page_token[:20] + "..."}, - ) - - documents_indexed, documents_skipped, processing_errors = await _index_composio_drive_delta_sync( - session=session, - composio_connector=composio_connector, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - page_token=stored_page_token, - max_items=max_items, - task_logger=task_logger, - log_entry=log_entry, - ) - else: - logger.info(f"Using full scan for Composio Google Drive connector {connector_id} (first sync or no token)") - await task_logger.log_task_progress( - log_entry, - f"Fetching Google Drive files via Composio for connector {connector_id}", - { - "stage": "full_scan", - "selected_folders": len(selected_folders), - "selected_files": len(selected_files), - }, - ) - - documents_indexed, documents_skipped, processing_errors = await _index_composio_drive_full_scan( - session=session, - composio_connector=composio_connector, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - selected_folders=selected_folders, - selected_files=selected_files, - max_files_per_folder=max_files_per_folder, - include_subfolders=include_subfolders, - max_items=max_items, - task_logger=task_logger, - log_entry=log_entry, - ) - - # Get new page token for next sync (always update after successful sync) - new_token, token_error = await composio_connector.get_drive_start_page_token() - if new_token and not token_error: - from sqlalchemy.orm.attributes import flag_modified - - # Refresh connector to avoid stale state - await session.refresh(connector) - - if not connector.config: - connector.config = {} - connector.config["drive_page_token"] = new_token - flag_modified(connector, "config") - logger.info(f"Updated drive_page_token for connector {connector_id}") - elif token_error: - logger.warning(f"Failed to get new page token: {token_error}") - - # CRITICAL: Always update timestamp so Electric SQL syncs and UI shows indexed status - await update_connector_last_indexed(session, 
connector, update_last_indexed) - - # Final commit - logger.info(f"Final commit: Total {documents_indexed} Google Drive files processed") - await session.commit() - logger.info("Successfully committed all Composio Google Drive document changes to database") - - # Handle processing errors - error_message = None - if processing_errors: - if len(processing_errors) == 1: - error_message = processing_errors[0] - else: - error_message = f"Failed to process {len(processing_errors)} file(s). First error: {processing_errors[0]}" - await task_logger.log_task_failure( - log_entry, - f"Completed Google Drive indexing with {len(processing_errors)} error(s) for connector {connector_id}", - { - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, - "sync_type": "delta" if use_delta_sync else "full", - "errors": processing_errors, - }, - ) - else: - await task_logger.log_task_success( - log_entry, - f"Successfully completed Google Drive indexing via Composio for connector {connector_id}", - { - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, - "sync_type": "delta" if use_delta_sync else "full", - }, - ) - - return documents_indexed, error_message - - except Exception as e: - logger.error(f"Failed to index Google Drive via Composio: {e!s}", exc_info=True) - return 0, f"Failed to index Google Drive via Composio: {e!s}" - - -async def _index_composio_drive_delta_sync( - session: AsyncSession, - composio_connector: ComposioConnector, - connector_id: int, - search_space_id: int, - user_id: str, - page_token: str, - max_items: int, - task_logger: TaskLoggingService, - log_entry, -) -> tuple[int, int, list[str]]: - """Index Google Drive files using delta sync (only changed files). - - Uses GOOGLEDRIVE_LIST_CHANGES to fetch only files that changed since last sync. - Handles: new files, modified files, and deleted files. 
- """ - documents_indexed = 0 - documents_skipped = 0 - processing_errors = [] - - # Fetch all changes with pagination - all_changes = [] - current_token = page_token - - while len(all_changes) < max_items: - changes, next_token, error = await composio_connector.list_drive_changes( - page_token=current_token, - page_size=100, - include_removed=True, - ) - - if error: - logger.error(f"Error fetching Drive changes: {error}") - processing_errors.append(f"Failed to fetch changes: {error}") - break - - all_changes.extend(changes) - - if not next_token or next_token == current_token: - break - current_token = next_token - - if not all_changes: - logger.info("No changes detected since last sync") - return 0, 0, [] - - logger.info(f"Processing {len(all_changes)} changes from delta sync") - - for change in all_changes[:max_items]: - try: - # Handle removed files - is_removed = change.get("removed", False) - file_info = change.get("file", {}) - file_id = change.get("fileId") or file_info.get("id", "") - - if not file_id: - documents_skipped += 1 - continue - - # Check if file was trashed or removed - if is_removed or file_info.get("trashed", False): - # Remove document from database - document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]) - unique_identifier_hash = generate_unique_identifier_hash( - document_type, f"drive_{file_id}", search_space_id - ) - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - if existing_document: - await session.delete(existing_document) - documents_indexed += 1 - logger.info(f"Deleted document for removed/trashed file: {file_id}") - continue - - # Process changed file - file_name = file_info.get("name", "") or "Untitled" - mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "") - - # Skip folders - if mime_type == "application/vnd.google-apps.folder": - continue - - # Process the file - indexed, skipped, errors = await _process_single_drive_file( - session=session, - composio_connector=composio_connector, - file_id=file_id, - file_name=file_name, - mime_type=mime_type, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - task_logger=task_logger, - log_entry=log_entry, - ) - - documents_indexed += indexed - documents_skipped += skipped - processing_errors.extend(errors) - - # Batch commit every 10 documents - if documents_indexed > 0 and documents_indexed % 10 == 0: - await session.commit() - logger.info(f"Committed batch: {documents_indexed} changes processed") - - except Exception as e: - error_msg = f"Error processing change for file {file_id}: {e!s}" - logger.error(error_msg, exc_info=True) - processing_errors.append(error_msg) - documents_skipped += 1 - - logger.info(f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped") - return documents_indexed, documents_skipped, processing_errors - - -async def _index_composio_drive_full_scan( - session: AsyncSession, - composio_connector: ComposioConnector, - connector_id: int, - search_space_id: int, - user_id: str, - selected_folders: list[dict], - selected_files: list[dict], - max_files_per_folder: int, - include_subfolders: bool, - max_items: int, - task_logger: TaskLoggingService, - log_entry, -) -> tuple[int, int, list[str]]: - """Index Google Drive files using full scan (first sync or when no delta token).""" - documents_indexed = 0 - documents_skipped = 0 - processing_errors = [] - - all_files = [] - - # If specific folders/files are selected, fetch from those - if 
selected_folders or selected_files: - # Fetch files from selected folders - for folder in selected_folders: - folder_id = folder.get("id") - folder_name = folder.get("name", "Unknown") - - if not folder_id: - continue - - # Handle special case for "root" folder - actual_folder_id = None if folder_id == "root" else folder_id - - logger.info(f"Fetching files from folder: {folder_name} ({folder_id})") - - # Fetch files from this folder - folder_files = [] - page_token = None - - while len(folder_files) < max_files_per_folder: - ( - files, - next_token, - error, - ) = await composio_connector.list_drive_files( - folder_id=actual_folder_id, - page_token=page_token, - page_size=min(100, max_files_per_folder - len(folder_files)), - ) - - if error: - logger.warning( - f"Failed to fetch files from folder {folder_name}: {error}" - ) - break - - # Process files - for file_info in files: - mime_type = file_info.get("mimeType", "") or file_info.get( - "mime_type", "" - ) - - # If it's a folder and include_subfolders is enabled, recursively fetch - if mime_type == "application/vnd.google-apps.folder": - if include_subfolders: - # Add subfolder files recursively - subfolder_files = await _fetch_folder_files_recursively( - composio_connector, - file_info.get("id"), - max_files=max_files_per_folder, - current_count=len(folder_files), - ) - folder_files.extend(subfolder_files) - else: - folder_files.append(file_info) - - if not next_token: - break - page_token = next_token - - all_files.extend(folder_files[:max_files_per_folder]) - logger.info(f"Found {len(folder_files)} files in folder {folder_name}") - - # Add specifically selected files - for selected_file in selected_files: - file_id = selected_file.get("id") - file_name = selected_file.get("name", "Unknown") - - if not file_id: - continue - - # Add file info (we'll fetch content later during indexing) - all_files.append( - { - "id": file_id, - "name": file_name, - "mimeType": "", # Will be determined later - } - ) - else: - # No selection specified - fetch all files (original behavior) - page_token = None - - while len(all_files) < max_items: - files, next_token, error = await composio_connector.list_drive_files( - page_token=page_token, - page_size=min(100, max_items - len(all_files)), - ) - - if error: - return 0, 0, [f"Failed to fetch Drive files: {error}"] - - all_files.extend(files) - - if not next_token: - break - page_token = next_token - - if not all_files: - logger.info("No Google Drive files found") - return 0, 0, [] - - logger.info(f"Found {len(all_files)} Google Drive files to index via Composio (full scan)") - - for file_info in all_files: - try: - # Handle both standard Google API and potential Composio variations - file_id = file_info.get("id", "") or file_info.get("fileId", "") - file_name = ( - file_info.get("name", "") - or file_info.get("fileName", "") - or "Untitled" - ) - mime_type = file_info.get("mimeType", "") or file_info.get( - "mime_type", "" - ) - - if not file_id: - documents_skipped += 1 - continue - - # Skip folders - if mime_type == "application/vnd.google-apps.folder": - continue - - # Process the file - indexed, skipped, errors = await _process_single_drive_file( - session=session, - composio_connector=composio_connector, - file_id=file_id, - file_name=file_name, - mime_type=mime_type, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - task_logger=task_logger, - log_entry=log_entry, - ) - - documents_indexed += indexed - documents_skipped += skipped - processing_errors.extend(errors) - 
- # Batch commit every 10 documents - if documents_indexed > 0 and documents_indexed % 10 == 0: - logger.info(f"Committing batch: {documents_indexed} Google Drive files processed so far") - await session.commit() - - except Exception as e: - error_msg = f"Error processing Drive file {file_name or 'unknown'}: {e!s}" - logger.error(error_msg, exc_info=True) - processing_errors.append(error_msg) - documents_skipped += 1 - - logger.info(f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped") - return documents_indexed, documents_skipped, processing_errors - - -async def _process_single_drive_file( - session: AsyncSession, - composio_connector: ComposioConnector, - file_id: str, - file_name: str, - mime_type: str, - connector_id: int, - search_space_id: int, - user_id: str, - task_logger: TaskLoggingService, - log_entry, -) -> tuple[int, int, list[str]]: - """Process a single Google Drive file for indexing. - - Returns: - Tuple of (documents_indexed, documents_skipped, processing_errors) - """ - processing_errors = [] - - # Generate unique identifier hash - document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]) - unique_identifier_hash = generate_unique_identifier_hash( - document_type, f"drive_{file_id}", search_space_id - ) - - # Check if document exists - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - # Get file content - content, content_error = await composio_connector.get_drive_file_content(file_id) - - if content_error or not content: - logger.warning( - f"Could not get content for file {file_name}: {content_error}" - ) - # Use metadata as content fallback - markdown_content = f"# {file_name}\n\n" - markdown_content += f"**File ID:** {file_id}\n" - markdown_content += f"**Type:** {mime_type}\n" - elif isinstance(content, dict): - # Safety check: if content is still a dict, log error and use fallback - error_msg = f"Unexpected dict content format for file {file_name}: {list(content.keys())}" - logger.error(error_msg) - processing_errors.append(error_msg) - markdown_content = f"# {file_name}\n\n" - markdown_content += f"**File ID:** {file_id}\n" - markdown_content += f"**Type:** {mime_type}\n" - else: - # Process content based on file type - markdown_content = await _process_file_content( - content=content, - file_name=file_name, - file_id=file_id, - mime_type=mime_type, - search_space_id=search_space_id, - user_id=user_id, - session=session, - task_logger=task_logger, - log_entry=log_entry, - processing_errors=processing_errors, - ) - - content_hash = generate_content_hash(markdown_content, search_space_id) - - if existing_document: - if existing_document.content_hash == content_hash: - return 0, 1, processing_errors # Skipped - - # Update existing document - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "file_id": file_id, - "file_name": file_name, - "mime_type": mime_type, - "document_type": "Google Drive File (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Google Drive File: {file_name}\n\nType: {mime_type}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - existing_document.title = f"Drive: {file_name}" - existing_document.content = summary_content - 
existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "file_id": file_id, - "file_name": file_name, - "FILE_NAME": file_name, # For compatibility - "mime_type": mime_type, - "connector_id": connector_id, - "source": "composio", - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - return 1, 0, processing_errors # Indexed - - # Create new document - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "file_id": file_id, - "file_name": file_name, - "mime_type": mime_type, - "document_type": "Google Drive File (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Google Drive File: {file_name}\n\nType: {mime_type}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - document = Document( - search_space_id=search_space_id, - title=f"Drive: {file_name}", - document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]), - document_metadata={ - "file_id": file_id, - "file_name": file_name, - "FILE_NAME": file_name, # For compatibility - "mime_type": mime_type, - "connector_id": connector_id, - "toolkit_id": "googledrive", - "source": "composio", - }, - content=summary_content, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - ) - session.add(document) - - return 1, 0, processing_errors # Indexed - - -async def _fetch_folder_files_recursively( - composio_connector: ComposioConnector, - folder_id: str, - max_files: int = 100, - current_count: int = 0, - depth: int = 0, - max_depth: int = 10, -) -> list[dict[str, Any]]: - """ - Recursively fetch files from a Google Drive folder via Composio. 
- - Args: - composio_connector: The Composio connector instance - folder_id: Google Drive folder ID - max_files: Maximum number of files to fetch - current_count: Current number of files already fetched - depth: Current recursion depth - max_depth: Maximum recursion depth to prevent infinite loops - - Returns: - List of file info dictionaries - """ - if depth >= max_depth: - logger.warning(f"Max recursion depth reached for folder {folder_id}") - return [] - - if current_count >= max_files: - return [] - - all_files = [] - page_token = None - - try: - while len(all_files) + current_count < max_files: - files, next_token, error = await composio_connector.list_drive_files( - folder_id=folder_id, - page_token=page_token, - page_size=min(100, max_files - len(all_files) - current_count), - ) - - if error: - logger.warning( - f"Error fetching files from subfolder {folder_id}: {error}" - ) - break - - for file_info in files: - mime_type = file_info.get("mimeType", "") or file_info.get( - "mime_type", "" - ) - - if mime_type == "application/vnd.google-apps.folder": - # Recursively fetch from subfolders - subfolder_files = await _fetch_folder_files_recursively( - composio_connector, - file_info.get("id"), - max_files=max_files, - current_count=current_count + len(all_files), - depth=depth + 1, - max_depth=max_depth, - ) - all_files.extend(subfolder_files) - else: - all_files.append(file_info) - - if len(all_files) + current_count >= max_files: - break - - if not next_token: - break - page_token = next_token - - return all_files[: max_files - current_count] - - except Exception as e: - logger.error(f"Error in recursive folder fetch: {e!s}") - return all_files - - -async def _process_gmail_message_batch( - session: AsyncSession, - messages: list[dict[str, Any]], - composio_connector: ComposioConnector, - connector_id: int, - search_space_id: int, - user_id: str, - total_documents_indexed: int = 0, -) -> tuple[int, int]: - """ - Process a batch of Gmail messages and index them. - - Args: - total_documents_indexed: Running total of documents indexed so far (for batch commits). 
- - Returns: - Tuple of (documents_indexed, documents_skipped) - """ - documents_indexed = 0 - documents_skipped = 0 - - for message in messages: - try: - # Composio uses 'messageId' (camelCase), not 'id' - message_id = message.get("messageId", "") or message.get("id", "") - if not message_id: - documents_skipped += 1 - continue - - # Composio's GMAIL_FETCH_EMAILS already returns full message content - # No need for a separate detail API call - - # Extract message info from Composio response - # Composio structure: messageId, messageText, messageTimestamp, payload.headers, labelIds - payload = message.get("payload", {}) - headers = payload.get("headers", []) - - subject = "No Subject" - sender = "Unknown Sender" - date_str = message.get("messageTimestamp", "Unknown Date") - - for header in headers: - name = header.get("name", "").lower() - value = header.get("value", "") - if name == "subject": - subject = value - elif name == "from": - sender = value - elif name == "date": - date_str = value - - # Format to markdown using the full message data - markdown_content = composio_connector.format_gmail_message_to_markdown( - message - ) - - # Check for empty content (defensive parsing per Composio best practices) - if not markdown_content.strip(): - logger.warning(f"Skipping Gmail message with no content: {subject}") - documents_skipped += 1 - continue - - # Generate unique identifier - document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]) - unique_identifier_hash = generate_unique_identifier_hash( - document_type, f"gmail_{message_id}", search_space_id - ) - - content_hash = generate_content_hash(markdown_content, search_space_id) - - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - # Get label IDs from Composio response - label_ids = message.get("labelIds", []) - # Extract thread_id if available (for consistency with non-Composio implementation) - thread_id = message.get("threadId", "") or message.get("thread_id", "") - - if existing_document: - if existing_document.content_hash == content_hash: - documents_skipped += 1 - continue - - # Update existing - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "message_id": message_id, - "thread_id": thread_id, - "subject": subject, - "sender": sender, - "document_type": "Gmail Message (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - existing_document.title = f"Gmail: {subject}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "message_id": message_id, - "thread_id": thread_id, - "subject": subject, - "sender": sender, - "date": date_str, - "labels": label_ids, - "connector_id": connector_id, - "source": "composio", - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - - # Batch commit every 10 documents - current_total = total_documents_indexed + documents_indexed - if current_total % 10 == 0: - logger.info( - f"Committing batch: {current_total} Gmail messages 
processed so far" - ) - await session.commit() - continue - - # Create new document - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "message_id": message_id, - "thread_id": thread_id, - "subject": subject, - "sender": sender, - "document_type": "Gmail Message (Composio)", - } - summary_content, summary_embedding = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - document = Document( - search_space_id=search_space_id, - title=f"Gmail: {subject}", - document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]), - document_metadata={ - "message_id": message_id, - "thread_id": thread_id, - "subject": subject, - "sender": sender, - "date": date_str, - "labels": label_ids, - "connector_id": connector_id, - "toolkit_id": "gmail", - "source": "composio", - }, - content=summary_content, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - ) - session.add(document) - documents_indexed += 1 - - # Batch commit every 10 documents - current_total = total_documents_indexed + documents_indexed - if current_total % 10 == 0: - logger.info( - f"Committing batch: {current_total} Gmail messages processed so far" - ) - await session.commit() - - except Exception as e: - logger.error(f"Error processing Gmail message: {e!s}", exc_info=True) - documents_skipped += 1 - # Rollback on error to avoid partial state (per Composio best practices) - try: - await session.rollback() - except Exception as rollback_error: - logger.error( - f"Error during rollback: {rollback_error!s}", exc_info=True - ) - continue - - return documents_indexed, documents_skipped - - -async def _index_composio_gmail( - session: AsyncSession, - connector, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str | None, - end_date: str | None, - task_logger: TaskLoggingService, - log_entry, - update_last_indexed: bool = True, - max_items: int = 1000, -) -> tuple[int, str]: - """Index Gmail messages via Composio with pagination and incremental processing.""" - try: - composio_connector = ComposioConnector(session, connector_id) - - # Normalize date values - handle "undefined" strings from frontend - if start_date == "undefined" or start_date == "": - start_date = None - if end_date == "undefined" or end_date == "": - end_date = None - - # Use provided dates directly if both are provided, otherwise calculate from last_indexed_at - # This ensures user-selected dates are respected (matching non-Composio Gmail connector behavior) - if start_date is not None and end_date is not None: - # User provided both dates - use them directly - start_date_str = start_date - end_date_str = end_date - else: - # Calculate date range with defaults (uses last_indexed_at or 365 days back) - # This ensures indexing works even when user doesn't specify dates - start_date_str, end_date_str = calculate_date_range( - connector, start_date, end_date, default_days_back=365 - ) - - # Build query with date range - query_parts = [] - if start_date_str: - query_parts.append(f"after:{start_date_str.replace('-', '/')}") - if end_date_str: - query_parts.append(f"before:{end_date_str.replace('-', 
'/')}") - query = " ".join(query_parts) if query_parts else "" - - logger.info( - f"Gmail query for connector {connector_id}: '{query}' " - f"(start_date={start_date_str}, end_date={end_date_str})" - ) - - # Use smaller batch size to avoid 413 payload too large errors - batch_size = 50 - page_token = None - total_documents_indexed = 0 - total_documents_skipped = 0 - total_messages_fetched = 0 - result_size_estimate = None # Will be set from first API response - - while total_messages_fetched < max_items: - # Calculate how many messages to fetch in this batch - remaining = max_items - total_messages_fetched - current_batch_size = min(batch_size, remaining) - - # Use result_size_estimate if available, otherwise fall back to max_items - estimated_total = ( - result_size_estimate if result_size_estimate is not None else max_items - ) - # Cap estimated_total at max_items to avoid showing misleading progress - estimated_total = min(estimated_total, max_items) - - await task_logger.log_task_progress( - log_entry, - f"Fetching Gmail messages batch via Composio for connector {connector_id} " - f"({total_messages_fetched}/{estimated_total} fetched, {total_documents_indexed} indexed)", - { - "stage": "fetching_messages", - "batch_size": current_batch_size, - "total_fetched": total_messages_fetched, - "total_indexed": total_documents_indexed, - "estimated_total": estimated_total, - }, - ) - - # Fetch batch of messages - ( - messages, - next_token, - result_size_estimate_batch, - error, - ) = await composio_connector.list_gmail_messages( - query=query, - max_results=current_batch_size, - page_token=page_token, - ) - - if error: - await task_logger.log_task_failure( - log_entry, f"Failed to fetch Gmail messages: {error}", {} - ) - return 0, f"Failed to fetch Gmail messages: {error}" - - if not messages: - # No more messages available - break - - # Update result_size_estimate from first response (Gmail provides this estimate) - if result_size_estimate is None and result_size_estimate_batch is not None: - result_size_estimate = result_size_estimate_batch - logger.info( - f"Gmail API estimated {result_size_estimate} total messages for query: '{query}'" - ) - - total_messages_fetched += len(messages) - # Recalculate estimated_total after potentially updating result_size_estimate - estimated_total = ( - result_size_estimate if result_size_estimate is not None else max_items - ) - estimated_total = min(estimated_total, max_items) - - logger.info( - f"Fetched batch of {len(messages)} Gmail messages " - f"(total: {total_messages_fetched}/{estimated_total})" - ) - - # Process batch incrementally - batch_indexed, batch_skipped = await _process_gmail_message_batch( - session=session, - messages=messages, - composio_connector=composio_connector, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - total_documents_indexed=total_documents_indexed, - ) - - total_documents_indexed += batch_indexed - total_documents_skipped += batch_skipped - - logger.info( - f"Processed batch: {batch_indexed} indexed, {batch_skipped} skipped " - f"(total: {total_documents_indexed} indexed, {total_documents_skipped} skipped)" - ) - - # Batch commits happen in _process_gmail_message_batch every 10 documents - # This ensures progress is saved incrementally, preventing data loss on crashes - - # Check if we should continue - if not next_token: - # No more pages available - break - - if len(messages) < current_batch_size: - # Last page had fewer items than requested, we're done - break - - # Continue with 
next page - page_token = next_token - - if total_messages_fetched == 0: - success_msg = "No Gmail messages found in the specified date range" - await task_logger.log_task_success( - log_entry, success_msg, {"messages_count": 0} - ) - # CRITICAL: Update timestamp even when no messages found so Electric SQL syncs and UI shows indexed status - await update_connector_last_indexed(session, connector, update_last_indexed) - await session.commit() - return 0, None # Return None (not error) when no items found - - # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs - # This ensures the UI shows "Last indexed" instead of "Never indexed" - await update_connector_last_indexed(session, connector, update_last_indexed) - - # Final commit to ensure all documents are persisted (safety net) - # This matches the pattern used in non-Composio Gmail indexer - logger.info( - f"Final commit: Total {total_documents_indexed} Gmail messages processed" - ) - await session.commit() - logger.info( - "Successfully committed all Composio Gmail document changes to database" - ) - - await task_logger.log_task_success( - log_entry, - f"Successfully completed Gmail indexing via Composio for connector {connector_id}", - { - "documents_indexed": total_documents_indexed, - "documents_skipped": total_documents_skipped, - "messages_fetched": total_messages_fetched, - }, - ) - - return total_documents_indexed, None - - except Exception as e: - logger.error(f"Failed to index Gmail via Composio: {e!s}", exc_info=True) - return 0, f"Failed to index Gmail via Composio: {e!s}" - - -async def _index_composio_google_calendar( - session: AsyncSession, - connector, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str | None, - end_date: str | None, - task_logger: TaskLoggingService, - log_entry, - update_last_indexed: bool = True, - max_items: int = 2500, -) -> tuple[int, str]: - """Index Google Calendar events via Composio.""" - try: - composio_connector = ComposioConnector(session, connector_id) - - await task_logger.log_task_progress( - log_entry, - f"Fetching Google Calendar events via Composio for connector {connector_id}", - {"stage": "fetching_events"}, - ) - - # Normalize date values - handle "undefined" strings from frontend - if start_date == "undefined" or start_date == "": - start_date = None - if end_date == "undefined" or end_date == "": - end_date = None - - # Use provided dates directly if both are provided, otherwise calculate from last_indexed_at - # This ensures user-selected dates are respected (matching non-Composio Calendar connector behavior) - if start_date is not None and end_date is not None: - # User provided both dates - use them directly - start_date_str = start_date - end_date_str = end_date - else: - # Calculate date range with defaults (uses last_indexed_at or 365 days back) - # This ensures indexing works even when user doesn't specify dates - start_date_str, end_date_str = calculate_date_range( - connector, start_date, end_date, default_days_back=365 - ) - - # Build time range for API call - time_min = f"{start_date_str}T00:00:00Z" - time_max = f"{end_date_str}T23:59:59Z" - - logger.info( - f"Google Calendar query for connector {connector_id}: " - f"(start_date={start_date_str}, end_date={end_date_str})" - ) - - events, error = await composio_connector.list_calendar_events( - time_min=time_min, - time_max=time_max, - max_results=max_items, - ) - - if error: - await task_logger.log_task_failure( - log_entry, f"Failed to fetch Calendar events: 
{error}", {} - ) - return 0, f"Failed to fetch Calendar events: {error}" - - if not events: - success_msg = "No Google Calendar events found in the specified date range" - await task_logger.log_task_success( - log_entry, success_msg, {"events_count": 0} - ) - # CRITICAL: Update timestamp even when no events found so Electric SQL syncs and UI shows indexed status - await update_connector_last_indexed(session, connector, update_last_indexed) - await session.commit() - return ( - 0, - None, - ) # Return None (not error) when no items found - this is success with 0 items - - logger.info(f"Found {len(events)} Google Calendar events to index via Composio") - - documents_indexed = 0 - documents_skipped = 0 - - for event in events: - try: - # Handle both standard Google API and potential Composio variations - event_id = event.get("id", "") or event.get("eventId", "") - summary = ( - event.get("summary", "") or event.get("title", "") or "No Title" - ) - - if not event_id: - documents_skipped += 1 - continue - - # Format to markdown - markdown_content = composio_connector.format_calendar_event_to_markdown( - event - ) - - # Generate unique identifier - document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"]) - unique_identifier_hash = generate_unique_identifier_hash( - document_type, f"calendar_{event_id}", search_space_id - ) - - content_hash = generate_content_hash(markdown_content, search_space_id) - - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - # Extract event times - start = event.get("start", {}) - end = event.get("end", {}) - start_time = start.get("dateTime") or start.get("date", "") - end_time = end.get("dateTime") or end.get("date", "") - location = event.get("location", "") - - if existing_document: - if existing_document.content_hash == content_hash: - documents_skipped += 1 - continue - - # Update existing - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "event_id": event_id, - "summary": summary, - "start_time": start_time, - "document_type": "Google Calendar Event (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}" - if location: - summary_content += f"\nLocation: {location}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - existing_document.title = f"Calendar: {summary}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "event_id": event_id, - "summary": summary, - "start_time": start_time, - "end_time": end_time, - "location": location, - "connector_id": connector_id, - "source": "composio", - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - - # Batch commit every 10 documents - if documents_indexed % 10 == 0: - logger.info( - f"Committing batch: {documents_indexed} Google Calendar events processed so far" - ) - await session.commit() - continue - - # Create new document - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "event_id": event_id, - "summary": 
summary, - "start_time": start_time, - "document_type": "Google Calendar Event (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}" - ) - if location: - summary_content += f"\nLocation: {location}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - document = Document( - search_space_id=search_space_id, - title=f"Calendar: {summary}", - document_type=DocumentType( - TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"] - ), - document_metadata={ - "event_id": event_id, - "summary": summary, - "start_time": start_time, - "end_time": end_time, - "location": location, - "connector_id": connector_id, - "toolkit_id": "googlecalendar", - "source": "composio", - }, - content=summary_content, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - ) - session.add(document) - documents_indexed += 1 - - # Batch commit every 10 documents - if documents_indexed % 10 == 0: - logger.info( - f"Committing batch: {documents_indexed} Google Calendar events processed so far" - ) - await session.commit() - - except Exception as e: - logger.error(f"Error processing Calendar event: {e!s}", exc_info=True) - documents_skipped += 1 - continue - - # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs - # This ensures the UI shows "Last indexed" instead of "Never indexed" - await update_connector_last_indexed(session, connector, update_last_indexed) - - # Final commit to ensure all documents are persisted (safety net) - # This matches the pattern used in non-Composio Gmail indexer - logger.info( - f"Final commit: Total {documents_indexed} Google Calendar events processed" - ) - await session.commit() - logger.info( - "Successfully committed all Composio Google Calendar document changes to database" - ) - - await task_logger.log_task_success( - log_entry, - f"Successfully completed Google Calendar indexing via Composio for connector {connector_id}", - { - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, - }, - ) - - return documents_indexed, None - - except Exception as e: - logger.error( - f"Failed to index Google Calendar via Composio: {e!s}", exc_info=True - ) - return 0, f"Failed to index Google Calendar via Composio: {e!s}" diff --git a/surfsense_web/components/assistant-ui/connector-popup/components/composio-connector-card.tsx b/surfsense_web/components/assistant-ui/connector-popup/components/composio-connector-card.tsx deleted file mode 100644 index 671fc3ce6..000000000 --- a/surfsense_web/components/assistant-ui/connector-popup/components/composio-connector-card.tsx +++ /dev/null @@ -1,78 +0,0 @@ -"use client"; - -import { Zap } from "lucide-react"; -import Image from "next/image"; -import type { FC } from "react"; -import { Button } from "@/components/ui/button"; -import { cn } from "@/lib/utils"; - -interface ComposioConnectorCardProps { - id: string; - title: string; - description: string; - connectorCount?: number; - onConnect: () => void; -} - -export const ComposioConnectorCard: FC = ({ - id, - title, - description, - connectorCount = 0, - onConnect, -}) => { - const hasConnections = connectorCount > 0; - - return ( -
-        [ComposioConnectorCard markup: Composio logo image, the connector {title}, a "{connectorCount} connection(s)" count when connected or {description} otherwise, and a Connect button wired to onConnect]
- ); -}; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx new file mode 100644 index 000000000..6e7a06073 --- /dev/null +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx @@ -0,0 +1,220 @@ +"use client"; + +import { Calendar, Clock } from "lucide-react"; +import type { FC } from "react"; +import { useEffect, useState } from "react"; +import { Label } from "@/components/ui/label"; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from "@/components/ui/select"; +import { Switch } from "@/components/ui/switch"; +import type { SearchSourceConnector } from "@/contracts/types/connector.types"; + +interface ComposioCalendarConfigProps { + connector: SearchSourceConnector; + onConfigChange?: (config: Record) => void; + onNameChange?: (name: string) => void; +} + +interface CalendarIndexingOptions { + max_events: number; + include_recurring: boolean; + include_past_events: boolean; + days_ahead: number; +} + +const DEFAULT_CALENDAR_OPTIONS: CalendarIndexingOptions = { + max_events: 500, + include_recurring: true, + include_past_events: true, + days_ahead: 365, +}; + +export const ComposioCalendarConfig: FC = ({ connector, onConfigChange }) => { + const isIndexable = connector.config?.is_indexable as boolean; + + // Initialize with existing options from connector config + const existingOptions = + (connector.config?.calendar_options as CalendarIndexingOptions | undefined) || DEFAULT_CALENDAR_OPTIONS; + + const [calendarOptions, setCalendarOptions] = useState(existingOptions); + + // Update options when connector config changes + useEffect(() => { + const options = + (connector.config?.calendar_options as CalendarIndexingOptions | undefined) || + DEFAULT_CALENDAR_OPTIONS; + setCalendarOptions(options); + }, [connector.config]); + + const updateConfig = (options: CalendarIndexingOptions) => { + if (onConfigChange) { + onConfigChange({ + ...connector.config, + calendar_options: options, + }); + } + }; + + const handleOptionChange = (key: keyof CalendarIndexingOptions, value: number | boolean) => { + const newOptions = { ...calendarOptions, [key]: value }; + setCalendarOptions(newOptions); + updateConfig(newOptions); + }; + + // Only show configuration if the connector is indexable + if (!isIndexable) { + return
; + } + + return ( +
+ {/* Calendar Indexing Options */} +
+
+
+ +

Calendar Indexing Options

+
+

+ Configure how events are indexed from your Google Calendar. +

+
+ + {/* Max events to index */} +
+
+
+ +

+ Maximum number of events to index per sync +

+
+ +
+
+ + {/* Days ahead */} +
+
+
+
+ + +
+

+ How far ahead to index future events +

+
+ +
+
+ + {/* Include recurring events toggle */} +
+
+ +

+ Index individual instances of recurring events +

+
+ + handleOptionChange("include_recurring", checked) + } + /> +
+ + {/* Include past events toggle */} +
+
+ +

+ Index events from before the selected date range +

+
+ + handleOptionChange("include_past_events", checked) + } + /> +
+
+
+ ); +}; + diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx deleted file mode 100644 index fdff956e5..000000000 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx +++ /dev/null @@ -1,353 +0,0 @@ -"use client"; - -import { File, FileSpreadsheet, FileText, FolderClosed, Image, Presentation } from "lucide-react"; -import type { FC } from "react"; -import { useEffect, useState } from "react"; -import { ComposioDriveFolderTree } from "@/components/connectors/composio-drive-folder-tree"; -import { Badge } from "@/components/ui/badge"; -import { Button } from "@/components/ui/button"; -import { Label } from "@/components/ui/label"; -import { - Select, - SelectContent, - SelectItem, - SelectTrigger, - SelectValue, -} from "@/components/ui/select"; -import { Switch } from "@/components/ui/switch"; -import type { SearchSourceConnector } from "@/contracts/types/connector.types"; -import { cn } from "@/lib/utils"; - -interface ComposioConfigProps { - connector: SearchSourceConnector; - onConfigChange?: (config: Record) => void; - onNameChange?: (name: string) => void; -} - -interface SelectedFolder { - id: string; - name: string; -} - -interface IndexingOptions { - max_files_per_folder: number; - incremental_sync: boolean; - include_subfolders: boolean; -} - -const DEFAULT_INDEXING_OPTIONS: IndexingOptions = { - max_files_per_folder: 100, - incremental_sync: true, - include_subfolders: true, -}; - -// Helper to get appropriate icon for file type based on file name -function getFileIconFromName(fileName: string, className: string = "size-3.5 shrink-0") { - const lowerName = fileName.toLowerCase(); - // Spreadsheets - if ( - lowerName.endsWith(".xlsx") || - lowerName.endsWith(".xls") || - lowerName.endsWith(".csv") || - lowerName.includes("spreadsheet") - ) { - return ; - } - // Presentations - if ( - lowerName.endsWith(".pptx") || - lowerName.endsWith(".ppt") || - lowerName.includes("presentation") - ) { - return ; - } - // Documents (word, text only - not PDF) - if ( - lowerName.endsWith(".docx") || - lowerName.endsWith(".doc") || - lowerName.endsWith(".txt") || - lowerName.includes("document") || - lowerName.includes("word") || - lowerName.includes("text") - ) { - return ; - } - // Images - if ( - lowerName.endsWith(".png") || - lowerName.endsWith(".jpg") || - lowerName.endsWith(".jpeg") || - lowerName.endsWith(".gif") || - lowerName.endsWith(".webp") || - lowerName.endsWith(".svg") - ) { - return ; - } - // Default (including PDF) - return ; -} - -export const ComposioConfig: FC = ({ connector, onConfigChange }) => { - const toolkitId = connector.config?.toolkit_id as string; - const isIndexable = connector.config?.is_indexable as boolean; - const composioAccountId = connector.config?.composio_connected_account_id as string; - - // Check if this is a Google Drive Composio connector - const isGoogleDrive = toolkitId === "googledrive"; - - // Initialize with existing selected folders and files from connector config - const existingFolders = - (connector.config?.selected_folders as SelectedFolder[] | undefined) || []; - const existingFiles = (connector.config?.selected_files as SelectedFolder[] | undefined) || []; - const existingIndexingOptions = - (connector.config?.indexing_options as IndexingOptions | undefined) || DEFAULT_INDEXING_OPTIONS; - - const [selectedFolders, 
setSelectedFolders] = useState(existingFolders); - const [selectedFiles, setSelectedFiles] = useState(existingFiles); - const [showFolderSelector, setShowFolderSelector] = useState(false); - const [indexingOptions, setIndexingOptions] = useState(existingIndexingOptions); - - // Update selected folders and files when connector config changes - useEffect(() => { - const folders = (connector.config?.selected_folders as SelectedFolder[] | undefined) || []; - const files = (connector.config?.selected_files as SelectedFolder[] | undefined) || []; - const options = - (connector.config?.indexing_options as IndexingOptions | undefined) || - DEFAULT_INDEXING_OPTIONS; - setSelectedFolders(folders); - setSelectedFiles(files); - setIndexingOptions(options); - }, [connector.config]); - - const updateConfig = ( - folders: SelectedFolder[], - files: SelectedFolder[], - options: IndexingOptions - ) => { - if (onConfigChange) { - onConfigChange({ - ...connector.config, - selected_folders: folders, - selected_files: files, - indexing_options: options, - }); - } - }; - - const handleSelectFolders = (folders: SelectedFolder[]) => { - setSelectedFolders(folders); - updateConfig(folders, selectedFiles, indexingOptions); - }; - - const handleSelectFiles = (files: SelectedFolder[]) => { - setSelectedFiles(files); - updateConfig(selectedFolders, files, indexingOptions); - }; - - const handleIndexingOptionChange = (key: keyof IndexingOptions, value: number | boolean) => { - const newOptions = { ...indexingOptions, [key]: value }; - setIndexingOptions(newOptions); - updateConfig(selectedFolders, selectedFiles, newOptions); - }; - - const totalSelected = selectedFolders.length + selectedFiles.length; - - return ( -
-        [ComposioConfig markup: a Connection Details panel (toolkit id, "Indexing Supported" badge, Composio account id), the Google Drive folder and file selector with selected-item chips and per-type file icons, and Indexing Options (max-files-per-folder select, include-subfolders switch)]
- ); -}; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx new file mode 100644 index 000000000..755b91a5a --- /dev/null +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx @@ -0,0 +1,313 @@ +"use client"; + +import { File, FileSpreadsheet, FileText, FolderClosed, Image, Presentation } from "lucide-react"; +import type { FC } from "react"; +import { useEffect, useState } from "react"; +import { ComposioDriveFolderTree } from "@/components/connectors/composio-drive-folder-tree"; +import { Button } from "@/components/ui/button"; +import { Label } from "@/components/ui/label"; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from "@/components/ui/select"; +import { Switch } from "@/components/ui/switch"; +import type { SearchSourceConnector } from "@/contracts/types/connector.types"; + +interface ComposioDriveConfigProps { + connector: SearchSourceConnector; + onConfigChange?: (config: Record) => void; + onNameChange?: (name: string) => void; +} + +interface SelectedFolder { + id: string; + name: string; +} + +interface IndexingOptions { + max_files_per_folder: number; + incremental_sync: boolean; + include_subfolders: boolean; +} + +const DEFAULT_INDEXING_OPTIONS: IndexingOptions = { + max_files_per_folder: 100, + incremental_sync: true, + include_subfolders: true, +}; + +// Helper to get appropriate icon for file type based on file name +function getFileIconFromName(fileName: string, className: string = "size-3.5 shrink-0") { + const lowerName = fileName.toLowerCase(); + // Spreadsheets + if ( + lowerName.endsWith(".xlsx") || + lowerName.endsWith(".xls") || + lowerName.endsWith(".csv") || + lowerName.includes("spreadsheet") + ) { + return ; + } + // Presentations + if ( + lowerName.endsWith(".pptx") || + lowerName.endsWith(".ppt") || + lowerName.includes("presentation") + ) { + return ; + } + // Documents (word, text only - not PDF) + if ( + lowerName.endsWith(".docx") || + lowerName.endsWith(".doc") || + lowerName.endsWith(".txt") || + lowerName.includes("document") || + lowerName.includes("word") || + lowerName.includes("text") + ) { + return ; + } + // Images + if ( + lowerName.endsWith(".png") || + lowerName.endsWith(".jpg") || + lowerName.endsWith(".jpeg") || + lowerName.endsWith(".gif") || + lowerName.endsWith(".webp") || + lowerName.endsWith(".svg") + ) { + return ; + } + // Default (including PDF) + return ; +} + +export const ComposioDriveConfig: FC = ({ connector, onConfigChange }) => { + const isIndexable = connector.config?.is_indexable as boolean; + + // Initialize with existing selected folders and files from connector config + const existingFolders = + (connector.config?.selected_folders as SelectedFolder[] | undefined) || []; + const existingFiles = (connector.config?.selected_files as SelectedFolder[] | undefined) || []; + const existingIndexingOptions = + (connector.config?.indexing_options as IndexingOptions | undefined) || DEFAULT_INDEXING_OPTIONS; + + const [selectedFolders, setSelectedFolders] = useState(existingFolders); + const [selectedFiles, setSelectedFiles] = useState(existingFiles); + const [showFolderSelector, setShowFolderSelector] = useState(false); + const [indexingOptions, setIndexingOptions] = useState(existingIndexingOptions); + + // Update selected folders and files when 
connector config changes + useEffect(() => { + const folders = (connector.config?.selected_folders as SelectedFolder[] | undefined) || []; + const files = (connector.config?.selected_files as SelectedFolder[] | undefined) || []; + const options = + (connector.config?.indexing_options as IndexingOptions | undefined) || + DEFAULT_INDEXING_OPTIONS; + setSelectedFolders(folders); + setSelectedFiles(files); + setIndexingOptions(options); + }, [connector.config]); + + const updateConfig = ( + folders: SelectedFolder[], + files: SelectedFolder[], + options: IndexingOptions + ) => { + if (onConfigChange) { + onConfigChange({ + ...connector.config, + selected_folders: folders, + selected_files: files, + indexing_options: options, + }); + } + }; + + const handleSelectFolders = (folders: SelectedFolder[]) => { + setSelectedFolders(folders); + updateConfig(folders, selectedFiles, indexingOptions); + }; + + const handleSelectFiles = (files: SelectedFolder[]) => { + setSelectedFiles(files); + updateConfig(selectedFolders, files, indexingOptions); + }; + + const handleIndexingOptionChange = (key: keyof IndexingOptions, value: number | boolean) => { + const newOptions = { ...indexingOptions, [key]: value }; + setIndexingOptions(newOptions); + updateConfig(selectedFolders, selectedFiles, newOptions); + }; + + const totalSelected = selectedFolders.length + selectedFiles.length; + + // Only show configuration if the connector is indexable + if (!isIndexable) { + return
; + } + + return ( +
+ {/* Folder & File Selection */} +
+
+

Folder & File Selection

+

+ Select specific folders and/or individual files to index from your Google Drive. +

+
+ + {totalSelected > 0 && ( +
+

+ Selected {totalSelected} item{totalSelected > 1 ? "s" : ""}: {(() => { + const parts: string[] = []; + if (selectedFolders.length > 0) { + parts.push( + `${selectedFolders.length} folder${selectedFolders.length > 1 ? "s" : ""}` + ); + } + if (selectedFiles.length > 0) { + parts.push( + `${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}` + ); + } + return parts.length > 0 ? `(${parts.join(" ")})` : ""; + })()} +

+
+ {selectedFolders.map((folder) => ( +

+ + {folder.name} +

+ ))} + {selectedFiles.map((file) => ( +

+ {getFileIconFromName(file.name)} + {file.name} +

+ ))} +
+
+ )} + + {showFolderSelector ? ( +
+ + +
+ ) : ( + + )} +
+ + {/* Indexing Options */} +
+
+

Indexing Options

+

+ Configure how files are indexed from your Google Drive. +

+
+ + {/* Max files per folder */} +
+
+
+ +

+ Maximum number of files to index from each folder +

+
+ +
+
+ + {/* Include subfolders toggle */} +
+
+ +

+ Recursively index files in subfolders of selected folders +

+
+ + handleIndexingOptionChange("include_subfolders", checked) + } + /> +
+
+
+ ); +}; + diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx new file mode 100644 index 000000000..963753ab3 --- /dev/null +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx @@ -0,0 +1,174 @@ +"use client"; + +import { Mail, Tag } from "lucide-react"; +import type { FC } from "react"; +import { useEffect, useState } from "react"; +import { Input } from "@/components/ui/input"; +import { Label } from "@/components/ui/label"; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from "@/components/ui/select"; +import type { SearchSourceConnector } from "@/contracts/types/connector.types"; + +interface ComposioGmailConfigProps { + connector: SearchSourceConnector; + onConfigChange?: (config: Record) => void; + onNameChange?: (name: string) => void; +} + +interface GmailIndexingOptions { + max_emails: number; + label_filter: string; + search_query: string; +} + +const DEFAULT_GMAIL_OPTIONS: GmailIndexingOptions = { + max_emails: 500, + label_filter: "", + search_query: "", +}; + +export const ComposioGmailConfig: FC = ({ connector, onConfigChange }) => { + const isIndexable = connector.config?.is_indexable as boolean; + + // Initialize with existing options from connector config + const existingOptions = + (connector.config?.gmail_options as GmailIndexingOptions | undefined) || DEFAULT_GMAIL_OPTIONS; + + const [gmailOptions, setGmailOptions] = useState(existingOptions); + + // Update options when connector config changes + useEffect(() => { + const options = + (connector.config?.gmail_options as GmailIndexingOptions | undefined) || + DEFAULT_GMAIL_OPTIONS; + setGmailOptions(options); + }, [connector.config]); + + const updateConfig = (options: GmailIndexingOptions) => { + if (onConfigChange) { + onConfigChange({ + ...connector.config, + gmail_options: options, + }); + } + }; + + const handleOptionChange = (key: keyof GmailIndexingOptions, value: number | string) => { + const newOptions = { ...gmailOptions, [key]: value }; + setGmailOptions(newOptions); + updateConfig(newOptions); + }; + + // Only show configuration if the connector is indexable + if (!isIndexable) { + return
; + } + + return ( +
+ {/* Gmail Indexing Options */} +
+
+
+ +

Gmail Indexing Options

+
+

+ Configure how emails are indexed from your Gmail account. +

+
+ + {/* Max emails to index */} +
+
+
+ +

+ Maximum number of emails to index per sync +

+
+ +
+
+ + {/* Label filter */} +
+
+
+ + +
+

+ Only index emails with this label (e.g., "INBOX", "IMPORTANT", "work") +

+
+ handleOptionChange("label_filter", e.target.value)} + placeholder="Enter label name..." + className="bg-slate-400/5 dark:bg-slate-400/5 border-slate-400/20 text-xs sm:text-sm" + /> +
+ + {/* Search query */} +
+
+ +

+ Gmail search query to filter emails (e.g., "from:boss@company.com", "has:attachment") +

+
+ handleOptionChange("search_query", e.target.value)} + placeholder="Enter Gmail search query..." + className="bg-slate-400/5 dark:bg-slate-400/5 border-slate-400/20 text-xs sm:text-sm" + /> +
+
+
+ ); +}; + diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx index 1a713a5a0..6b4d86b5a 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx @@ -6,7 +6,9 @@ import { BaiduSearchApiConfig } from "./components/baidu-search-api-config"; import { BookStackConfig } from "./components/bookstack-config"; import { CirclebackConfig } from "./components/circleback-config"; import { ClickUpConfig } from "./components/clickup-config"; -import { ComposioConfig } from "./components/composio-config"; +import { ComposioCalendarConfig } from "./components/composio-calendar-config"; +import { ComposioDriveConfig } from "./components/composio-drive-config"; +import { ComposioGmailConfig } from "./components/composio-gmail-config"; import { ConfluenceConfig } from "./components/confluence-config"; import { DiscordConfig } from "./components/discord-config"; import { ElasticsearchConfig } from "./components/elasticsearch-config"; @@ -78,9 +80,11 @@ export function getConnectorConfigComponent( case "OBSIDIAN_CONNECTOR": return ObsidianConfig; case "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": + return ComposioDriveConfig; case "COMPOSIO_GMAIL_CONNECTOR": + return ComposioGmailConfig; case "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": - return ComposioConfig; + return ComposioCalendarConfig; // OAuth connectors (Gmail, Calendar, Airtable, Notion) and others don't need special config UI default: return null; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx index fbdffed7a..6b1a8c92b 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx @@ -206,8 +206,9 @@ export const ConnectorEditView: FC = ({ {/* Date range selector and periodic sync - only shown for indexable connectors */} {connector.is_indexable && ( <> - {/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */} + {/* Date range selector - not shown for Google Drive (regular and Composio), Webcrawler, or GitHub (indexes full repo snapshots) */} {connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" && + connector.connector_type !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && connector.connector_type !== "WEBCRAWLER_CONNECTOR" && connector.connector_type !== "GITHUB_CONNECTOR" && ( = ({ onEndDateChange={onEndDateChange} allowFutureDates={ connector.connector_type === "GOOGLE_CALENDAR_CONNECTOR" || + connector.connector_type === "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR" || connector.connector_type === "LUMA_CONNECTOR" } /> diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx index 68fc688c3..17995fdfa 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx @@ -9,11 +9,7 @@ import { 
getConnectorTypeDisplay } from "@/lib/connectors/utils"; import { cn } from "@/lib/utils"; import { DateRangeSelector } from "../../components/date-range-selector"; import { PeriodicSyncConfig } from "../../components/periodic-sync-config"; -import { - COMPOSIO_CONNECTORS, - type IndexingConfigState, - OAUTH_CONNECTORS, -} from "../../constants/connector-constants"; +import type { IndexingConfigState } from "../../constants/connector-constants"; import { getConnectorDisplayName } from "../../tabs/all-connectors-tab"; import { getConnectorConfigComponent } from "../index"; @@ -95,11 +91,6 @@ export const IndexingConfigurationView: FC = ({ }; }, [checkScrollState]); - // Check both OAUTH_CONNECTORS and COMPOSIO_CONNECTORS - const authConnector = - OAUTH_CONNECTORS.find((c) => c.connectorType === connector?.connector_type) || - COMPOSIO_CONNECTORS.find((c) => c.connectorType === connector?.connector_type); - return (
{/* Fixed Header */} @@ -158,8 +149,9 @@ export const IndexingConfigurationView: FC = ({ {/* Date range selector and periodic sync - only shown for indexable connectors */} {connector?.is_indexable && ( <> - {/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */} + {/* Date range selector - not shown for Google Drive (regular and Composio), Webcrawler, or GitHub (indexes full repo snapshots) */} {config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && + config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && config.connectorType !== "WEBCRAWLER_CONNECTOR" && config.connectorType !== "GITHUB_CONNECTOR" && ( = ({ onEndDateChange={onEndDateChange} allowFutureDates={ config.connectorType === "GOOGLE_CALENDAR_CONNECTOR" || + config.connectorType === "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR" || config.connectorType === "LUMA_CONNECTOR" } /> )} - {/* Periodic sync - not shown for Google Drive */} - {config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && ( + {/* Periodic sync - not shown for Google Drive (regular and Composio) */} + {config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && + config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && ( { if ( params.success === "true" && - params.connector && searchSpaceId && params.modal === "connectors" ) { - const oauthConnector = OAUTH_CONNECTORS.find((c) => c.id === params.connector); - if (oauthConnector) { - refetchAllConnectors().then((result) => { - if (!result.data) return; + refetchAllConnectors().then((result) => { + if (!result.data) return; - let newConnector: SearchSourceConnector | undefined; - if (params.connectorId) { - const connectorId = parseInt(params.connectorId, 10); - newConnector = result.data.find((c: SearchSourceConnector) => c.id === connectorId); - } else { + let newConnector: SearchSourceConnector | undefined; + let oauthConnector: + | (typeof OAUTH_CONNECTORS)[number] + | (typeof COMPOSIO_CONNECTORS)[number] + | undefined; + + // First, try to find connector by connectorId if provided + if (params.connectorId) { + const connectorId = parseInt(params.connectorId, 10); + newConnector = result.data.find((c: SearchSourceConnector) => c.id === connectorId); + + // If we found the connector, find the matching OAuth/Composio connector by type + if (newConnector) { + oauthConnector = + OAUTH_CONNECTORS.find( + (c) => c.connectorType === newConnector!.connector_type + ) || + COMPOSIO_CONNECTORS.find( + (c) => c.connectorType === newConnector!.connector_type + ); + } + } + + // If we don't have a connector yet, try to find by connector param + if (!newConnector && params.connector) { + oauthConnector = + OAUTH_CONNECTORS.find((c) => c.id === params.connector) || + COMPOSIO_CONNECTORS.find((c) => c.id === params.connector); + + if (oauthConnector) { newConnector = result.data.find( - (c: SearchSourceConnector) => c.connector_type === oauthConnector.connectorType + (c: SearchSourceConnector) => c.connector_type === oauthConnector!.connectorType ); } + } - if (newConnector) { - const connectorValidation = searchSourceConnector.safeParse(newConnector); - if (connectorValidation.success) { - // Track connector connected event for OAuth connectors - trackConnectorConnected( - Number(searchSpaceId), - oauthConnector.connectorType, - newConnector.id - ); + if (newConnector && oauthConnector) { + const connectorValidation = searchSourceConnector.safeParse(newConnector); + if (connectorValidation.success) { + // Track connector connected event for OAuth/Composio connectors + 
trackConnectorConnected( + Number(searchSpaceId), + oauthConnector.connectorType, + newConnector.id + ); - const config = validateIndexingConfigState({ - connectorType: oauthConnector.connectorType, - connectorId: newConnector.id, - connectorTitle: oauthConnector.title, - }); - setIndexingConfig(config); - setIndexingConnector(newConnector); - setIndexingConnectorConfig(newConnector.config); - setIsOpen(true); - const url = new URL(window.location.href); - url.searchParams.delete("success"); - url.searchParams.set("connectorId", newConnector.id.toString()); - url.searchParams.set("view", "configure"); - window.history.replaceState({}, "", url.toString()); - } else { - console.warn("Invalid connector data after OAuth:", connectorValidation.error); - toast.error("Failed to validate connector data"); - } + const config = validateIndexingConfigState({ + connectorType: oauthConnector.connectorType, + connectorId: newConnector.id, + connectorTitle: oauthConnector.title, + }); + setIndexingConfig(config); + setIndexingConnector(newConnector); + setIndexingConnectorConfig(newConnector.config); + setIsOpen(true); + const url = new URL(window.location.href); + url.searchParams.delete("success"); + url.searchParams.set("connectorId", newConnector.id.toString()); + url.searchParams.set("view", "configure"); + window.history.replaceState({}, "", url.toString()); + } else { + console.warn("Invalid connector data after OAuth:", connectorValidation.error); + toast.error("Failed to validate connector data"); } - }); - } + } + }); } } catch (error) { // Invalid query params - log but don't crash @@ -863,9 +885,10 @@ export const useConnectorDialog = () => { async (refreshConnectors: () => void) => { if (!indexingConfig || !searchSpaceId) return; - // Validate date range (skip for Google Drive and Webcrawler) + // Validate date range (skip for Google Drive, Composio Drive, and Webcrawler) if ( indexingConfig.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && + indexingConfig.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && indexingConfig.connectorType !== "WEBCRAWLER_CONNECTOR" ) { const dateRangeValidation = dateRangeSchema.safeParse({ startDate, endDate }); @@ -910,8 +933,12 @@ export const useConnectorDialog = () => { }); } - // Handle Google Drive folder selection - if (indexingConfig.connectorType === "GOOGLE_DRIVE_CONNECTOR" && indexingConnectorConfig) { + // Handle Google Drive folder selection (regular and Composio) + if ( + (indexingConfig.connectorType === "GOOGLE_DRIVE_CONNECTOR" || + indexingConfig.connectorType === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR") && + indexingConnectorConfig + ) { const selectedFolders = indexingConnectorConfig.selected_folders as | Array<{ id: string; name: string }> | undefined; diff --git a/surfsense_web/lib/connectors/utils.ts b/surfsense_web/lib/connectors/utils.ts index 34721a6aa..0ca1c1ea9 100644 --- a/surfsense_web/lib/connectors/utils.ts +++ b/surfsense_web/lib/connectors/utils.ts @@ -16,6 +16,9 @@ export const getConnectorTypeDisplay = (type: string): string => { GOOGLE_CALENDAR_CONNECTOR: "Google Calendar", GOOGLE_GMAIL_CONNECTOR: "Google Gmail", GOOGLE_DRIVE_CONNECTOR: "Google Drive", + COMPOSIO_GOOGLE_DRIVE_CONNECTOR: "Google Drive", + COMPOSIO_GMAIL_CONNECTOR: "Gmail", + COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: "Google Calendar", AIRTABLE_CONNECTOR: "Airtable", LUMA_CONNECTOR: "Luma", ELASTICSEARCH_CONNECTOR: "Elasticsearch", From 12f45e1bd3a1d9c47b1543caf37558d76dbdec77 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> 
Date: Fri, 23 Jan 2026 20:19:04 +0530 Subject: [PATCH 13/28] feat: streamline Composio connector configurations and enhance UI interactions - Refactored Composio connector configuration components to improve modularity and maintainability. - Simplified the ComposioCalendarConfig, ComposioGmailConfig, and ComposioDriveConfig components by removing unnecessary state management and UI elements. - Added functionality to remove selected folders and files in the Google Drive and Composio Drive configurations, enhancing user experience. - Updated connector display names for better clarity in the UI. - Improved the overall structure of the connector edit view for better readability and usability. --- .../app/routes/composio_routes.py | 7 +- .../assistant-ui/connector-popup.tsx | 3 +- .../components/composio-calendar-config.tsx | 209 +----------------- .../components/composio-drive-config.tsx | 44 +++- .../components/composio-gmail-config.tsx | 163 +------------- .../components/google-drive-config.tsx | 44 +++- .../views/connector-edit-view.tsx | 3 +- .../tabs/active-connectors-tab.tsx | 5 +- 8 files changed, 88 insertions(+), 390 deletions(-) diff --git a/surfsense_backend/app/routes/composio_routes.py b/surfsense_backend/app/routes/composio_routes.py index 5ad2266b7..9e9b59f82 100644 --- a/surfsense_backend/app/routes/composio_routes.py +++ b/surfsense_backend/app/routes/composio_routes.py @@ -344,13 +344,16 @@ async def composio_callback( try: # Generate a unique, user-friendly connector name - connector_name = await generate_unique_connector_name( + # Pass just toolkit_name (without "(Composio)") to avoid redundancy + base_name = await generate_unique_connector_name( session, connector_type, space_id, user_id, - f"{toolkit_name} (Composio)", + toolkit_name, ) + # Append "(Composio)" suffix for identification + connector_name = f"{base_name} (Composio)" db_connector = SearchSourceConnector( name=connector_name, diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index a04e2a9fd..1ec8fad73 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -7,7 +7,7 @@ import type { FC } from "react"; import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms"; import { currentUserAtom } from "@/atoms/user/user-query.atoms"; import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button"; -import { Dialog, DialogContent } from "@/components/ui/dialog"; +import { Dialog, DialogContent, DialogTitle } from "@/components/ui/dialog"; import { Tabs, TabsContent } from "@/components/ui/tabs"; import type { SearchSourceConnector } from "@/contracts/types/connector.types"; import { useConnectorsElectric } from "@/hooks/use-connectors-electric"; @@ -185,6 +185,7 @@ export const ConnectorIndicator: FC = () => { + Manage Connectors {/* YouTube Crawler View - shown when adding YouTube videos */} {isYouTubeView && searchSpaceId ? 
( diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx index 6e7a06073..ce5133a9d 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx @@ -1,17 +1,6 @@ "use client"; -import { Calendar, Clock } from "lucide-react"; import type { FC } from "react"; -import { useEffect, useState } from "react"; -import { Label } from "@/components/ui/label"; -import { - Select, - SelectContent, - SelectItem, - SelectTrigger, - SelectValue, -} from "@/components/ui/select"; -import { Switch } from "@/components/ui/switch"; import type { SearchSourceConnector } from "@/contracts/types/connector.types"; interface ComposioCalendarConfigProps { @@ -20,201 +9,7 @@ interface ComposioCalendarConfigProps { onNameChange?: (name: string) => void; } -interface CalendarIndexingOptions { - max_events: number; - include_recurring: boolean; - include_past_events: boolean; - days_ahead: number; -} - -const DEFAULT_CALENDAR_OPTIONS: CalendarIndexingOptions = { - max_events: 500, - include_recurring: true, - include_past_events: true, - days_ahead: 365, -}; - -export const ComposioCalendarConfig: FC = ({ connector, onConfigChange }) => { - const isIndexable = connector.config?.is_indexable as boolean; - - // Initialize with existing options from connector config - const existingOptions = - (connector.config?.calendar_options as CalendarIndexingOptions | undefined) || DEFAULT_CALENDAR_OPTIONS; - - const [calendarOptions, setCalendarOptions] = useState(existingOptions); - - // Update options when connector config changes - useEffect(() => { - const options = - (connector.config?.calendar_options as CalendarIndexingOptions | undefined) || - DEFAULT_CALENDAR_OPTIONS; - setCalendarOptions(options); - }, [connector.config]); - - const updateConfig = (options: CalendarIndexingOptions) => { - if (onConfigChange) { - onConfigChange({ - ...connector.config, - calendar_options: options, - }); - } - }; - - const handleOptionChange = (key: keyof CalendarIndexingOptions, value: number | boolean) => { - const newOptions = { ...calendarOptions, [key]: value }; - setCalendarOptions(newOptions); - updateConfig(newOptions); - }; - - // Only show configuration if the connector is indexable - if (!isIndexable) { - return
;
-	}
-
-	return (
-		[removed render block, JSX markup stripped in extraction. Recoverable content: a "Calendar Indexing Options" panel ("Configure how events are indexed from your Google Calendar.") with a max-events Select ("Maximum number of events to index per sync"), a days-ahead Select ("How far ahead to index future events"), and Switch toggles for "Include recurring events" ("Index individual instances of recurring events") and "Include past events" ("Index events from before the selected date range"), each wired to handleOptionChange.]
-	);
+export const ComposioCalendarConfig: FC = () => {
+	return
; }; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx index 755b91a5a..0ab0869ff 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx @@ -1,6 +1,6 @@ "use client"; -import { File, FileSpreadsheet, FileText, FolderClosed, Image, Presentation } from "lucide-react"; +import { File, FileSpreadsheet, FileText, FolderClosed, Image, Presentation, X } from "lucide-react"; import type { FC } from "react"; import { useEffect, useState } from "react"; import { ComposioDriveFolderTree } from "@/components/connectors/composio-drive-folder-tree"; @@ -143,6 +143,18 @@ export const ComposioDriveConfig: FC = ({ connector, o updateConfig(selectedFolders, selectedFiles, newOptions); }; + const handleRemoveFolder = (folderId: string) => { + const newFolders = selectedFolders.filter((folder) => folder.id !== folderId); + setSelectedFolders(newFolders); + updateConfig(newFolders, selectedFiles, indexingOptions); + }; + + const handleRemoveFile = (fileId: string) => { + const newFiles = selectedFiles.filter((file) => file.id !== fileId); + setSelectedFiles(newFiles); + updateConfig(selectedFolders, newFiles, indexingOptions); + }; + const totalSelected = selectedFolders.length + selectedFiles.length; // Only show configuration if the connector is indexable @@ -176,29 +188,45 @@ export const ComposioDriveConfig: FC = ({ connector, o `${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}` ); } - return parts.length > 0 ? `(${parts.join(" ")})` : ""; + return parts.length > 0 ? `(${parts.join(", ")})` : ""; })()}

{selectedFolders.map((folder) => (
-	[folder chip, JSX stripped: {folder.name}]
+	[folder chip, JSX stripped: {folder.name} plus a new remove button (X icon) that calls handleRemoveFolder(folder.id)]
))}
{selectedFiles.map((file) => (
-	[file chip, JSX stripped: {getFileIconFromName(file.name)} {file.name}]
+	[file chip, JSX stripped: {getFileIconFromName(file.name)} {file.name} plus a new remove button (X icon) that calls handleRemoveFile(file.id)]
))}
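The remove handlers above share one pattern: filter the selection list by id, then write both lists back into the connector config in a single non-mutating update, mirroring the spread of connector.config in the TSX. A minimal sketch of that pattern in Python (the backend's language); the helper names are illustrative, not part of the codebase:

    def remove_by_id(items: list[dict], item_id: str) -> list[dict]:
        """Drop the entry whose "id" matches; keep the rest in order."""
        return [item for item in items if item["id"] != item_id]


    def updated_config(config: dict, folders: list[dict], files: list[dict]) -> dict:
        """Non-mutating merge, like spreading connector.config in the TSX."""
        return {**config, "selected_folders": folders, "selected_files": files}


    config = {"selected_folders": [{"id": "f1", "name": "Docs"}], "selected_files": []}
    config = updated_config(
        config, remove_by_id(config["selected_folders"], "f1"), config["selected_files"]
    )
    assert config["selected_folders"] == []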
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx index 963753ab3..4664e3e64 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx @@ -1,17 +1,6 @@ "use client"; -import { Mail, Tag } from "lucide-react"; import type { FC } from "react"; -import { useEffect, useState } from "react"; -import { Input } from "@/components/ui/input"; -import { Label } from "@/components/ui/label"; -import { - Select, - SelectContent, - SelectItem, - SelectTrigger, - SelectValue, -} from "@/components/ui/select"; import type { SearchSourceConnector } from "@/contracts/types/connector.types"; interface ComposioGmailConfigProps { @@ -20,155 +9,7 @@ interface ComposioGmailConfigProps { onNameChange?: (name: string) => void; } -interface GmailIndexingOptions { - max_emails: number; - label_filter: string; - search_query: string; -} - -const DEFAULT_GMAIL_OPTIONS: GmailIndexingOptions = { - max_emails: 500, - label_filter: "", - search_query: "", -}; - -export const ComposioGmailConfig: FC = ({ connector, onConfigChange }) => { - const isIndexable = connector.config?.is_indexable as boolean; - - // Initialize with existing options from connector config - const existingOptions = - (connector.config?.gmail_options as GmailIndexingOptions | undefined) || DEFAULT_GMAIL_OPTIONS; - - const [gmailOptions, setGmailOptions] = useState(existingOptions); - - // Update options when connector config changes - useEffect(() => { - const options = - (connector.config?.gmail_options as GmailIndexingOptions | undefined) || - DEFAULT_GMAIL_OPTIONS; - setGmailOptions(options); - }, [connector.config]); - - const updateConfig = (options: GmailIndexingOptions) => { - if (onConfigChange) { - onConfigChange({ - ...connector.config, - gmail_options: options, - }); - } - }; - - const handleOptionChange = (key: keyof GmailIndexingOptions, value: number | string) => { - const newOptions = { ...gmailOptions, [key]: value }; - setGmailOptions(newOptions); - updateConfig(newOptions); - }; - - // Only show configuration if the connector is indexable - if (!isIndexable) { - return
;
-	}
-
-	return (
-		[removed render block, JSX markup stripped in extraction. Recoverable content: a "Gmail Indexing Options" panel ("Configure how emails are indexed from your Gmail account.") with a max-emails Select ("Maximum number of emails to index per sync"), a label-filter Input (helper text: Only index emails with this label (e.g., "INBOX", "IMPORTANT", "work"); placeholder "Enter label name..."), and a search-query Input (helper text: Gmail search query to filter emails (e.g., "from:boss@company.com", "has:attachment"); placeholder "Enter Gmail search query..."), each wired to handleOptionChange.]
-	);
+export const ComposioGmailConfig: FC = () => {
+	return
; }; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx index 17f4a49a5..b6cfb39ae 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx @@ -1,6 +1,6 @@ "use client"; -import { File, FileSpreadsheet, FileText, FolderClosed, Image, Presentation } from "lucide-react"; +import { File, FileSpreadsheet, FileText, FolderClosed, Image, Presentation, X } from "lucide-react"; import type { FC } from "react"; import { useEffect, useState } from "react"; import { GoogleDriveFolderTree } from "@/components/connectors/google-drive-folder-tree"; @@ -135,6 +135,18 @@ export const GoogleDriveConfig: FC = ({ connector, onConfi updateConfig(selectedFolders, selectedFiles, newOptions); }; + const handleRemoveFolder = (folderId: string) => { + const newFolders = selectedFolders.filter((folder) => folder.id !== folderId); + setSelectedFolders(newFolders); + updateConfig(newFolders, selectedFiles, indexingOptions); + }; + + const handleRemoveFile = (fileId: string) => { + const newFiles = selectedFiles.filter((file) => file.id !== fileId); + setSelectedFiles(newFiles); + updateConfig(selectedFolders, newFiles, indexingOptions); + }; + const totalSelected = selectedFolders.length + selectedFiles.length; return ( @@ -161,29 +173,45 @@ export const GoogleDriveConfig: FC = ({ connector, onConfi if (selectedFiles.length > 0) { parts.push(`${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}`); } - return parts.length > 0 ? `(${parts.join(" ")})` : ""; + return parts.length > 0 ? `(${parts.join(", ")})` : ""; })()}

{selectedFolders.map((folder) => (
-	[folder chip, JSX stripped: {folder.name}]
+	[folder chip, JSX stripped: {folder.name} plus a new remove button (X icon) that calls handleRemoveFolder(folder.id)]
))}
{selectedFiles.map((file) => (
-	[file chip, JSX stripped: {getFileIconFromName(file.name)} {file.name}]
+	[file chip, JSX stripped: {getFileIconFromName(file.name)} {file.name} plus a new remove button (X icon) that calls handleRemoveFile(file.id)]
))}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx index 6b1a8c92b..8951336c5 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx @@ -9,6 +9,7 @@ import { cn } from "@/lib/utils"; import { DateRangeSelector } from "../../components/date-range-selector"; import { PeriodicSyncConfig } from "../../components/periodic-sync-config"; import { getConnectorConfigComponent } from "../index"; +import { getConnectorDisplayName } from "../../tabs/all-connectors-tab"; interface ConnectorEditViewProps { connector: SearchSourceConnector; @@ -151,7 +152,7 @@ export const ConnectorEditView: FC = ({

-	{connector.name}
+	{getConnectorDisplayName(connector.name)}

Manage your connector settings and sync configuration diff --git a/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx b/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx index e45888bb1..2067ca9ad 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx @@ -15,6 +15,7 @@ import { connectorsApiService } from "@/lib/apis/connectors-api.service"; import { cn } from "@/lib/utils"; import { COMPOSIO_CONNECTORS, OAUTH_CONNECTORS } from "../constants/connector-constants"; import { getDocumentCountForConnector } from "../utils/connector-document-mapping"; +import { getConnectorDisplayName } from "./all-connectors-tab"; interface ActiveConnectorsTabProps { searchQuery: string; @@ -263,8 +264,8 @@ export const ActiveConnectorsTab: FC = ({

-	[wrapper markup, JSX stripped]
-	{connector.name}
+	[wrapper markup, JSX stripped]
+	{getConnectorDisplayName(connector.name)}

{isIndexing ? ( From 08f16b43d72edff44bcd4621a43cad79a61ed103 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 23 Jan 2026 20:36:00 +0530 Subject: [PATCH 14/28] feat: enhance Composio connector naming logic and improve UI focus - Updated the Composio connector naming logic to dynamically generate user-friendly names based on existing connectors. - Introduced new utility functions for counting connectors and retrieving base names for specific connector types. - Enhanced the UI components to improve accessibility and focus management, ensuring a better user experience when interacting with connector dialogs. --- .../app/routes/composio_routes.py | 27 +++++++++++-------- .../app/utils/connector_naming.py | 3 +++ .../assistant-ui/connector-popup.tsx | 2 +- surfsense_web/components/ui/dialog.tsx | 2 +- 4 files changed, 21 insertions(+), 13 deletions(-) diff --git a/surfsense_backend/app/routes/composio_routes.py b/surfsense_backend/app/routes/composio_routes.py index 9e9b59f82..14ef9efcf 100644 --- a/surfsense_backend/app/routes/composio_routes.py +++ b/surfsense_backend/app/routes/composio_routes.py @@ -35,7 +35,10 @@ from app.services.composio_service import ( ComposioService, ) from app.users import current_active_user -from app.utils.connector_naming import generate_unique_connector_name +from app.utils.connector_naming import ( + count_connectors_of_type, + get_base_name_for_type, +) from app.utils.oauth_security import OAuthStateManager # Note: We no longer use check_duplicate_connector for Composio connectors because @@ -343,17 +346,19 @@ async def composio_callback( ) try: - # Generate a unique, user-friendly connector name - # Pass just toolkit_name (without "(Composio)") to avoid redundancy - base_name = await generate_unique_connector_name( - session, - connector_type, - space_id, - user_id, - toolkit_name, + # Count existing connectors of this type to determine the number + count = await count_connectors_of_type( + session, connector_type, space_id, user_id ) - # Append "(Composio)" suffix for identification - connector_name = f"{base_name} (Composio)" + + # Generate base name (e.g., "Gmail", "Google Drive") + base_name = get_base_name_for_type(connector_type) + + # Format: "Gmail (Composio) 1", "Gmail (Composio) 2", etc. 
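+ # Note: the branch below is equivalent to the single assignment
+ #     connector_name = f"{base_name} (Composio) {count + 1}"
+ # since count == 0 also yields suffix 1.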
+ if count == 0: + connector_name = f"{base_name} (Composio) 1" + else: + connector_name = f"{base_name} (Composio) {count + 1}" db_connector = SearchSourceConnector( name=connector_name, diff --git a/surfsense_backend/app/utils/connector_naming.py b/surfsense_backend/app/utils/connector_naming.py index a2b748a3a..7d3efc001 100644 --- a/surfsense_backend/app/utils/connector_naming.py +++ b/surfsense_backend/app/utils/connector_naming.py @@ -28,6 +28,9 @@ BASE_NAME_FOR_TYPE = { SearchSourceConnectorType.CONFLUENCE_CONNECTOR: "Confluence", SearchSourceConnectorType.AIRTABLE_CONNECTOR: "Airtable", SearchSourceConnectorType.MCP_CONNECTOR: "Model Context Protocol (MCP)", + SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR: "Gmail", + SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: "Google Drive", + SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: "Google Calendar", } diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index 1ec8fad73..e656c06d6 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -184,7 +184,7 @@ export const ConnectorIndicator: FC = () => { )} - + Manage Connectors {/* YouTube Crawler View - shown when adding YouTube videos */} {isYouTubeView && searchSpaceId ? ( diff --git a/surfsense_web/components/ui/dialog.tsx b/surfsense_web/components/ui/dialog.tsx index d04d76520..f3fa856d3 100644 --- a/surfsense_web/components/ui/dialog.tsx +++ b/surfsense_web/components/ui/dialog.tsx @@ -38,7 +38,7 @@ const DialogContent = React.forwardRef< Date: Fri, 23 Jan 2026 10:48:43 -0500 Subject: [PATCH 15/28] Reworded README.md around LLM compatibility (Based on discussion with Sid) --- README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 7f50b924c..0c5f06029 100644 --- a/README.md +++ b/README.md @@ -52,8 +52,10 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7 - Interact in Natural Language and get cited answers. ### 📄 **Cited Answers** - Get Cited answers just like Perplexity. +### 🧩 **Universal Compatibility** +- Connect virtually any inference provider via the OpenAI spec and LiteLLM. ### 🔔 **Privacy & Local LLM Support** -- Works Flawlessly with Ollama local LLMs. +- Works Flawlessly with local LLMs like vLLM and Ollama. ### 🏠 **Self Hostable** - Open source and easy to deploy locally. ### 👥 **Team Collaboration with RBAC** @@ -61,6 +63,7 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7 - Invite team members with customizable roles (Owner, Admin, Editor, Viewer) - Granular permissions for documents, chats, connectors, and settings - Share knowledge bases securely within your organization +- Team chats update in real-time and "Chat about the chat" in comment threads ### 🎙️ Podcasts - Blazingly fast podcast generation agent. (Creates a 3-minute podcast in under 20 seconds.) - Convert your chat conversations into engaging audio content @@ -237,6 +240,8 @@ Before self-hosting installation, make sure to complete the [prerequisite setup ### **BackEnd** +- **LiteLLM**: Universal LLM integration supporting 100+ models (OpenAI, Anthropic, Ollama, etc.) 
+ - **FastAPI**: Modern, fast web framework for building APIs with Python - **PostgreSQL with pgvector**: Database with vector search capabilities for similarity searches @@ -253,8 +258,6 @@ Before self-hosting installation, make sure to complete the [prerequisite setup - **LangChain**: Framework for developing AI-powered applications. -- **LiteLLM**: Universal LLM integration supporting 100+ models (OpenAI, Anthropic, Ollama, etc.) - - **Rerankers**: Advanced result ranking for improved search relevance - **Hybrid Search**: Combines vector similarity and full-text search for optimal results using Reciprocal Rank Fusion (RRF) From d20bb385b5439abc1c1a0dd4e73c275970c68bea Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 23 Jan 2026 23:03:29 +0530 Subject: [PATCH 16/28] feat: enhance date handling and indexing logic across connectors - Added normalization for "undefined" strings to None in date parameters to prevent parsing errors. - Improved date range validation to ensure start_date is strictly before end_date, adjusting end_date if necessary. - Updated Google Calendar and Composio connector indexing logic to handle duplicate content more effectively, logging warnings for skipped events. - Enhanced error handling during final commits to manage integrity errors gracefully. - Refactored date handling in various connector indexers for consistency and reliability. --- .../app/connectors/google_gmail_connector.py | 7 +++++ .../routes/search_source_connectors_routes.py | 26 ++++++++++++++----- .../app/tasks/connector_indexers/base.py | 7 +++++ .../google_calendar_indexer.py | 19 ++++++++++++++ .../tasks/connector_indexers/luma_indexer.py | 7 +++++ .../assistant-ui/connector-popup.tsx | 8 +++++- .../views/connector-edit-view.tsx | 3 +-- .../views/indexing-configuration-view.tsx | 3 +-- .../hooks/use-connector-dialog.ts | 16 ++++++++++-- 9 files changed, 83 insertions(+), 13 deletions(-) diff --git a/surfsense_backend/app/connectors/google_gmail_connector.py b/surfsense_backend/app/connectors/google_gmail_connector.py index 8c0e4690e..c86a96413 100644 --- a/surfsense_backend/app/connectors/google_gmail_connector.py +++ b/surfsense_backend/app/connectors/google_gmail_connector.py @@ -285,6 +285,13 @@ class GoogleGmailConnector: try: from datetime import datetime, timedelta + # Normalize date values - handle "undefined" strings from frontend + # This prevents "time data 'undefined' does not match format" errors + if start_date == "undefined" or start_date == "": + start_date = None + if end_date == "undefined" or end_date == "": + end_date = None + # Build date query query_parts = [] diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 82f452c61..928327d9a 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -644,20 +644,30 @@ async def index_connector_content( # Handle different connector types response_message = "" - today_str = datetime.now().strftime("%Y-%m-%d") + # Use UTC for consistency with last_indexed_at storage + today_str = datetime.now(UTC).strftime("%Y-%m-%d") # Determine the actual date range to use if start_date is None: # Use last_indexed_at or default to 365 days ago if connector.last_indexed_at: - today = datetime.now().date() - if connector.last_indexed_at.date() == today: + # Convert last_indexed_at to timezone-naive for comparison 
(like calculate_date_range does) + last_indexed_naive = ( + connector.last_indexed_at.replace(tzinfo=None) + if connector.last_indexed_at.tzinfo + else connector.last_indexed_at + ) + # Use UTC for "today" to match how last_indexed_at is stored + today_utc = datetime.now(UTC).replace(tzinfo=None).date() + last_indexed_date = last_indexed_naive.date() + + if last_indexed_date == today_utc: # If last indexed today, go back 1 day to ensure we don't miss anything - indexing_from = (today - timedelta(days=1)).strftime("%Y-%m-%d") + indexing_from = (today_utc - timedelta(days=1)).strftime("%Y-%m-%d") else: - indexing_from = connector.last_indexed_at.strftime("%Y-%m-%d") + indexing_from = last_indexed_naive.strftime("%Y-%m-%d") else: - indexing_from = (datetime.now() - timedelta(days=365)).strftime( + indexing_from = (datetime.now(UTC).replace(tzinfo=None) - timedelta(days=365)).strftime( "%Y-%m-%d" ) else: @@ -666,6 +676,7 @@ async def index_connector_content( # For calendar connectors, default to today but allow future dates if explicitly provided if connector.connector_type in [ SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR, + SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, SearchSourceConnectorType.LUMA_CONNECTOR, ]: # Default to today if no end_date provided (users can manually select future dates) @@ -977,6 +988,9 @@ async def index_connector_content( index_composio_connector_task, ) + # For Composio Gmail and Calendar, use the same date calculation logic as normal connectors + # This ensures consistent behavior and uses last_indexed_at to reduce API calls + # (includes special case: if indexed today, go back 1 day to avoid missing data) logger.info( f"Triggering Composio connector indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}" ) diff --git a/surfsense_backend/app/tasks/connector_indexers/base.py b/surfsense_backend/app/tasks/connector_indexers/base.py index b9a99808e..b390937f0 100644 --- a/surfsense_backend/app/tasks/connector_indexers/base.py +++ b/surfsense_backend/app/tasks/connector_indexers/base.py @@ -112,6 +112,13 @@ def calculate_date_range( Returns: Tuple of (start_date_str, end_date_str) """ + # Normalize "undefined" strings to None (from frontend) + # This prevents parsing errors and ensures consistent behavior across all indexers + if start_date == "undefined" or start_date == "": + start_date = None + if end_date == "undefined" or end_date == "": + end_date = None + if start_date is not None and end_date is not None: return start_date, end_date diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py index 09bb8de4b..7787560fa 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py @@ -4,6 +4,8 @@ Google Calendar connector indexer. 
from datetime import datetime, timedelta +import pytz +from dateutil.parser import isoparse from google.oauth2.credentials import Credentials from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession @@ -205,6 +207,23 @@ async def index_google_calendar_events( # Use provided dates (including future dates) start_date_str = start_date end_date_str = end_date + + # If start_date and end_date are the same, adjust end_date to be one day later + # to ensure valid date range (start_date must be strictly before end_date) + if start_date_str == end_date_str: + # Parse the date and add one day to ensure valid range + dt = isoparse(end_date_str) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=pytz.UTC) + else: + dt = dt.astimezone(pytz.UTC) + # Add one day to end_date to make it strictly after start_date + dt_end = dt + timedelta(days=1) + end_date_str = dt_end.strftime("%Y-%m-%d") + logger.info( + f"Adjusted end_date from {end_date} to {end_date_str} " + f"to ensure valid date range (start_date must be strictly before end_date)" + ) await task_logger.log_task_progress( log_entry, diff --git a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py index 91f81ac20..0d7a979be 100644 --- a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py @@ -116,6 +116,13 @@ async def index_luma_events( luma_client = LumaConnector(api_key=api_key) + # Handle 'undefined' string from frontend (treat as None) + # This prevents "time data 'undefined' does not match format" errors + if start_date == "undefined" or start_date == "": + start_date = None + if end_date == "undefined" or end_date == "": + end_date = None + # Calculate date range # For calendar connectors, allow future dates to index upcoming events if start_date is None or end_date is None: diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index e656c06d6..68a548409 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -259,7 +259,13 @@ export const ConnectorIndicator: FC = () => { editingConnector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" ? () => { startIndexing(editingConnector.id); - handleQuickIndexConnector(editingConnector.id, editingConnector.connector_type, stopIndexing); + handleQuickIndexConnector( + editingConnector.id, + editingConnector.connector_type, + stopIndexing, + startDate, + endDate + ); } : undefined } diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx index 8951336c5..d12264fbd 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx @@ -272,8 +272,7 @@ export const ConnectorEditView: FC = ({ Re-indexing runs in the background

- You can continue using SurfSense while we sync your data. Check the Active tab
- to see progress.
+ You can continue using SurfSense while we sync your data. Check your inbox for updates.
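For reference, the UTC-consistent start-date derivation this patch moves into the routes (see the search_source_connectors_routes.py hunk above) reads naturally as a small pure function. A sketch under the diff's own conventions (last_indexed_at, 365-day default); the helper name is ours, not the patch's:

    from datetime import UTC, datetime, timedelta


    def derive_indexing_from(last_indexed_at: datetime | None) -> str:
        """Start of the indexing window (YYYY-MM-DD), computed in UTC."""
        today_utc = datetime.now(UTC).replace(tzinfo=None).date()
        if last_indexed_at is None:
            # No previous sync: default to a 365-day lookback.
            return (today_utc - timedelta(days=365)).strftime("%Y-%m-%d")
        last_naive = (
            last_indexed_at.replace(tzinfo=None)
            if last_indexed_at.tzinfo
            else last_indexed_at
        )
        if last_naive.date() == today_utc:
            # Already indexed today: back off one day so nothing is missed.
            return (today_utc - timedelta(days=1)).strftime("%Y-%m-%d")
        return last_naive.strftime("%Y-%m-%d")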

diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx index 17995fdfa..019e6b37f 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx @@ -189,8 +189,7 @@ export const IndexingConfigurationView: FC = ({

Indexing runs in the background

- You can continue using SurfSense while we sync your data. Check the Active tab
- to see progress.
+ You can continue using SurfSense while we sync your data. Check your inbox for updates.
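This patch applies the same guard in three places (the Gmail connector, calculate_date_range in base.py, and the Luma indexer). A minimal sketch of that shared normalization, assuming only that the frontend may serialize missing dates as the literal string "undefined"; the helper name is illustrative:

    def normalize_date_param(value: str | None) -> str | None:
        """Coerce the frontend's literal "undefined" (or empty string) to None."""
        if value in ("undefined", ""):
            return None
        return value


    start_date = normalize_date_param("undefined")   # -> None
    end_date = normalize_date_param("2026-01-23")    # unchanged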

diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index 3e9e1d930..1bcbd4263 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -1400,9 +1400,15 @@ export const useConnectorDialog = () => { [editingConnector, searchSpaceId, deleteConnector, router, cameFromMCPList] ); - // Handle quick index (index without date picker, uses backend defaults) + // Handle quick index (index with selected date range, or backend defaults if none selected) const handleQuickIndexConnector = useCallback( - async (connectorId: number, connectorType?: string, stopIndexing?: (id: number) => void) => { + async ( + connectorId: number, + connectorType?: string, + stopIndexing?: (id: number) => void, + startDate?: Date, + endDate?: Date + ) => { if (!searchSpaceId) return; // Track quick index clicked event @@ -1411,10 +1417,16 @@ export const useConnectorDialog = () => { } try { + // Format dates if provided, otherwise pass undefined (backend will use defaults) + const startDateStr = startDate ? format(startDate, "yyyy-MM-dd") : undefined; + const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined; + await indexConnector({ connector_id: connectorId, queryParams: { search_space_id: searchSpaceId, + start_date: startDateStr, + end_date: endDateStr, }, }); toast.success("Indexing started", { From c48ba36fa47ccffb10f68a76231ab017321c5dbe Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 23 Jan 2026 23:36:14 +0530 Subject: [PATCH 17/28] feat: improve indexing logic and duplicate handling in connectors - Enhanced Google Calendar and Composio connector indexing to track and log duplicate content, preventing re-indexing of already processed events. - Implemented robust error handling during final commits to manage integrity errors gracefully, ensuring successful indexing despite potential duplicates. - Updated notification service to differentiate between actual errors and warnings for duplicate content, improving user feedback. - Refactored date handling to ensure valid date ranges and adjusted end dates when necessary for better indexing accuracy. 
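The commit-tolerance pattern this patch repeats in both indexers could be factored into one helper. A minimal sketch, assuming an async SQLAlchemy session and asyncpg-style error strings as matched in the diff; the helper name is ours, not the patch's:

    import logging

    logger = logging.getLogger(__name__)


    async def commit_tolerating_duplicates(session) -> bool:
        """Commit; return False if a duplicate-key race forced a rollback."""
        try:
            await session.commit()
            return True
        except Exception as e:
            msg = str(e).lower()
            if (
                "duplicate key value violates unique constraint" in msg
                or "uniqueviolationerror" in msg
            ):
                logger.warning("Duplicate content_hash during final commit: %s", e)
                await session.rollback()
                # Partial success: earlier batch commits are already durable.
                return False
            raise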
--- .../composio_google_calendar_connector.py | 59 +++++++++++++-- .../routes/search_source_connectors_routes.py | 72 +++++++++++++++---- .../app/services/notification_service.py | 28 ++++++-- .../google_calendar_indexer.py | 49 ++++++++++++- .../views/connector-edit-view.tsx | 14 ++-- .../hooks/use-connector-dialog.ts | 11 ++- 6 files changed, 198 insertions(+), 35 deletions(-) diff --git a/surfsense_backend/app/connectors/composio_google_calendar_connector.py b/surfsense_backend/app/connectors/composio_google_calendar_connector.py index ab8bde53c..3ac235848 100644 --- a/surfsense_backend/app/connectors/composio_google_calendar_connector.py +++ b/surfsense_backend/app/connectors/composio_google_calendar_connector.py @@ -18,7 +18,10 @@ from app.db import Document, DocumentType from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService -from app.tasks.connector_indexers.base import calculate_date_range +from app.tasks.connector_indexers.base import ( + calculate_date_range, + check_duplicate_document_by_hash, +) from app.utils.document_converters import ( create_document_chunks, generate_content_hash, @@ -256,6 +259,7 @@ async def index_composio_google_calendar( documents_indexed = 0 documents_skipped = 0 + duplicate_content_count = 0 # Track events skipped due to duplicate content_hash for event in events: try: @@ -349,7 +353,25 @@ async def index_composio_google_calendar( logger.info( f"Committing batch: {documents_indexed} Google Calendar events processed so far" ) - await session.commit() + await session.commit( ) + continue + + # Document doesn't exist by unique_identifier_hash + # Check if a document with the same content_hash exists (from standard connector) + with session.no_autoflush: + duplicate_by_content = await check_duplicate_document_by_hash( + session, content_hash + ) + + if duplicate_by_content: + # A document with the same content already exists (likely from standard connector) + logger.info( + f"Event {summary} already indexed by another connector " + f"(existing document ID: {duplicate_by_content.id}, " + f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content." + ) + duplicate_content_count += 1 + documents_skipped += 1 continue # Create new document @@ -429,10 +451,28 @@ async def index_composio_google_calendar( logger.info( f"Final commit: Total {documents_indexed} Google Calendar events processed" ) - await session.commit() - logger.info( - "Successfully committed all Composio Google Calendar document changes to database" - ) + try: + await session.commit() + logger.info( + "Successfully committed all Composio Google Calendar document changes to database" + ) + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if "duplicate key value violates unique constraint" in str(e).lower() or "uniqueviolationerror" in str(e).lower(): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same event was indexed by multiple connectors. " + f"Rolling back and continuing. 
Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if duplicates were found + warning_message = None + if duplicate_content_count > 0: + warning_message = f"{duplicate_content_count} skipped (duplicate)" await task_logger.log_task_success( log_entry, @@ -440,10 +480,15 @@ async def index_composio_google_calendar( { "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "duplicate_content_count": duplicate_content_count, }, ) - return documents_indexed, None + logger.info( + f"Composio Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped " + f"({duplicate_content_count} due to duplicate content from other connectors)" + ) + return documents_indexed, warning_message except Exception as e: logger.error( diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 928327d9a..3b98d7d7c 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -22,6 +22,8 @@ import logging from datetime import UTC, datetime, timedelta from typing import Any +import pytz +from dateutil.parser import isoparse from fastapi import APIRouter, Body, Depends, HTTPException, Query from pydantic import BaseModel, Field, ValidationError from sqlalchemy.exc import IntegrityError @@ -681,6 +683,22 @@ async def index_connector_content( ]: # Default to today if no end_date provided (users can manually select future dates) indexing_to = today_str if end_date is None else end_date + + # If start_date and end_date are the same, adjust end_date to be one day later + # to ensure valid date range (start_date must be strictly before end_date) + if indexing_from == indexing_to: + dt = isoparse(indexing_to) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=pytz.UTC) + else: + dt = dt.astimezone(pytz.UTC) + # Add one day to end_date to make it strictly after start_date + dt_end = dt + timedelta(days=1) + indexing_to = dt_end.strftime("%Y-%m-%d") + logger.info( + f"Adjusted end_date from {end_date} to {indexing_to} " + f"to ensure valid date range (start_date must be strictly before end_date)" + ) else: # For non-calendar connectors, cap at today indexing_to = end_date if end_date else today_str @@ -1231,20 +1249,48 @@ async def _run_indexing_with_notifications( else: # No new documents processed - check if this is an error or just no changes if error_or_warning: - # Actual failure - logger.error(f"Indexing failed: {error_or_warning}") - if notification: - # Refresh notification to ensure it's not stale after indexing function commits - await session.refresh(notification) - await NotificationService.connector_indexing.notify_indexing_completed( - session=session, - notification=notification, - indexed_count=0, - error_message=error_or_warning, + # Check if this is a duplicate warning (success case) or an actual error + # Handle both normal and Composio calendar connectors + error_or_warning_lower = str(error_or_warning).lower() if error_or_warning else "" + is_duplicate_warning = "skipped (duplicate)" in error_or_warning_lower + + if is_duplicate_warning: + # Duplicate warnings are success cases - sync worked, just found duplicates + logger.info( + f"Indexing completed successfully: {error_or_warning}" ) - await ( - session.commit() - ) # Commit to ensure Electric SQL syncs 
the notification update + # Still update timestamp so ElectricSQL syncs and clears "Syncing" UI + if update_timestamp_func: + await update_timestamp_func(session, connector_id) + await session.commit() # Commit timestamp update + if notification: + # Refresh notification to ensure it's not stale after timestamp update commit + await session.refresh(notification) + await NotificationService.connector_indexing.notify_indexing_completed( + session=session, + notification=notification, + indexed_count=0, + error_message=error_or_warning, # Pass as warning, not error + is_warning=True, # Flag to indicate this is a warning, not an error + ) + await ( + session.commit() + ) # Commit to ensure Electric SQL syncs the notification update + else: + # Actual failure + logger.error(f"Indexing failed: {error_or_warning}") + if notification: + # Refresh notification to ensure it's not stale after indexing function commits + await session.refresh(notification) + await NotificationService.connector_indexing.notify_indexing_completed( + session=session, + notification=notification, + indexed_count=0, + error_message=error_or_warning, + ) + await ( + session.commit() + ) # Commit to ensure Electric SQL syncs the notification update else: # Success - just no new documents to index (all skipped/unchanged) logger.info( diff --git a/surfsense_backend/app/services/notification_service.py b/surfsense_backend/app/services/notification_service.py index 836daeb9e..9fcf807e7 100644 --- a/surfsense_backend/app/services/notification_service.py +++ b/surfsense_backend/app/services/notification_service.py @@ -335,6 +335,7 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): notification: Notification, indexed_count: int, error_message: str | None = None, + is_warning: bool = False, ) -> Notification: """ Update notification when connector indexing completes. @@ -343,7 +344,8 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): session: Database session notification: Notification to update indexed_count: Total number of items indexed - error_message: Error message if indexing failed (optional) + error_message: Error message if indexing failed, or warning message (optional) + is_warning: If True, treat error_message as a warning (success case) rather than an error Returns: Updated notification @@ -352,10 +354,26 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): "connector_name", "Connector" ) + # If there's an error message but items were indexed, treat it as a warning (partial success) + # If is_warning is True, treat it as success even with 0 items (e.g., duplicates found) + # Otherwise, treat it as a failure if error_message: - title = f"Failed: {connector_name}" - message = f"Sync failed: {error_message}" - status = "failed" + if indexed_count > 0: + # Partial success with warnings (e.g., duplicate content from other connectors) + title = f"Ready: {connector_name}" + item_text = "item" if indexed_count == 1 else "items" + message = f"Now searchable! {indexed_count} {item_text} synced. Note: {error_message}" + status = "completed" + elif is_warning: + # Warning case (e.g., duplicates found) - treat as success + title = f"Ready: {connector_name}" + message = f"Sync completed. 
{error_message}" + status = "completed" + else: + # Complete failure + title = f"Failed: {connector_name}" + message = f"Sync failed: {error_message}" + status = "failed" else: title = f"Ready: {connector_name}" if indexed_count == 0: @@ -367,7 +385,7 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): metadata_updates = { "indexed_count": indexed_count, - "sync_stage": "completed" if not error_message else "failed", + "sync_stage": "completed" if (not error_message or is_warning or indexed_count > 0) else "failed", "error_message": error_message, } diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py index 7787560fa..5bc805549 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py @@ -23,6 +23,7 @@ from app.utils.document_converters import ( from .base import ( check_document_by_unique_identifier, + check_duplicate_document_by_hash, get_connector_by_id, get_current_timestamp, logger, @@ -289,6 +290,7 @@ async def index_google_calendar_events( documents_indexed = 0 documents_skipped = 0 skipped_events = [] + duplicate_content_count = 0 # Track events skipped due to duplicate content_hash for event in events: try: @@ -409,6 +411,27 @@ async def index_google_calendar_events( ) continue + # Document doesn't exist by unique_identifier_hash + # Check if a document with the same content_hash exists (from another connector) + with session.no_autoflush: + duplicate_by_content = await check_duplicate_document_by_hash( + session, content_hash + ) + + if duplicate_by_content: + # A document with the same content already exists (likely from Composio connector) + logger.info( + f"Event {event_summary} already indexed by another connector " + f"(existing document ID: {duplicate_by_content.id}, " + f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content." + ) + duplicate_content_count += 1 + documents_skipped += 1 + skipped_events.append( + f"{event_summary} (already indexed by another connector)" + ) + continue + # Document doesn't exist - create new one # Generate summary with metadata user_llm = await get_user_long_context_llm( @@ -501,7 +524,25 @@ async def index_google_calendar_events( logger.info( f"Final commit: Total {documents_indexed} Google Calendar events processed" ) - await session.commit() + try: + await session.commit() + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if "duplicate key value violates unique constraint" in str(e).lower() or "uniqueviolationerror" in str(e).lower(): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same event was indexed by multiple connectors. " + f"Rolling back and continuing. 
Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if duplicates were found + warning_message = None + if duplicate_content_count > 0: + warning_message = f"{duplicate_content_count} skipped (duplicate)" await task_logger.log_task_success( log_entry, @@ -510,14 +551,16 @@ async def index_google_calendar_events( "events_processed": total_processed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "duplicate_content_count": duplicate_content_count, "skipped_events_count": len(skipped_events), }, ) logger.info( - f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped" + f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped " + f"({duplicate_content_count} due to duplicate content from other connectors)" ) - return total_processed, None + return total_processed, warning_message except SQLAlchemyError as db_error: await session.rollback() diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx index d12264fbd..8f58db542 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx @@ -100,10 +100,14 @@ export const ConnectorEditView: FC = ({ // Reset local quick indexing state when indexing completes or fails useEffect(() => { - if (!isIndexing) { - setIsQuickIndexing(false); + if (!isIndexing && isQuickIndexing) { + // Small delay to ensure smooth transition + const timer = setTimeout(() => { + setIsQuickIndexing(false); + }, 100); + return () => clearTimeout(timer); } - }, [isIndexing]); + }, [isIndexing, isQuickIndexing]); const handleDisconnectClick = () => { setShowDisconnectConfirm(true); @@ -119,11 +123,11 @@ export const ConnectorEditView: FC = ({ }; const handleQuickIndex = useCallback(() => { - if (onQuickIndex) { + if (onQuickIndex && !isQuickIndexing && !isIndexing) { setIsQuickIndexing(true); onQuickIndex(); } - }, [onQuickIndex]); + }, [onQuickIndex, isQuickIndexing, isIndexing]); return (
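Condensing the notification triage above into a pure function makes the new completed/failed decision easier to audit. A sketch that mirrors the notify_indexing_completed branches; the function name and message wording are illustrative:

    def classify_completion(
        indexed_count: int, error_message: str | None, is_warning: bool
    ) -> tuple[str, str]:
        """Return (status, message) for the inbox notification."""
        if error_message:
            if indexed_count > 0:
                # Partial success, e.g. duplicates found alongside new items.
                return "completed", f"{indexed_count} items synced. Note: {error_message}"
            if is_warning:
                # Warning-only outcome, e.g. everything was a duplicate.
                return "completed", f"Sync completed. {error_message}"
            return "failed", f"Sync failed: {error_message}"
        if indexed_count == 0:
            return "completed", "Sync completed. No new items found."
        return "completed", f"{indexed_count} items synced."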
diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index 1bcbd4263..9a7f15b0c 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -1409,7 +1409,12 @@ export const useConnectorDialog = () => { startDate?: Date, endDate?: Date ) => { - if (!searchSpaceId) return; + if (!searchSpaceId) { + if (stopIndexing) { + stopIndexing(connectorId); + } + return; + } // Track quick index clicked event if (connectorType) { @@ -1437,6 +1442,8 @@ export const useConnectorDialog = () => { queryClient.invalidateQueries({ queryKey: cacheKeys.logs.summary(Number(searchSpaceId)), }); + // Note: Don't call stopIndexing here - let useIndexingConnectors hook + // detect when last_indexed_at changes via Electric SQL } catch (error) { console.error("Error indexing connector content:", error); toast.error(error instanceof Error ? error.message : "Failed to start indexing"); @@ -1446,7 +1453,7 @@ export const useConnectorDialog = () => { } } }, - [searchSpaceId, indexConnector] + [searchSpaceId, indexConnector, queryClient] ); // Handle going back from edit view From a7333853a283e040515188a481a8c8f935861ee6 Mon Sep 17 00:00:00 2001 From: Eric Lammertsma Date: Fri, 23 Jan 2026 13:14:23 -0500 Subject: [PATCH 18/28] Swapped Inbox and Documents in sidebar --- .../layout/providers/LayoutDataProvider.tsx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx index 52dc7196a..1761c74a1 100644 --- a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx +++ b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx @@ -161,12 +161,6 @@ export function LayoutDataProvider({ // Navigation items const navItems: NavItem[] = useMemo( () => [ - { - title: "Documents", - url: `/dashboard/${searchSpaceId}/documents`, - icon: SquareLibrary, - isActive: pathname?.includes("/documents"), - }, { title: "Inbox", url: "#inbox", // Special URL to indicate this is handled differently @@ -174,6 +168,12 @@ export function LayoutDataProvider({ isActive: isInboxSidebarOpen, badge: unreadCount > 0 ? (unreadCount > 99 ? 
"99+" : unreadCount) : undefined, }, + { + title: "Documents", + url: `/dashboard/${searchSpaceId}/documents`, + icon: SquareLibrary, + isActive: pathname?.includes("/documents"), + }, ], [searchSpaceId, pathname, isInboxSidebarOpen, unreadCount] ); From 417ff58fad6ba8221c1a561a00ec3f44a99a93cc Mon Sep 17 00:00:00 2001 From: Eric Lammertsma Date: Fri, 23 Jan 2026 13:27:14 -0500 Subject: [PATCH 19/28] Fixed a bug where new chats weren't auto selected when created This additionally fixes a bug where the New Chat button wasn't working properly after creating a new chat --- .../layout/providers/LayoutDataProvider.tsx | 41 ++++++++++++++++--- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx index 1761c74a1..37cb468ec 100644 --- a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx +++ b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx @@ -1,12 +1,13 @@ "use client"; import { useQuery, useQueryClient } from "@tanstack/react-query"; -import { useAtomValue } from "jotai"; +import { useAtomValue, useSetAtom } from "jotai"; import { Inbox, LogOut, SquareLibrary, Trash2 } from "lucide-react"; import { useParams, usePathname, useRouter } from "next/navigation"; import { useTranslations } from "next-intl"; import { useTheme } from "next-themes"; -import { useCallback, useMemo, useState } from "react"; +import { useCallback, useEffect, useMemo, useState } from "react"; +import { currentThreadAtom, resetCurrentThreadAtom } from "@/atoms/chat/current-thread.atom"; import { deleteSearchSpaceMutationAtom } from "@/atoms/search-spaces/search-space-mutation.atoms"; import { searchSpacesAtom } from "@/atoms/search-spaces/search-space-query.atoms"; import { currentUserAtom } from "@/atoms/user/user-query.atoms"; @@ -55,11 +56,16 @@ export function LayoutDataProvider({ const { data: user } = useAtomValue(currentUserAtom); const { data: searchSpacesData, refetch: refetchSearchSpaces } = useAtomValue(searchSpacesAtom); const { mutateAsync: deleteSearchSpace } = useAtomValue(deleteSearchSpaceMutationAtom); + const currentThreadState = useAtomValue(currentThreadAtom); + const resetCurrentThread = useSetAtom(resetCurrentThreadAtom); - // Current IDs from URL + // State for handling new chat navigation when router is out of sync + const [pendingNewChat, setPendingNewChat] = useState(false); + + // Current IDs from URL, with fallback to atom for replaceState updates const currentChatId = params?.chat_id ? Number(Array.isArray(params.chat_id) ? 
params.chat_id[0] : params.chat_id) - : null; + : currentThreadState.id; // Fetch current search space (for caching purposes) useQuery({ @@ -111,6 +117,17 @@ export function LayoutDataProvider({ const [isDeletingSearchSpace, setIsDeletingSearchSpace] = useState(false); const [isLeavingSearchSpace, setIsLeavingSearchSpace] = useState(false); + // Effect to complete new chat navigation after router syncs + // This runs when handleNewChat detected an out-of-sync state and triggered a sync + useEffect(() => { + if (pendingNewChat && params?.chat_id) { + // Router is now synced (chat_id is in params), complete navigation to new-chat + resetCurrentThread(); + router.push(`/dashboard/${searchSpaceId}/new-chat`); + setPendingNewChat(false); + } + }, [pendingNewChat, params?.chat_id, router, searchSpaceId, resetCurrentThread]); + const searchSpaces: SearchSpace[] = useMemo(() => { if (!searchSpacesData || !Array.isArray(searchSpacesData)) return []; return searchSpacesData.map((space) => ({ @@ -278,8 +295,20 @@ export function LayoutDataProvider({ ); const handleNewChat = useCallback(() => { - router.push(`/dashboard/${searchSpaceId}/new-chat`); - }, [router, searchSpaceId]); + // Check if router is out of sync (thread created via replaceState but params don't have chat_id) + const isOutOfSync = currentThreadState.id !== null && !params?.chat_id; + + if (isOutOfSync) { + // First sync Next.js router by navigating to the current chat's actual URL + // This updates the router's internal state to match the browser URL + router.replace(`/dashboard/${searchSpaceId}/new-chat/${currentThreadState.id}`); + // Set flag to trigger navigation to new-chat after params update + setPendingNewChat(true); + } else { + // Normal navigation - router is in sync + router.push(`/dashboard/${searchSpaceId}/new-chat`); + } + }, [router, searchSpaceId, currentThreadState.id, params?.chat_id]); const handleChatSelect = useCallback( (chat: ChatItem) => { From 6d14b49d3f4fb39994be6ba96bc93af3f1031831 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Sat, 24 Jan 2026 01:20:51 +0530 Subject: [PATCH 20/28] feat: enhance indexing state management and inbox count formatting - Improved indexing state management by refining the logic for handling notifications, ensuring accurate updates for in-progress, completed, and failed states. - Introduced a new utility function to format inbox counts, displaying numbers up to 999 and using "k+" for larger counts, enhancing user interface clarity. - Updated sidebar components to utilize the new inbox count formatting, improving the overall user experience. --- .../hooks/use-indexing-connectors.ts | 87 ++++++++++--------- .../layout/providers/LayoutDataProvider.tsx | 13 ++- .../layout/ui/sidebar/InboxSidebar.tsx | 15 +++- .../layout/ui/sidebar/NavSection.tsx | 4 +- 4 files changed, 75 insertions(+), 44 deletions(-) diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts index e82a8eb29..289da475d 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts @@ -10,8 +10,9 @@ import { isConnectorIndexingMetadata } from "@/contracts/types/inbox.types"; * * This provides a better UX than polling by: * 1. Setting indexing state immediately when user triggers indexing (optimistic) - * 2. 
Clearing indexing state when Electric SQL detects last_indexed_at changed - * 3. Clearing indexing state when a failed notification is detected + * 2. Detecting in_progress notifications from Electric SQL to restore state after remounts + * 3. Clearing indexing state when notifications become completed or failed + * 4. Clearing indexing state when Electric SQL detects last_indexed_at changed * * The actual `last_indexed_at` value comes from Electric SQL/PGlite, not local state. */ @@ -28,65 +29,73 @@ export function useIndexingConnectors( // Detect when last_indexed_at changes (indexing completed) via Electric SQL useEffect(() => { const previousValues = previousLastIndexedAtRef.current; - const newIndexingIds = new Set(indexingConnectorIds); - let hasChanges = false; for (const connector of connectors) { const previousValue = previousValues.get(connector.id); const currentValue = connector.last_indexed_at; - // If last_indexed_at changed and connector was in indexing state, clear it + // If last_indexed_at changed, clear it from indexing state if ( previousValue !== undefined && // We've seen this connector before - previousValue !== currentValue && // Value changed - indexingConnectorIds.has(connector.id) // It was marked as indexing + previousValue !== currentValue // Value changed ) { - newIndexingIds.delete(connector.id); - hasChanges = true; + // Use functional update to access current state + setIndexingConnectorIds((prev) => { + if (prev.has(connector.id)) { + const next = new Set(prev); + next.delete(connector.id); + return next; + } + return prev; + }); } // Update previous value tracking previousValues.set(connector.id, currentValue); } + }, [connectors]); - if (hasChanges) { - setIndexingConnectorIds(newIndexingIds); - } - }, [connectors, indexingConnectorIds]); - - // Detect failed notifications and stop indexing state + // Detect notification status changes and update indexing state accordingly + // This restores spinner state after component remounts and handles all status transitions useEffect(() => { if (!inboxItems || inboxItems.length === 0) return; - const newIndexingIds = new Set(indexingConnectorIds); - let hasChanges = false; + setIndexingConnectorIds((prev) => { + const newIndexingIds = new Set(prev); + let hasChanges = false; - for (const item of inboxItems) { - // Only check connector_indexing notifications - if (item.type !== "connector_indexing") continue; + for (const item of inboxItems) { + // Only check connector_indexing notifications + if (item.type !== "connector_indexing") continue; - // Check if this notification indicates a failure - const metadata = isConnectorIndexingMetadata(item.metadata) - ? item.metadata - : null; - if (!metadata) continue; + const metadata = isConnectorIndexingMetadata(item.metadata) + ? 
item.metadata + : null; + if (!metadata) continue; - // Check if status is "failed" or if there's an error_message - const isFailed = - metadata.status === "failed" || - (metadata.error_message && metadata.error_message.trim().length > 0); - - // If failed and connector is in indexing state, clear it - if (isFailed && indexingConnectorIds.has(metadata.connector_id)) { - newIndexingIds.delete(metadata.connector_id); - hasChanges = true; + // If status is "in_progress", add connector to indexing set + if (metadata.status === "in_progress") { + if (!newIndexingIds.has(metadata.connector_id)) { + newIndexingIds.add(metadata.connector_id); + hasChanges = true; + } + } + // If status is "completed" or "failed", remove connector from indexing set + else if ( + metadata.status === "completed" || + metadata.status === "failed" || + (metadata.error_message && metadata.error_message.trim().length > 0) + ) { + if (newIndexingIds.has(metadata.connector_id)) { + newIndexingIds.delete(metadata.connector_id); + hasChanges = true; + } + } } - } - if (hasChanges) { - setIndexingConnectorIds(newIndexingIds); - } - }, [inboxItems, indexingConnectorIds]); + return hasChanges ? newIndexingIds : prev; + }); + }, [inboxItems]); // Add a connector to the indexing set (called when indexing starts) const startIndexing = useCallback((connectorId: number) => { diff --git a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx index 52dc7196a..9e3f55c97 100644 --- a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx +++ b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx @@ -38,6 +38,17 @@ interface LayoutDataProviderProps { breadcrumb?: React.ReactNode; } +/** + * Format count for display: shows numbers up to 999, then "1k+", "2k+", etc. + */ +function formatInboxCount(count: number): string { + if (count <= 999) { + return count.toString(); + } + const thousands = Math.floor(count / 1000); + return `${thousands}k+`; +} + export function LayoutDataProvider({ searchSpaceId, children, @@ -172,7 +183,7 @@ export function LayoutDataProvider({ url: "#inbox", // Special URL to indicate this is handled differently icon: Inbox, isActive: isInboxSidebarOpen, - badge: unreadCount > 0 ? (unreadCount > 99 ? "99+" : unreadCount) : undefined, + badge: unreadCount > 0 ? formatInboxCount(unreadCount) : undefined, }, ], [searchSpaceId, pathname, isInboxSidebarOpen, unreadCount] diff --git a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx index bb06d6a56..e80c6e62d 100644 --- a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx @@ -70,6 +70,17 @@ function getInitials(name: string | null | undefined, email: string | null | und return "U"; } +/** + * Format count for display: shows numbers up to 999, then "1k+", "2k+", etc. 
+ */ +function formatInboxCount(count: number): string { + if (count <= 999) { + return count.toString(); + } + const thousands = Math.floor(count / 1000); + return `${thousands}k+`; +} + /** * Get display name for connector type */ @@ -732,7 +743,7 @@ export function InboxSidebar({ {t("mentions") || "Mentions"} - {unreadMentionsCount} + {formatInboxCount(unreadMentionsCount)} @@ -744,7 +755,7 @@ export function InboxSidebar({ {t("status") || "Status"} - {unreadStatusCount} + {formatInboxCount(unreadStatusCount)} diff --git a/surfsense_web/components/layout/ui/sidebar/NavSection.tsx b/surfsense_web/components/layout/ui/sidebar/NavSection.tsx index d2d926de8..742a27bbc 100644 --- a/surfsense_web/components/layout/ui/sidebar/NavSection.tsx +++ b/surfsense_web/components/layout/ui/sidebar/NavSection.tsx @@ -39,7 +39,7 @@ export function NavSection({ items, onItemClick, isCollapsed = false }: NavSecti > {item.badge && ( - + {item.badge} )} @@ -70,7 +70,7 @@ export function NavSection({ items, onItemClick, isCollapsed = false }: NavSecti {item.title} {item.badge && ( - + {item.badge} )} From f4b1192a063e71437bb24340342fcee2a69f6a1f Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Sat, 24 Jan 2026 03:51:57 +0530 Subject: [PATCH 21/28] feat: refine indexing success case handling and notification messaging - Enhanced the logic for determining success cases during indexing by distinguishing between duplicate warnings and empty results. - Updated notification messages to provide clearer feedback for empty results, improving user understanding of indexing outcomes. - Ensured that notifications reflect accurate statuses, maintaining consistency in user feedback during the indexing process. --- .../app/routes/search_source_connectors_routes.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 3b98d7d7c..487a689dc 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -1249,13 +1249,15 @@ async def _run_indexing_with_notifications( else: # No new documents processed - check if this is an error or just no changes if error_or_warning: - # Check if this is a duplicate warning (success case) or an actual error + # Check if this is a duplicate warning or empty result (success cases) or an actual error # Handle both normal and Composio calendar connectors error_or_warning_lower = str(error_or_warning).lower() if error_or_warning else "" is_duplicate_warning = "skipped (duplicate)" in error_or_warning_lower + # "No X found" messages are success cases - sync worked, just found nothing in date range + is_empty_result = ("no " in error_or_warning_lower and "found" in error_or_warning_lower) - if is_duplicate_warning: - # Duplicate warnings are success cases - sync worked, just found duplicates + if is_duplicate_warning or is_empty_result: + # These are success cases - sync worked, just found nothing new logger.info( f"Indexing completed successfully: {error_or_warning}" ) @@ -1266,11 +1268,13 @@ async def _run_indexing_with_notifications( if notification: # Refresh notification to ensure it's not stale after timestamp update commit await session.refresh(notification) + # For empty results, use a cleaner message + notification_message = "No new items found in date range" if is_empty_result else 
error_or_warning
                await NotificationService.connector_indexing.notify_indexing_completed(
                    session=session,
                    notification=notification,
                    indexed_count=0,
-                    error_message=error_or_warning,  # Pass as warning, not error
+                    error_message=notification_message,  # Pass as warning, not error
                    is_warning=True,  # Flag to indicate this is a warning, not an error
                )
                await (

From 5cf6fb15ed9c0f875c584ac4af216d279ae9eb36 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sat, 24 Jan 2026 03:59:17 +0530
Subject: [PATCH 22/28] fix: improve error logging for indexing tasks across
 multiple connectors

- Updated error handling in the indexing functions for BookStack, Confluence, Google Calendar, Jira, Linear, and Luma connectors to log specific error messages when failures occur.
- Enhanced logging for cases where no pages or events are found, providing clearer informational messages instead of treating them as critical errors.
- Ensured consistent error reporting across all connector indexers, improving debugging and user feedback during indexing operations.

---
 .../app/tasks/connector_indexers/bookstack_indexer.py       | 4 ++--
 .../app/tasks/connector_indexers/confluence_indexer.py      | 4 ++--
 .../app/tasks/connector_indexers/google_calendar_indexer.py | 4 ++--
 .../app/tasks/connector_indexers/jira_indexer.py            | 4 ++--
 .../app/tasks/connector_indexers/linear_indexer.py          | 4 ++--
 .../app/tasks/connector_indexers/luma_indexer.py            | 4 ++--
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py
index 2793f78db..a1067255d 100644
--- a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py
@@ -136,10 +136,9 @@ async def index_bookstack_pages(
         )

         if error:
-            logger.error(f"Failed to get BookStack pages: {error}")
-
             # Don't treat "No pages found" as an error that should stop indexing
             if "No pages found" in error:
+                logger.info(f"No BookStack pages found: {error}")
                 logger.info(
                     "No pages found is not a critical error, continuing with update"
                 )
@@ -159,6 +158,7 @@
                 )
                 return 0, None
             else:
+                logger.error(f"Failed to get BookStack pages: {error}")
                 await task_logger.log_task_failure(
                     log_entry,
                     f"Failed to get BookStack pages: {error}",
diff --git a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
index 7289b0ccd..ddbefafb9 100644
--- a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
@@ -120,10 +120,9 @@ async def index_confluence_pages(
         )

         if error:
-            logger.error(f"Failed to get Confluence pages: {error}")
-
             # Don't treat "No pages found" as an error that should stop indexing
             if "No pages found" in error:
+                logger.info(f"No Confluence pages found: {error}")
                 logger.info(
                     "No pages found is not a critical error, continuing with update"
                 )
@@ -147,6 +146,7 @@
                 await confluence_client.close()
                 return 0, None
             else:
+                logger.error(f"Failed to get Confluence pages: {error}")
                 await task_logger.log_task_failure(
                     log_entry,
                     f"Failed to get Confluence pages: {error}",
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py index 5bc805549..ef1f821d2 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py @@ -243,10 +243,9 @@ async def index_google_calendar_events( ) if error: - logger.error(f"Failed to get Google Calendar events: {error}") - # Don't treat "No events found" as an error that should stop indexing if "No events found" in error: + logger.info(f"No Google Calendar events found: {error}") logger.info( "No events found is not a critical error, continuing with update" ) @@ -266,6 +265,7 @@ async def index_google_calendar_events( ) return 0, None else: + logger.error(f"Failed to get Google Calendar events: {error}") # Check if this is an authentication error that requires re-authentication error_message = error error_type = "APIError" diff --git a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py index fdbeb93b0..4851a6466 100644 --- a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py @@ -126,10 +126,9 @@ async def index_jira_issues( ) if error: - logger.error(f"Failed to get Jira issues: {error}") - # Don't treat "No issues found" as an error that should stop indexing if "No issues found" in error: + logger.info(f"No Jira issues found: {error}") logger.info( "No issues found is not a critical error, continuing with update" ) @@ -149,6 +148,7 @@ async def index_jira_issues( ) return 0, None else: + logger.error(f"Failed to get Jira issues: {error}") await task_logger.log_task_failure( log_entry, f"Failed to get Jira issues: {error}", diff --git a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py index f1bfd42e8..7d8e0c30e 100644 --- a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py @@ -145,10 +145,9 @@ async def index_linear_issues( ) if error: - logger.error(f"Failed to get Linear issues: {error}") - # Don't treat "No issues found" as an error that should stop indexing if "No issues found" in error: + logger.info(f"No Linear issues found: {error}") logger.info( "No issues found is not a critical error, continuing with update" ) @@ -162,6 +161,7 @@ async def index_linear_issues( ) return 0, None else: + logger.error(f"Failed to get Linear issues: {error}") return 0, f"Failed to get Linear issues: {error}" logger.info(f"Retrieved {len(issues)} issues from Linear API") diff --git a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py index 0d7a979be..ead259a44 100644 --- a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py @@ -179,10 +179,9 @@ async def index_luma_events( ) if error: - logger.error(f"Failed to get Luma events: {error}") - # Don't treat "No events found" as an error that should stop indexing if "No events found" in error or "no events" in error.lower(): + logger.info(f"No Luma events found: {error}") logger.info( "No events found is not a critical error, continuing with update" ) @@ -202,6 +201,7 @@ async def index_luma_events( ) return 0, None else: + logger.error(f"Failed to get Luma events: {error}") await 
task_logger.log_task_failure( log_entry, f"Failed to get Luma events: {error}", From 97d7207bd4e76a5c76b1d6ed88a0784ea76f0445 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Sat, 24 Jan 2026 04:33:10 +0530 Subject: [PATCH 23/28] fix: update Google Drive indexer to use SQLAlchemy casting for metadata queries - Modified the Google Drive indexer to use SQLAlchemy's cast function for querying document metadata, ensuring proper type handling for file IDs. - Improved the consistency of metadata queries across the indexing functions, enhancing reliability in document retrieval and processing. --- .../app/tasks/connector_indexers/google_drive_indexer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index 48282a1af..af180c36b 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -578,7 +578,7 @@ async def _check_rename_only_update( - (True, message): Only filename changed, document was updated - (False, None): Content changed or new file, needs full processing """ - from sqlalchemy import select + from sqlalchemy import cast, select, String from sqlalchemy.orm.attributes import flag_modified from app.db import Document @@ -603,7 +603,7 @@ async def _check_rename_only_update( select(Document).where( Document.search_space_id == search_space_id, Document.document_type == DocumentType.GOOGLE_DRIVE_FILE, - Document.document_metadata["google_drive_file_id"].astext == file_id, + cast(Document.document_metadata["google_drive_file_id"], String) == file_id, ) ) existing_document = result.scalar_one_or_none() @@ -755,7 +755,7 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id: Handles both new (file_id-based) and legacy (filename-based) hash schemes. 
""" - from sqlalchemy import select + from sqlalchemy import cast, select, String from app.db import Document @@ -774,7 +774,7 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id: select(Document).where( Document.search_space_id == search_space_id, Document.document_type == DocumentType.GOOGLE_DRIVE_FILE, - Document.document_metadata["google_drive_file_id"].astext == file_id, + cast(Document.document_metadata["google_drive_file_id"], String) == file_id, ) ) existing_document = result.scalar_one_or_none() From a5103da3d74fded873e311108b601d8b36740fce Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Sat, 24 Jan 2026 04:36:34 +0530 Subject: [PATCH 24/28] chore: ran linting --- .../connectors/composio_gmail_connector.py | 1 - .../composio_google_calendar_connector.py | 14 +- .../composio_google_drive_connector.py | 83 +++++----- .../connectors/google_calendar_connector.py | 17 ++- .../app/connectors/google_gmail_connector.py | 17 ++- .../app/routes/composio_routes.py | 4 +- .../routes/search_source_connectors_routes.py | 31 ++-- .../app/services/composio_service.py | 142 ++++++++++-------- .../app/services/notification_service.py | 4 +- .../google_calendar_indexer.py | 21 ++- .../google_drive_indexer.py | 10 +- .../google_gmail_indexer.py | 13 +- .../assistant-ui/connector-popup.tsx | 5 +- .../components/composio-calendar-config.tsx | 1 - .../components/composio-drive-config.tsx | 24 +-- .../components/composio-gmail-config.tsx | 1 - .../components/google-drive-config.tsx | 10 +- .../views/connector-edit-view.tsx | 3 +- .../views/indexing-configuration-view.tsx | 17 ++- .../hooks/use-connector-dialog.ts | 18 +-- .../hooks/use-indexing-connectors.ts | 4 +- 21 files changed, 259 insertions(+), 181 deletions(-) diff --git a/surfsense_backend/app/connectors/composio_gmail_connector.py b/surfsense_backend/app/connectors/composio_gmail_connector.py index 5a9645a66..953e2e8fc 100644 --- a/surfsense_backend/app/connectors/composio_gmail_connector.py +++ b/surfsense_backend/app/connectors/composio_gmail_connector.py @@ -611,4 +611,3 @@ async def index_composio_gmail( except Exception as e: logger.error(f"Failed to index Gmail via Composio: {e!s}", exc_info=True) return 0, f"Failed to index Gmail via Composio: {e!s}" - diff --git a/surfsense_backend/app/connectors/composio_google_calendar_connector.py b/surfsense_backend/app/connectors/composio_google_calendar_connector.py index 3ac235848..ec5b22b7f 100644 --- a/surfsense_backend/app/connectors/composio_google_calendar_connector.py +++ b/surfsense_backend/app/connectors/composio_google_calendar_connector.py @@ -259,7 +259,9 @@ async def index_composio_google_calendar( documents_indexed = 0 documents_skipped = 0 - duplicate_content_count = 0 # Track events skipped due to duplicate content_hash + duplicate_content_count = ( + 0 # Track events skipped due to duplicate content_hash + ) for event in events: try: @@ -353,7 +355,7 @@ async def index_composio_google_calendar( logger.info( f"Committing batch: {documents_indexed} Google Calendar events processed so far" ) - await session.commit( ) + await session.commit() continue # Document doesn't exist by unique_identifier_hash @@ -362,7 +364,7 @@ async def index_composio_google_calendar( duplicate_by_content = await check_duplicate_document_by_hash( session, content_hash ) - + if duplicate_by_content: # A document with the same content already exists (likely from standard connector) logger.info( @@ -458,7 +460,10 @@ async def 
index_composio_google_calendar( ) except Exception as e: # Handle any remaining integrity errors gracefully (race conditions, etc.) - if "duplicate key value violates unique constraint" in str(e).lower() or "uniqueviolationerror" in str(e).lower(): + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): logger.warning( f"Duplicate content_hash detected during final commit. " f"This may occur if the same event was indexed by multiple connectors. " @@ -495,4 +500,3 @@ async def index_composio_google_calendar( f"Failed to index Google Calendar via Composio: {e!s}", exc_info=True ) return 0, f"Failed to index Google Calendar via Composio: {e!s}" - diff --git a/surfsense_backend/app/connectors/composio_google_drive_connector.py b/surfsense_backend/app/connectors/composio_google_drive_connector.py index e19436611..e3b988676 100644 --- a/surfsense_backend/app/connectors/composio_google_drive_connector.py +++ b/surfsense_backend/app/connectors/composio_google_drive_connector.py @@ -453,8 +453,8 @@ async def check_document_by_unique_identifier( session: AsyncSession, unique_identifier_hash: str ) -> Document | None: """Check if a document with the given unique identifier hash already exists.""" - from sqlalchemy.orm import selectinload from sqlalchemy.future import select + from sqlalchemy.orm import selectinload existing_doc_result = await session.execute( select(Document) @@ -517,14 +517,20 @@ async def index_composio_google_drive( # Route to delta sync or full scan if use_delta_sync: - logger.info(f"Using delta sync for Composio Google Drive connector {connector_id}") + logger.info( + f"Using delta sync for Composio Google Drive connector {connector_id}" + ) await task_logger.log_task_progress( log_entry, f"Starting delta sync for Google Drive via Composio (connector {connector_id})", {"stage": "delta_sync", "token": stored_page_token[:20] + "..."}, ) - documents_indexed, documents_skipped, processing_errors = await _index_composio_drive_delta_sync( + ( + documents_indexed, + documents_skipped, + processing_errors, + ) = await _index_composio_drive_delta_sync( session=session, composio_connector=composio_connector, connector_id=connector_id, @@ -536,7 +542,9 @@ async def index_composio_google_drive( log_entry=log_entry, ) else: - logger.info(f"Using full scan for Composio Google Drive connector {connector_id} (first sync or no token)") + logger.info( + f"Using full scan for Composio Google Drive connector {connector_id} (first sync or no token)" + ) await task_logger.log_task_progress( log_entry, f"Fetching Google Drive files via Composio for connector {connector_id}", @@ -547,7 +555,11 @@ async def index_composio_google_drive( }, ) - documents_indexed, documents_skipped, processing_errors = await _index_composio_drive_full_scan( + ( + documents_indexed, + documents_skipped, + processing_errors, + ) = await _index_composio_drive_full_scan( session=session, composio_connector=composio_connector, connector_id=connector_id, @@ -580,9 +592,13 @@ async def index_composio_google_drive( await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit - logger.info(f"Final commit: Total {documents_indexed} Google Drive files processed") + logger.info( + f"Final commit: Total {documents_indexed} Google Drive files processed" + ) await session.commit() - logger.info("Successfully committed all Composio Google Drive document changes to database") + logger.info( + "Successfully committed all Composio Google 
Drive document changes to database" + ) # Handle processing errors error_message = None @@ -731,7 +747,9 @@ async def _index_composio_drive_delta_sync( processing_errors.append(error_msg) documents_skipped += 1 - logger.info(f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped") + logger.info( + f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped" + ) return documents_indexed, documents_skipped, processing_errors @@ -858,20 +876,18 @@ async def _index_composio_drive_full_scan( logger.info("No Google Drive files found") return 0, 0, [] - logger.info(f"Found {len(all_files)} Google Drive files to index via Composio (full scan)") + logger.info( + f"Found {len(all_files)} Google Drive files to index via Composio (full scan)" + ) for file_info in all_files: try: # Handle both standard Google API and potential Composio variations file_id = file_info.get("id", "") or file_info.get("fileId", "") file_name = ( - file_info.get("name", "") - or file_info.get("fileName", "") - or "Untitled" - ) - mime_type = file_info.get("mimeType", "") or file_info.get( - "mime_type", "" + file_info.get("name", "") or file_info.get("fileName", "") or "Untitled" ) + mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "") if not file_id: documents_skipped += 1 @@ -901,7 +917,9 @@ async def _index_composio_drive_full_scan( # Batch commit every 10 documents if documents_indexed > 0 and documents_indexed % 10 == 0: - logger.info(f"Committing batch: {documents_indexed} Google Drive files processed so far") + logger.info( + f"Committing batch: {documents_indexed} Google Drive files processed so far" + ) await session.commit() except Exception as e: @@ -910,7 +928,9 @@ async def _index_composio_drive_full_scan( processing_errors.append(error_msg) documents_skipped += 1 - logger.info(f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped") + logger.info( + f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped" + ) return documents_indexed, documents_skipped, processing_errors @@ -948,9 +968,7 @@ async def _process_single_drive_file( content, content_error = await composio_connector.get_drive_file_content(file_id) if content_error or not content: - logger.warning( - f"Could not get content for file {file_name}: {content_error}" - ) + logger.warning(f"Could not get content for file {file_name}: {content_error}") # Use metadata as content fallback markdown_content = f"# {file_name}\n\n" markdown_content += f"**File ID:** {file_id}\n" @@ -985,9 +1003,7 @@ async def _process_single_drive_file( return 0, 1, processing_errors # Skipped # Update existing document - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) + user_llm = await get_user_long_context_llm(session, user_id, search_space_id) if user_llm: document_metadata = { @@ -1003,12 +1019,8 @@ async def _process_single_drive_file( markdown_content, user_llm, document_metadata ) else: - summary_content = ( - f"Google Drive File: {file_name}\n\nType: {mime_type}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) + summary_content = f"Google Drive File: {file_name}\n\nType: {mime_type}" + summary_embedding = config.embedding_model_instance.embed(summary_content) chunks = await create_document_chunks(markdown_content) @@ -1030,9 +1042,7 @@ async def _process_single_drive_file( return 1, 0, processing_errors # Indexed # Create new document - user_llm = await get_user_long_context_llm( - 
session, user_id, search_space_id - ) + user_llm = await get_user_long_context_llm(session, user_id, search_space_id) if user_llm: document_metadata = { @@ -1048,12 +1058,8 @@ async def _process_single_drive_file( markdown_content, user_llm, document_metadata ) else: - summary_content = ( - f"Google Drive File: {file_name}\n\nType: {mime_type}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) + summary_content = f"Google Drive File: {file_name}\n\nType: {mime_type}" + summary_embedding = config.embedding_model_instance.embed(summary_content) chunks = await create_document_chunks(markdown_content) @@ -1159,4 +1165,3 @@ async def _fetch_folder_files_recursively( except Exception as e: logger.error(f"Error in recursive folder fetch: {e!s}") return all_files - diff --git a/surfsense_backend/app/connectors/google_calendar_connector.py b/surfsense_backend/app/connectors/google_calendar_connector.py index ac60b02a8..d8160cf25 100644 --- a/surfsense_backend/app/connectors/google_calendar_connector.py +++ b/surfsense_backend/app/connectors/google_calendar_connector.py @@ -144,7 +144,10 @@ class GoogleCalendarConnector: except Exception as e: error_str = str(e) # Check if this is an invalid_grant error (token expired/revoked) - if "invalid_grant" in error_str.lower() or "token has been expired or revoked" in error_str.lower(): + if ( + "invalid_grant" in error_str.lower() + or "token has been expired or revoked" in error_str.lower() + ): raise Exception( "Google Calendar authentication failed. Please re-authenticate." ) from e @@ -173,7 +176,11 @@ class GoogleCalendarConnector: except Exception as e: error_str = str(e) # If the error already contains a user-friendly re-authentication message, preserve it - if "re-authenticate" in error_str.lower() or "expired or been revoked" in error_str.lower() or "authentication failed" in error_str.lower(): + if ( + "re-authenticate" in error_str.lower() + or "expired or been revoked" in error_str.lower() + or "authentication failed" in error_str.lower() + ): raise Exception(error_str) from e raise Exception(f"Failed to create Google Calendar service: {e!s}") from e @@ -283,7 +290,11 @@ class GoogleCalendarConnector: except Exception as e: error_str = str(e) # If the error already contains a user-friendly re-authentication message, preserve it - if "re-authenticate" in error_str.lower() or "expired or been revoked" in error_str.lower() or "authentication failed" in error_str.lower(): + if ( + "re-authenticate" in error_str.lower() + or "expired or been revoked" in error_str.lower() + or "authentication failed" in error_str.lower() + ): return [], error_str return [], f"Error fetching events: {e!s}" diff --git a/surfsense_backend/app/connectors/google_gmail_connector.py b/surfsense_backend/app/connectors/google_gmail_connector.py index c86a96413..7c7262bff 100644 --- a/surfsense_backend/app/connectors/google_gmail_connector.py +++ b/surfsense_backend/app/connectors/google_gmail_connector.py @@ -143,7 +143,10 @@ class GoogleGmailConnector: except Exception as e: error_str = str(e) # Check if this is an invalid_grant error (token expired/revoked) - if "invalid_grant" in error_str.lower() or "token has been expired or revoked" in error_str.lower(): + if ( + "invalid_grant" in error_str.lower() + or "token has been expired or revoked" in error_str.lower() + ): raise Exception( "Gmail authentication failed. Please re-authenticate." 
) from e @@ -172,7 +175,11 @@ class GoogleGmailConnector: except Exception as e: error_str = str(e) # If the error already contains a user-friendly re-authentication message, preserve it - if "re-authenticate" in error_str.lower() or "expired or been revoked" in error_str.lower() or "authentication failed" in error_str.lower(): + if ( + "re-authenticate" in error_str.lower() + or "expired or been revoked" in error_str.lower() + or "authentication failed" in error_str.lower() + ): raise Exception(error_str) from e raise Exception(f"Failed to create Gmail service: {e!s}") from e @@ -237,7 +244,11 @@ class GoogleGmailConnector: except Exception as e: error_str = str(e) # If the error already contains a user-friendly re-authentication message, preserve it - if "re-authenticate" in error_str.lower() or "expired or been revoked" in error_str.lower() or "authentication failed" in error_str.lower(): + if ( + "re-authenticate" in error_str.lower() + or "expired or been revoked" in error_str.lower() + or "authentication failed" in error_str.lower() + ): return [], error_str return [], f"Error fetching messages list: {e!s}" diff --git a/surfsense_backend/app/routes/composio_routes.py b/surfsense_backend/app/routes/composio_routes.py index 14ef9efcf..a28361132 100644 --- a/surfsense_backend/app/routes/composio_routes.py +++ b/surfsense_backend/app/routes/composio_routes.py @@ -350,10 +350,10 @@ async def composio_callback( count = await count_connectors_of_type( session, connector_type, space_id, user_id ) - + # Generate base name (e.g., "Gmail", "Google Drive") base_name = get_base_name_for_type(connector_type) - + # Format: "Gmail (Composio) 1", "Gmail (Composio) 2", etc. if count == 0: connector_name = f"{base_name} (Composio) 1" diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 487a689dc..191c6f954 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -662,16 +662,16 @@ async def index_connector_content( # Use UTC for "today" to match how last_indexed_at is stored today_utc = datetime.now(UTC).replace(tzinfo=None).date() last_indexed_date = last_indexed_naive.date() - + if last_indexed_date == today_utc: # If last indexed today, go back 1 day to ensure we don't miss anything indexing_from = (today_utc - timedelta(days=1)).strftime("%Y-%m-%d") else: indexing_from = last_indexed_naive.strftime("%Y-%m-%d") else: - indexing_from = (datetime.now(UTC).replace(tzinfo=None) - timedelta(days=365)).strftime( - "%Y-%m-%d" - ) + indexing_from = ( + datetime.now(UTC).replace(tzinfo=None) - timedelta(days=365) + ).strftime("%Y-%m-%d") else: indexing_from = start_date @@ -683,7 +683,7 @@ async def index_connector_content( ]: # Default to today if no end_date provided (users can manually select future dates) indexing_to = today_str if end_date is None else end_date - + # If start_date and end_date are the same, adjust end_date to be one day later # to ensure valid date range (start_date must be strictly before end_date) if indexing_from == indexing_to: @@ -1251,16 +1251,19 @@ async def _run_indexing_with_notifications( if error_or_warning: # Check if this is a duplicate warning or empty result (success cases) or an actual error # Handle both normal and Composio calendar connectors - error_or_warning_lower = str(error_or_warning).lower() if error_or_warning else "" + error_or_warning_lower = ( + 
str(error_or_warning).lower() if error_or_warning else "" + ) is_duplicate_warning = "skipped (duplicate)" in error_or_warning_lower # "No X found" messages are success cases - sync worked, just found nothing in date range - is_empty_result = ("no " in error_or_warning_lower and "found" in error_or_warning_lower) - + is_empty_result = ( + "no " in error_or_warning_lower + and "found" in error_or_warning_lower + ) + if is_duplicate_warning or is_empty_result: # These are success cases - sync worked, just found nothing new - logger.info( - f"Indexing completed successfully: {error_or_warning}" - ) + logger.info(f"Indexing completed successfully: {error_or_warning}") # Still update timestamp so ElectricSQL syncs and clears "Syncing" UI if update_timestamp_func: await update_timestamp_func(session, connector_id) @@ -1269,7 +1272,11 @@ async def _run_indexing_with_notifications( # Refresh notification to ensure it's not stale after timestamp update commit await session.refresh(notification) # For empty results, use a cleaner message - notification_message = "No new items found in date range" if is_empty_result else error_or_warning + notification_message = ( + "No new items found in date range" + if is_empty_result + else error_or_warning + ) await NotificationService.connector_indexing.notify_indexing_completed( session=session, notification=notification, diff --git a/surfsense_backend/app/services/composio_service.py b/surfsense_backend/app/services/composio_service.py index 3ea2d1bf2..ad7841a8b 100644 --- a/surfsense_backend/app/services/composio_service.py +++ b/surfsense_backend/app/services/composio_service.py @@ -81,7 +81,9 @@ class ComposioService: # Default download directory for files from Composio DEFAULT_DOWNLOAD_DIR = "/tmp/composio_downloads" - def __init__(self, api_key: str | None = None, file_download_dir: str | None = None): + def __init__( + self, api_key: str | None = None, file_download_dir: str | None = None + ): """ Initialize the Composio service. @@ -90,18 +92,20 @@ class ComposioService: file_download_dir: Directory for downloaded files. Defaults to /tmp/composio_downloads. """ import os - + self.api_key = api_key or config.COMPOSIO_API_KEY if not self.api_key: raise ValueError("COMPOSIO_API_KEY is required but not configured") - + # Set up download directory self.file_download_dir = file_download_dir or self.DEFAULT_DOWNLOAD_DIR os.makedirs(self.file_download_dir, exist_ok=True) - + # Initialize Composio client with download directory # Per docs: file_download_dir configures where files are downloaded - self.client = Composio(api_key=self.api_key, file_download_dir=self.file_download_dir) + self.client = Composio( + api_key=self.api_key, file_download_dir=self.file_download_dir + ) @staticmethod def is_enabled() -> bool: @@ -512,7 +516,7 @@ class ComposioService: Tuple of (file content bytes, error message). """ from pathlib import Path - + try: result = await self.execute_tool( connected_account_id=connected_account_id, @@ -532,35 +536,37 @@ class ComposioService: # Response structure: {data: {...}, error: ..., successful: ...} # The actual file info is nested inside data["data"] file_path = None - + if isinstance(data, dict): # Handle nested response structure: data contains {data, error, successful} # The actual file info is in data["data"] inner_data = data if "data" in data and isinstance(data["data"], dict): inner_data = data["data"] - logger.debug(f"Found nested data structure. 
Inner keys: {list(inner_data.keys())}") + logger.debug( + f"Found nested data structure. Inner keys: {list(inner_data.keys())}" + ) elif "successful" in data and "data" in data: # Standard Composio response wrapper inner_data = data["data"] if data["data"] else data - + # Try documented fields: file_path, downloaded_file_content, path, uri file_path = ( - inner_data.get("file_path") or - inner_data.get("downloaded_file_content") or - inner_data.get("path") or - inner_data.get("uri") + inner_data.get("file_path") + or inner_data.get("downloaded_file_content") + or inner_data.get("path") + or inner_data.get("uri") ) - + # Handle nested dict case where downloaded_file_content contains the path if isinstance(file_path, dict): file_path = ( - file_path.get("file_path") or - file_path.get("downloaded_file_content") or - file_path.get("path") or - file_path.get("uri") + file_path.get("file_path") + or file_path.get("downloaded_file_content") + or file_path.get("path") + or file_path.get("uri") ) - + # If still no path, check if inner_data itself has the nested structure if not file_path and isinstance(inner_data, dict): for key in ["downloaded_file_content", "file_path", "path", "uri"]: @@ -572,15 +578,17 @@ class ComposioService: elif isinstance(val, dict): # One more level of nesting file_path = ( - val.get("file_path") or - val.get("downloaded_file_content") or - val.get("path") or - val.get("uri") + val.get("file_path") + or val.get("downloaded_file_content") + or val.get("path") + or val.get("uri") ) if file_path: break - - logger.debug(f"Composio response keys: {list(data.keys())}, inner keys: {list(inner_data.keys()) if isinstance(inner_data, dict) else 'N/A'}, extracted path: {file_path}") + + logger.debug( + f"Composio response keys: {list(data.keys())}, inner keys: {list(inner_data.keys()) if isinstance(inner_data, dict) else 'N/A'}, extracted path: {file_path}" + ) elif isinstance(data, str): # Direct string response (could be path or content) file_path = data @@ -591,24 +599,31 @@ class ComposioService: # Read file from the path if file_path and isinstance(file_path, str): path_obj = Path(file_path) - + # Check if it's a valid file path (absolute or in .composio directory) - if path_obj.is_absolute() or '.composio' in str(path_obj): + if path_obj.is_absolute() or ".composio" in str(path_obj): try: if path_obj.exists(): content = path_obj.read_bytes() - logger.info(f"Successfully read {len(content)} bytes from Composio file: {file_path}") + logger.info( + f"Successfully read {len(content)} bytes from Composio file: {file_path}" + ) return content, None else: - logger.warning(f"File path from Composio does not exist: {file_path}") + logger.warning( + f"File path from Composio does not exist: {file_path}" + ) return None, f"File not found at path: {file_path}" except Exception as e: - logger.error(f"Failed to read file from Composio path {file_path}: {e!s}") + logger.error( + f"Failed to read file from Composio path {file_path}: {e!s}" + ) return None, f"Failed to read file: {e!s}" else: # Not a file path - might be base64 encoded content try: import base64 + content = base64.b64decode(file_path) return content, None except Exception: @@ -625,8 +640,11 @@ class ComposioService: f"Inner data keys: {list(inner_data.keys()) if isinstance(inner_data, dict) else type(inner_data).__name__}, " f"Full inner data: {inner_data}" ) - return None, f"No file path in Composio response. 
Keys: {list(data.keys())}, inner: {list(inner_data.keys()) if isinstance(inner_data, dict) else 'N/A'}" - + return ( + None, + f"No file path in Composio response. Keys: {list(data.keys())}, inner: {list(inner_data.keys()) if isinstance(inner_data, dict) else 'N/A'}", + ) + return None, f"Unexpected data type from Composio: {type(data).__name__}" except Exception as e: @@ -638,14 +656,14 @@ class ComposioService: ) -> tuple[str | None, str | None]: """ Get the starting page token for Google Drive change tracking. - + This token represents the current state and is used for future delta syncs. Per Composio docs: Use GOOGLEDRIVE_GET_CHANGES_START_PAGE_TOKEN to get initial token. - + Args: connected_account_id: Composio connected account ID. entity_id: The entity/user ID that owns the connected account. - + Returns: Tuple of (start_page_token, error message). """ @@ -656,27 +674,27 @@ class ComposioService: params={}, entity_id=entity_id, ) - + if not result.get("success"): return None, result.get("error", "Unknown error") - + data = result.get("data", {}) # Handle nested response: {data: {startPageToken: ...}, successful: ...} if isinstance(data, dict): inner_data = data.get("data", data) token = ( - inner_data.get("startPageToken") or - inner_data.get("start_page_token") or - data.get("startPageToken") or - data.get("start_page_token") + inner_data.get("startPageToken") + or inner_data.get("start_page_token") + or data.get("startPageToken") + or data.get("start_page_token") ) if token: logger.info(f"Got Drive start page token: {token}") return token, None - + logger.warning(f"Could not extract start page token from response: {data}") return None, "No start page token in response" - + except Exception as e: logger.error(f"Failed to get Drive start page token: {e!s}") return None, str(e) @@ -691,18 +709,18 @@ class ComposioService: ) -> tuple[list[dict[str, Any]], str | None, str | None]: """ List changes in Google Drive since the given page token. - + Per Composio docs: GOOGLEDRIVE_LIST_CHANGES tracks modifications to files/folders. If pageToken is not provided, it auto-fetches the current start page token. Response includes nextPageToken for pagination and newStartPageToken for future syncs. - + Args: connected_account_id: Composio connected account ID. entity_id: The entity/user ID that owns the connected account. page_token: Page token from previous sync (optional - will auto-fetch if not provided). page_size: Number of changes per page. include_removed: Whether to include removed items in the response. - + Returns: Tuple of (changes list, new_start_page_token, error message). 
""" @@ -713,42 +731,44 @@ class ComposioService: } if page_token: params["pageToken"] = page_token - + result = await self.execute_tool( connected_account_id=connected_account_id, tool_name="GOOGLEDRIVE_LIST_CHANGES", params=params, entity_id=entity_id, ) - + if not result.get("success"): return [], None, result.get("error", "Unknown error") - + data = result.get("data", {}) - + # Handle nested response structure changes = [] new_start_token = None - + if isinstance(data, dict): inner_data = data.get("data", data) changes = inner_data.get("changes", []) or data.get("changes", []) - + # Get the token for next sync # newStartPageToken is returned when all changes have been fetched # nextPageToken is for pagination within the current fetch new_start_token = ( - inner_data.get("newStartPageToken") or - inner_data.get("new_start_page_token") or - inner_data.get("nextPageToken") or - inner_data.get("next_page_token") or - data.get("newStartPageToken") or - data.get("nextPageToken") + inner_data.get("newStartPageToken") + or inner_data.get("new_start_page_token") + or inner_data.get("nextPageToken") + or inner_data.get("next_page_token") + or data.get("newStartPageToken") + or data.get("nextPageToken") ) - - logger.info(f"Got {len(changes)} Drive changes, new token: {new_start_token[:20] if new_start_token else 'None'}...") + + logger.info( + f"Got {len(changes)} Drive changes, new token: {new_start_token[:20] if new_start_token else 'None'}..." + ) return changes, new_start_token, None - + except Exception as e: logger.error(f"Failed to list Drive changes: {e!s}") return [], None, str(e) diff --git a/surfsense_backend/app/services/notification_service.py b/surfsense_backend/app/services/notification_service.py index 9fcf807e7..04f39d8ef 100644 --- a/surfsense_backend/app/services/notification_service.py +++ b/surfsense_backend/app/services/notification_service.py @@ -385,7 +385,9 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): metadata_updates = { "indexed_count": indexed_count, - "sync_stage": "completed" if (not error_message or is_warning or indexed_count > 0) else "failed", + "sync_stage": "completed" + if (not error_message or is_warning or indexed_count > 0) + else "failed", "error_message": error_message, } diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py index ef1f821d2..2365ff984 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py @@ -208,7 +208,7 @@ async def index_google_calendar_events( # Use provided dates (including future dates) start_date_str = start_date end_date_str = end_date - + # If start_date and end_date are the same, adjust end_date to be one day later # to ensure valid date range (start_date must be strictly before end_date) if start_date_str == end_date_str: @@ -269,10 +269,14 @@ async def index_google_calendar_events( # Check if this is an authentication error that requires re-authentication error_message = error error_type = "APIError" - if "re-authenticate" in error.lower() or "expired or been revoked" in error.lower() or "authentication failed" in error.lower(): + if ( + "re-authenticate" in error.lower() + or "expired or been revoked" in error.lower() + or "authentication failed" in error.lower() + ): error_message = "Google Calendar authentication failed. Please re-authenticate." 
error_type = "AuthenticationError" - + await task_logger.log_task_failure( log_entry, error_message, @@ -290,7 +294,9 @@ async def index_google_calendar_events( documents_indexed = 0 documents_skipped = 0 skipped_events = [] - duplicate_content_count = 0 # Track events skipped due to duplicate content_hash + duplicate_content_count = ( + 0 # Track events skipped due to duplicate content_hash + ) for event in events: try: @@ -417,7 +423,7 @@ async def index_google_calendar_events( duplicate_by_content = await check_duplicate_document_by_hash( session, content_hash ) - + if duplicate_by_content: # A document with the same content already exists (likely from Composio connector) logger.info( @@ -528,7 +534,10 @@ async def index_google_calendar_events( await session.commit() except Exception as e: # Handle any remaining integrity errors gracefully (race conditions, etc.) - if "duplicate key value violates unique constraint" in str(e).lower() or "uniqueviolationerror" in str(e).lower(): + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): logger.warning( f"Duplicate content_hash detected during final commit. " f"This may occur if the same event was indexed by multiple connectors. " diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index af180c36b..f50e149d3 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -578,7 +578,7 @@ async def _check_rename_only_update( - (True, message): Only filename changed, document was updated - (False, None): Content changed or new file, needs full processing """ - from sqlalchemy import cast, select, String + from sqlalchemy import String, cast, select from sqlalchemy.orm.attributes import flag_modified from app.db import Document @@ -603,7 +603,8 @@ async def _check_rename_only_update( select(Document).where( Document.search_space_id == search_space_id, Document.document_type == DocumentType.GOOGLE_DRIVE_FILE, - cast(Document.document_metadata["google_drive_file_id"], String) == file_id, + cast(Document.document_metadata["google_drive_file_id"], String) + == file_id, ) ) existing_document = result.scalar_one_or_none() @@ -755,7 +756,7 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id: Handles both new (file_id-based) and legacy (filename-based) hash schemes. 
""" - from sqlalchemy import cast, select, String + from sqlalchemy import String, cast, select from app.db import Document @@ -774,7 +775,8 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id: select(Document).where( Document.search_space_id == search_space_id, Document.document_type == DocumentType.GOOGLE_DRIVE_FILE, - cast(Document.document_metadata["google_drive_file_id"], String) == file_id, + cast(Document.document_metadata["google_drive_file_id"], String) + == file_id, ) ) existing_document = result.scalar_one_or_none() diff --git a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py index 6a3057437..08d2904d6 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py @@ -173,15 +173,16 @@ async def index_google_gmail_messages( # Check if this is an authentication error that requires re-authentication error_message = error error_type = "APIError" - if "re-authenticate" in error.lower() or "expired or been revoked" in error.lower() or "authentication failed" in error.lower(): + if ( + "re-authenticate" in error.lower() + or "expired or been revoked" in error.lower() + or "authentication failed" in error.lower() + ): error_message = "Gmail authentication failed. Please re-authenticate." error_type = "AuthenticationError" - + await task_logger.log_task_failure( - log_entry, - error_message, - error, - {"error_type": error_type} + log_entry, error_message, error, {"error_type": error_type} ) return 0, error_message diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index 68a548409..293d4a243 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -18,7 +18,10 @@ import { ConnectorDialogHeader } from "./connector-popup/components/connector-di import { ConnectorConnectView } from "./connector-popup/connector-configs/views/connector-connect-view"; import { ConnectorEditView } from "./connector-popup/connector-configs/views/connector-edit-view"; import { IndexingConfigurationView } from "./connector-popup/connector-configs/views/indexing-configuration-view"; -import { COMPOSIO_CONNECTORS, OAUTH_CONNECTORS } from "./connector-popup/constants/connector-constants"; +import { + COMPOSIO_CONNECTORS, + OAUTH_CONNECTORS, +} from "./connector-popup/constants/connector-constants"; import { useConnectorDialog } from "./connector-popup/hooks/use-connector-dialog"; import { useIndexingConnectors } from "./connector-popup/hooks/use-indexing-connectors"; import { ActiveConnectorsTab } from "./connector-popup/tabs/active-connectors-tab"; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx index ce5133a9d..6f282d892 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx @@ -12,4 +12,3 @@ interface ComposioCalendarConfigProps { export const ComposioCalendarConfig: FC = () => { return
; }; - diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx index 0ab0869ff..239125565 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx @@ -1,6 +1,14 @@ "use client"; -import { File, FileSpreadsheet, FileText, FolderClosed, Image, Presentation, X } from "lucide-react"; +import { + File, + FileSpreadsheet, + FileText, + FolderClosed, + Image, + Presentation, + X, +} from "lucide-react"; import type { FC } from "react"; import { useEffect, useState } from "react"; import { ComposioDriveFolderTree } from "@/components/connectors/composio-drive-folder-tree"; @@ -85,7 +93,10 @@ function getFileIconFromName(fileName: string, className: string = "size-3.5 shr return ; } -export const ComposioDriveConfig: FC = ({ connector, onConfigChange }) => { +export const ComposioDriveConfig: FC = ({ + connector, + onConfigChange, +}) => { const isIndexable = connector.config?.is_indexable as boolean; // Initialize with existing selected folders and files from connector config @@ -184,9 +195,7 @@ export const ComposioDriveConfig: FC = ({ connector, o ); } if (selectedFiles.length > 0) { - parts.push( - `${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}` - ); + parts.push(`${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}`); } return parts.length > 0 ? `(${parts.join(", ")})` : ""; })()} @@ -329,13 +338,10 @@ export const ComposioDriveConfig: FC = ({ connector, o - handleIndexingOptionChange("include_subfolders", checked) - } + onCheckedChange={(checked) => handleIndexingOptionChange("include_subfolders", checked)} />
); }; - diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx index 4664e3e64..494e1362f 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx @@ -12,4 +12,3 @@ interface ComposioGmailConfigProps { export const ComposioGmailConfig: FC = () => { return
; }; - diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx index b6cfb39ae..383f6ce0e 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx @@ -1,6 +1,14 @@ "use client"; -import { File, FileSpreadsheet, FileText, FolderClosed, Image, Presentation, X } from "lucide-react"; +import { + File, + FileSpreadsheet, + FileText, + FolderClosed, + Image, + Presentation, + X, +} from "lucide-react"; import type { FC } from "react"; import { useEffect, useState } from "react"; import { GoogleDriveFolderTree } from "@/components/connectors/google-drive-folder-tree"; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx index 8f58db542..5668d398e 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx @@ -276,7 +276,8 @@ export const ConnectorEditView: FC = ({ Re-indexing runs in the background

- You can continue using SurfSense while we sync your data. Check inbox for updates. + You can continue using SurfSense while we sync your data. Check inbox for + updates.

diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx index 019e6b37f..684f03252 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx @@ -170,13 +170,13 @@ export const IndexingConfigurationView: FC = ({ {/* Periodic sync - not shown for Google Drive (regular and Composio) */} {config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && ( - - )} + + )} )} @@ -189,7 +189,8 @@ export const IndexingConfigurationView: FC = ({

Indexing runs in the background

- You can continue using SurfSense while we sync your data. Check inbox for updates. + You can continue using SurfSense while we sync your data. Check inbox for + updates.

diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index 9a7f15b0c..639d0f7ed 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -328,11 +328,7 @@ export const useConnectorDialog = () => { return; } - if ( - params.success === "true" && - searchSpaceId && - params.modal === "connectors" - ) { + if (params.success === "true" && searchSpaceId && params.modal === "connectors") { refetchAllConnectors().then((result) => { if (!result.data) return; @@ -346,16 +342,12 @@ export const useConnectorDialog = () => { if (params.connectorId) { const connectorId = parseInt(params.connectorId, 10); newConnector = result.data.find((c: SearchSourceConnector) => c.id === connectorId); - + // If we found the connector, find the matching OAuth/Composio connector by type if (newConnector) { oauthConnector = - OAUTH_CONNECTORS.find( - (c) => c.connectorType === newConnector!.connector_type - ) || - COMPOSIO_CONNECTORS.find( - (c) => c.connectorType === newConnector!.connector_type - ); + OAUTH_CONNECTORS.find((c) => c.connectorType === newConnector!.connector_type) || + COMPOSIO_CONNECTORS.find((c) => c.connectorType === newConnector!.connector_type); } } @@ -364,7 +356,7 @@ export const useConnectorDialog = () => { oauthConnector = OAUTH_CONNECTORS.find((c) => c.id === params.connector) || COMPOSIO_CONNECTORS.find((c) => c.id === params.connector); - + if (oauthConnector) { newConnector = result.data.find( (c: SearchSourceConnector) => c.connector_type === oauthConnector!.connectorType diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts index 289da475d..19741e020 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts @@ -68,9 +68,7 @@ export function useIndexingConnectors( // Only check connector_indexing notifications if (item.type !== "connector_indexing") continue; - const metadata = isConnectorIndexingMetadata(item.metadata) - ? item.metadata - : null; + const metadata = isConnectorIndexingMetadata(item.metadata) ? item.metadata : null; if (!metadata) continue; // If status is "in_progress", add connector to indexing set From 3368a65b0c5ab714e7673128bdb4782d31734b63 Mon Sep 17 00:00:00 2001 From: Rohan Verma <122026167+MODSetter@users.noreply.github.com> Date: Sat, 24 Jan 2026 16:11:27 -0800 Subject: [PATCH 25/28] Change video link in README Updated video link in README. 
--- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 0c5f06029..4dd368c04 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,7 @@ SurfSense is a highly customizable AI research agent, connected to external sour # Video -https://github.com/user-attachments/assets/42a29ea1-d4d8-4213-9c69-972b5b806d58 - +https://github.com/user-attachments/assets/cc0c84d3-1f2f-4f7a-b519-2ecce22310b1 ## Podcast Sample From 20efc63f3003971a0db6c62c1c34cfdbf756cc3c Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Sat, 24 Jan 2026 17:42:44 -0800 Subject: [PATCH 26/28] feat: implement dynamic connector and document type discovery for knowledge base tool - Added functionality to dynamically discover available connectors and document types for the knowledge base tool, enhancing its flexibility and usability. - Introduced new mapping functions and updated existing search methods to accommodate Composio connectors, improving integration with external services. - Enhanced error handling and logging for connector discovery processes, ensuring better feedback during failures. --- .../app/agents/new_chat/chat_deepagent.py | 112 ++++++ .../app/agents/new_chat/tools/__init__.py | 4 +- .../agents/new_chat/tools/knowledge_base.py | 306 +++++++++++---- .../app/agents/new_chat/tools/registry.py | 5 + .../app/services/connector_service.py | 347 ++++++++++++++++++ 5 files changed, 708 insertions(+), 66 deletions(-) diff --git a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py index 5bc6ac2e2..53e1b14bd 100644 --- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py +++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py @@ -7,6 +7,7 @@ via NewLLMConfig. """ from collections.abc import Sequence +from typing import Any from deepagents import create_deep_agent from langchain_core.tools import BaseTool @@ -23,6 +24,90 @@ from app.agents.new_chat.system_prompt import ( from app.agents.new_chat.tools.registry import build_tools_async from app.services.connector_service import ConnectorService +# ============================================================================= +# Connector Type Mapping +# ============================================================================= + +# Maps SearchSourceConnectorType enum values to the searchable document/connector types +# used by the knowledge_base tool. Some connectors map to different document types. 
+_CONNECTOR_TYPE_TO_SEARCHABLE: dict[str, str] = { + # Direct mappings (connector type == searchable type) + "TAVILY_API": "TAVILY_API", + "SEARXNG_API": "SEARXNG_API", + "LINKUP_API": "LINKUP_API", + "BAIDU_SEARCH_API": "BAIDU_SEARCH_API", + "SLACK_CONNECTOR": "SLACK_CONNECTOR", + "TEAMS_CONNECTOR": "TEAMS_CONNECTOR", + "NOTION_CONNECTOR": "NOTION_CONNECTOR", + "GITHUB_CONNECTOR": "GITHUB_CONNECTOR", + "LINEAR_CONNECTOR": "LINEAR_CONNECTOR", + "DISCORD_CONNECTOR": "DISCORD_CONNECTOR", + "JIRA_CONNECTOR": "JIRA_CONNECTOR", + "CONFLUENCE_CONNECTOR": "CONFLUENCE_CONNECTOR", + "CLICKUP_CONNECTOR": "CLICKUP_CONNECTOR", + "GOOGLE_CALENDAR_CONNECTOR": "GOOGLE_CALENDAR_CONNECTOR", + "GOOGLE_GMAIL_CONNECTOR": "GOOGLE_GMAIL_CONNECTOR", + "GOOGLE_DRIVE_CONNECTOR": "GOOGLE_DRIVE_FILE", # Connector type differs from document type + "AIRTABLE_CONNECTOR": "AIRTABLE_CONNECTOR", + "LUMA_CONNECTOR": "LUMA_CONNECTOR", + "ELASTICSEARCH_CONNECTOR": "ELASTICSEARCH_CONNECTOR", + "WEBCRAWLER_CONNECTOR": "CRAWLED_URL", # Maps to document type + "BOOKSTACK_CONNECTOR": "BOOKSTACK_CONNECTOR", + "CIRCLEBACK_CONNECTOR": "CIRCLEBACK", # Connector type differs from document type + "OBSIDIAN_CONNECTOR": "OBSIDIAN_CONNECTOR", + # Composio connectors + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "COMPOSIO_GMAIL_CONNECTOR": "COMPOSIO_GMAIL_CONNECTOR", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", +} + +# Document types that don't come from SearchSourceConnector but should always be searchable +_ALWAYS_AVAILABLE_DOC_TYPES: list[str] = [ + "EXTENSION", # Browser extension data + "FILE", # Uploaded files + "NOTE", # User notes + "YOUTUBE_VIDEO", # YouTube videos +] + + +def _map_connectors_to_searchable_types( + connector_types: list[Any], +) -> list[str]: + """ + Map SearchSourceConnectorType enums to searchable document/connector types. + + This function: + 1. Converts connector type enums to their searchable counterparts + 2. Includes always-available document types (EXTENSION, FILE, NOTE, YOUTUBE_VIDEO) + 3. 
Deduplicates while preserving order + + Args: + connector_types: List of SearchSourceConnectorType enum values + + Returns: + List of searchable connector/document type strings + """ + result_set: set[str] = set() + result_list: list[str] = [] + + # Add always-available document types first + for doc_type in _ALWAYS_AVAILABLE_DOC_TYPES: + if doc_type not in result_set: + result_set.add(doc_type) + result_list.append(doc_type) + + # Map each connector type to its searchable equivalent + for ct in connector_types: + # Handle both enum and string types + ct_str = ct.value if hasattr(ct, "value") else str(ct) + searchable = _CONNECTOR_TYPE_TO_SEARCHABLE.get(ct_str) + if searchable and searchable not in result_set: + result_set.add(searchable) + result_list.append(searchable) + + return result_list + + # ============================================================================= # Deep Agent Factory # ============================================================================= @@ -116,6 +201,30 @@ async def create_surfsense_deep_agent( additional_tools=[my_custom_tool] ) """ + # Discover available connectors and document types for this search space + # This enables dynamic tool docstrings that inform the LLM about what's actually available + available_connectors: list[str] | None = None + available_document_types: list[str] | None = None + + try: + # Get enabled search source connectors for this search space + connector_types = await connector_service.get_available_connectors( + search_space_id + ) + if connector_types: + # Convert enum values to strings and also include mapped document types + available_connectors = _map_connectors_to_searchable_types(connector_types) + + # Get document types that have at least one document indexed + available_document_types = await connector_service.get_available_document_types( + search_space_id + ) + except Exception as e: + # Log but don't fail - fall back to all connectors if discovery fails + import logging + + logging.warning(f"Failed to discover available connectors/document types: {e}") + # Build dependencies dict for the tools registry dependencies = { "search_space_id": search_space_id, @@ -123,6 +232,9 @@ async def create_surfsense_deep_agent( "connector_service": connector_service, "firecrawl_api_key": firecrawl_api_key, "user_id": user_id, # Required for memory tools + # Dynamic connector/document type discovery for knowledge base tool + "available_connectors": available_connectors, + "available_document_types": available_document_types, } # Build tools using the async registry (includes MCP tools) diff --git a/surfsense_backend/app/agents/new_chat/tools/__init__.py b/surfsense_backend/app/agents/new_chat/tools/__init__.py index acbdbcb3a..9e1a4f19c 100644 --- a/surfsense_backend/app/agents/new_chat/tools/__init__.py +++ b/surfsense_backend/app/agents/new_chat/tools/__init__.py @@ -19,6 +19,7 @@ Available tools: # Tool factory exports (for direct use) from .display_image import create_display_image_tool from .knowledge_base import ( + CONNECTOR_DESCRIPTIONS, create_search_knowledge_base_tool, format_documents_for_context, search_knowledge_base_async, @@ -40,6 +41,8 @@ from .user_memory import create_recall_memory_tool, create_save_memory_tool __all__ = [ # Registry "BUILTIN_TOOLS", + # Knowledge base utilities + "CONNECTOR_DESCRIPTIONS", "ToolDefinition", "build_tools", # Tool factories @@ -51,7 +54,6 @@ __all__ = [ "create_scrape_webpage_tool", "create_search_knowledge_base_tool", "create_search_surfsense_docs_tool", - # Knowledge base 
utilities "format_documents_for_context", "get_all_tool_names", "get_default_enabled_tools", diff --git a/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py b/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py index 552019dda..a11e4ac38 100644 --- a/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py +++ b/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py @@ -12,7 +12,8 @@ import json from datetime import datetime from typing import Any -from langchain_core.tools import tool +from langchain_core.tools import StructuredTool +from pydantic import BaseModel, Field from sqlalchemy.ext.asyncio import AsyncSession from app.services.connector_service import ConnectorService @@ -22,6 +23,7 @@ from app.services.connector_service import ConnectorService # ============================================================================= # Canonical connector values used internally by ConnectorService +# Includes all document types and search source connectors _ALL_CONNECTORS: list[str] = [ "EXTENSION", "FILE", @@ -50,41 +52,117 @@ _ALL_CONNECTORS: list[str] = [ "CRAWLED_URL", "CIRCLEBACK", "OBSIDIAN_CONNECTOR", + # Composio connectors + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "COMPOSIO_GMAIL_CONNECTOR", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", ] +# Human-readable descriptions for each connector type +# Used for generating dynamic docstrings and informing the LLM +CONNECTOR_DESCRIPTIONS: dict[str, str] = { + "EXTENSION": "Web content saved via SurfSense browser extension (personal browsing history)", + "FILE": "User-uploaded documents (PDFs, Word, etc.) (personal files)", + "NOTE": "SurfSense Notes (notes created inside SurfSense)", + "SLACK_CONNECTOR": "Slack conversations and shared content (personal workspace communications)", + "TEAMS_CONNECTOR": "Microsoft Teams messages and conversations (personal Teams communications)", + "NOTION_CONNECTOR": "Notion workspace pages and databases (personal knowledge management)", + "YOUTUBE_VIDEO": "YouTube video transcripts and metadata (personally saved videos)", + "GITHUB_CONNECTOR": "GitHub repository content and issues (personal repositories and interactions)", + "ELASTICSEARCH_CONNECTOR": "Elasticsearch indexed documents and data (personal Elasticsearch instances)", + "LINEAR_CONNECTOR": "Linear project issues and discussions (personal project management)", + "JIRA_CONNECTOR": "Jira project issues, tickets, and comments (personal project tracking)", + "CONFLUENCE_CONNECTOR": "Confluence pages and comments (personal project documentation)", + "CLICKUP_CONNECTOR": "ClickUp tasks and project data (personal task management)", + "GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events, meetings, and schedules (personal calendar)", + "GOOGLE_GMAIL_CONNECTOR": "Google Gmail emails and conversations (personal emails)", + "GOOGLE_DRIVE_FILE": "Google Drive files and documents (personal cloud storage)", + "DISCORD_CONNECTOR": "Discord server conversations and shared content (personal community)", + "AIRTABLE_CONNECTOR": "Airtable records, tables, and database content (personal data)", + "TAVILY_API": "Tavily web search API results (real-time web search)", + "SEARXNG_API": "SearxNG search API results (privacy-focused web search)", + "LINKUP_API": "Linkup search API results (web search)", + "BAIDU_SEARCH_API": "Baidu search API results (Chinese web search)", + "LUMA_CONNECTOR": "Luma events and meetings", + "WEBCRAWLER_CONNECTOR": "Webpages indexed by SurfSense (personally selected websites)", + "CRAWLED_URL": "Webpages indexed by 
SurfSense (personally selected websites)", + "BOOKSTACK_CONNECTOR": "BookStack pages (personal documentation)", + "CIRCLEBACK": "Circleback meeting notes, transcripts, and action items", + "OBSIDIAN_CONNECTOR": "Obsidian vault notes and markdown files (personal notes)", + # Composio connectors + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "Google Drive files via Composio (personal cloud storage)", + "COMPOSIO_GMAIL_CONNECTOR": "Gmail emails via Composio (personal emails)", + "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events via Composio (personal calendar)", +} -def _normalize_connectors(connectors_to_search: list[str] | None) -> list[str]: + +def _normalize_connectors( + connectors_to_search: list[str] | None, + available_connectors: list[str] | None = None, +) -> list[str]: """ Normalize connectors provided by the model. - Accepts user-facing enums like WEBCRAWLER_CONNECTOR and maps them to canonical ConnectorService types. - Drops unknown values. - - If None/empty, defaults to searching across all known connectors. + - If available_connectors is provided, only includes connectors from that list. + - If connectors_to_search is None/empty, defaults to available_connectors or all. + + Args: + connectors_to_search: List of connectors requested by the model + available_connectors: List of connectors actually available in the search space + + Returns: + List of normalized connector strings to search """ + # Determine the set of valid connectors to consider + valid_set = ( + set(available_connectors) if available_connectors else set(_ALL_CONNECTORS) + ) + if not connectors_to_search: - return list(_ALL_CONNECTORS) + # Search all available connectors if none specified + return ( + list(available_connectors) + if available_connectors + else list(_ALL_CONNECTORS) + ) normalized: list[str] = [] for raw in connectors_to_search: c = (raw or "").strip().upper() if not c: continue + # Map user-facing aliases to canonical names if c == "WEBCRAWLER_CONNECTOR": c = "CRAWLED_URL" normalized.append(c) - # de-dupe while preserving order + filter unknown + # de-dupe while preserving order + filter to valid connectors seen: set[str] = set() out: list[str] = [] for c in normalized: if c in seen: continue + # Only include if it's a known connector AND available if c not in _ALL_CONNECTORS: continue + if c not in valid_set: + continue seen.add(c) out.append(c) - return out if out else list(_ALL_CONNECTORS) + + # Fallback to all available if nothing matched + return ( + out + if out + else ( + list(available_connectors) + if available_connectors + else list(_ALL_CONNECTORS) + ) + ) # ============================================================================= @@ -233,6 +311,7 @@ async def search_knowledge_base_async( top_k: int = 10, start_date: datetime | None = None, end_date: datetime | None = None, + available_connectors: list[str] | None = None, ) -> str: """ Search the user's knowledge base for relevant documents. @@ -248,6 +327,8 @@ async def search_knowledge_base_async( top_k: Number of results per connector start_date: Optional start datetime (UTC) for filtering documents end_date: Optional end datetime (UTC) for filtering documents + available_connectors: Optional list of connectors actually available in the search space. + If provided, only these connectors will be searched. 
Returns: Formatted string with search results @@ -262,7 +343,7 @@ async def search_knowledge_base_async( end_date=end_date, ) - connectors = _normalize_connectors(connectors_to_search) + connectors = _normalize_connectors(connectors_to_search, available_connectors) for connector in connectors: try: @@ -316,6 +397,16 @@ async def search_knowledge_base_async( ) all_documents.extend(chunks) + elif connector == "TEAMS_CONNECTOR": + _, chunks = await connector_service.search_teams( + user_query=query, + search_space_id=search_space_id, + top_k=top_k, + start_date=resolved_start_date, + end_date=resolved_end_date, + ) + all_documents.extend(chunks) + elif connector == "NOTION_CONNECTOR": _, chunks = await connector_service.search_notion( user_query=query, @@ -519,6 +610,39 @@ async def search_knowledge_base_async( ) all_documents.extend(chunks) + # ========================================================= + # Composio Connectors + # ========================================================= + elif connector == "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": + _, chunks = await connector_service.search_composio_google_drive( + user_query=query, + search_space_id=search_space_id, + top_k=top_k, + start_date=resolved_start_date, + end_date=resolved_end_date, + ) + all_documents.extend(chunks) + + elif connector == "COMPOSIO_GMAIL_CONNECTOR": + _, chunks = await connector_service.search_composio_gmail( + user_query=query, + search_space_id=search_space_id, + top_k=top_k, + start_date=resolved_start_date, + end_date=resolved_end_date, + ) + all_documents.extend(chunks) + + elif connector == "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": + _, chunks = await connector_service.search_composio_google_calendar( + user_query=query, + search_space_id=search_space_id, + top_k=top_k, + start_date=resolved_start_date, + end_date=resolved_end_date, + ) + all_documents.extend(chunks) + except Exception as e: print(f"Error searching connector {connector}: {e}") continue @@ -543,11 +667,68 @@ async def search_knowledge_base_async( return format_documents_for_context(deduplicated) +def _build_connector_docstring(available_connectors: list[str] | None) -> str: + """ + Build the connector documentation section for the tool docstring. + + Args: + available_connectors: List of available connector types, or None for all + + Returns: + Formatted docstring section listing available connectors + """ + connectors = available_connectors if available_connectors else list(_ALL_CONNECTORS) + + lines = [] + for connector in connectors: + # Skip internal names, prefer user-facing aliases + if connector == "CRAWLED_URL": + # Show as WEBCRAWLER_CONNECTOR for user-facing docs + description = CONNECTOR_DESCRIPTIONS.get(connector, connector) + lines.append(f"- WEBCRAWLER_CONNECTOR: {description}") + else: + description = CONNECTOR_DESCRIPTIONS.get(connector, connector) + lines.append(f"- {connector}: {description}") + + return "\n".join(lines) + + +# ============================================================================= +# Tool Input Schema +# ============================================================================= + + +class SearchKnowledgeBaseInput(BaseModel): + """Input schema for the search_knowledge_base tool.""" + + query: str = Field( + description="The search query - be specific and include key terms" + ) + top_k: int = Field( + default=10, + description="Number of results to retrieve (default: 10)", + ) + start_date: str | None = Field( + default=None, + description="Optional ISO date/datetime (e.g. 
'2025-12-12' or '2025-12-12T00:00:00+00:00')", + ) + end_date: str | None = Field( + default=None, + description="Optional ISO date/datetime (e.g. '2025-12-19' or '2025-12-19T23:59:59+00:00')", + ) + connectors_to_search: list[str] | None = Field( + default=None, + description="Optional list of connector enums to search. If omitted, searches all available.", + ) + + def create_search_knowledge_base_tool( search_space_id: int, db_session: AsyncSession, connector_service: ConnectorService, -): + available_connectors: list[str] | None = None, + available_document_types: list[str] | None = None, +) -> StructuredTool: """ Factory function to create the search_knowledge_base tool with injected dependencies. @@ -555,72 +736,57 @@ def create_search_knowledge_base_tool( search_space_id: The user's search space ID db_session: Database session connector_service: Initialized connector service + available_connectors: Optional list of connector types available in the search space. + Used to dynamically generate the tool docstring. + available_document_types: Optional list of document types that have data in the search space. + Used to inform the LLM about what data exists. Returns: - A configured tool function + A configured StructuredTool instance """ + # Build connector documentation dynamically + connector_docs = _build_connector_docstring(available_connectors) - @tool - async def search_knowledge_base( + # Build context about available document types + doc_types_info = "" + if available_document_types: + doc_types_info = f""" + +## Document types with indexed content in this search space + +The following document types have content available for search: +{", ".join(available_document_types)} + +Focus searches on these types for best results.""" + + # Build the dynamic description for the tool + # This is what the LLM sees when deciding whether/how to use the tool + dynamic_description = f"""Search the user's personal knowledge base for relevant information. + +Use this tool to find documents, notes, files, web pages, and other content that may help answer the user's question. + +IMPORTANT: +- If the user requests a specific source type (e.g. "my notes", "Slack messages"), pass `connectors_to_search=[...]` using the enums below. +- If `connectors_to_search` is omitted/empty, the system will search broadly. +- Only connectors that are enabled/configured for this search space are available.{doc_types_info} + +## Available connector enums for `connectors_to_search` + +{connector_docs} + +NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`.""" + + # Capture for closure + _available_connectors = available_connectors + + async def _search_knowledge_base_impl( query: str, top_k: int = 10, start_date: str | None = None, end_date: str | None = None, connectors_to_search: list[str] | None = None, ) -> str: - """ - Search the user's personal knowledge base for relevant information. - - Use this tool to find documents, notes, files, web pages, and other content - that may help answer the user's question. - - IMPORTANT: - - If the user requests a specific source type (e.g. "my notes", "Slack messages"), - pass `connectors_to_search=[...]` using the enums below. - - If `connectors_to_search` is omitted/empty, the system will search broadly. 
- - ## Available connector enums for `connectors_to_search` - - - EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history) - - FILE: "User-uploaded documents (PDFs, Word, etc.)" (personal files) - - NOTE: "SurfSense Notes" (notes created inside SurfSense) - - SLACK_CONNECTOR: "Slack conversations and shared content" (personal workspace communications) - - TEAMS_CONNECTOR: "Microsoft Teams messages and conversations" (personal Teams communications) - - NOTION_CONNECTOR: "Notion workspace pages and databases" (personal knowledge management) - - YOUTUBE_VIDEO: "YouTube video transcripts and metadata" (personally saved videos) - - GITHUB_CONNECTOR: "GitHub repository content and issues" (personal repositories and interactions) - - ELASTICSEARCH_CONNECTOR: "Elasticsearch indexed documents and data" (personal Elasticsearch instances and custom data sources) - - LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project management) - - JIRA_CONNECTOR: "Jira project issues, tickets, and comments" (personal project tracking) - - CONFLUENCE_CONNECTOR: "Confluence pages and comments" (personal project documentation) - - CLICKUP_CONNECTOR: "ClickUp tasks and project data" (personal task management) - - GOOGLE_CALENDAR_CONNECTOR: "Google Calendar events, meetings, and schedules" (personal calendar and time management) - - GOOGLE_GMAIL_CONNECTOR: "Google Gmail emails and conversations" (personal emails and communications) - - GOOGLE_DRIVE_FILE: "Google Drive files and documents" (personal cloud storage and file management) - - DISCORD_CONNECTOR: "Discord server conversations and shared content" (personal community communications) - - AIRTABLE_CONNECTOR: "Airtable records, tables, and database content" (personal data management and organization) - - TAVILY_API: "Tavily search API results" (personalized search results) - - SEARXNG_API: "SearxNG search API results" (personalized search results) - - LINKUP_API: "Linkup search API results" (personalized search results) - - BAIDU_SEARCH_API: "Baidu search API results" (personalized search results) - - LUMA_CONNECTOR: "Luma events" - - WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense" (personally selected websites) - - BOOKSTACK_CONNECTOR: "BookStack pages" (personal documentation) - - CIRCLEBACK: "Circleback meeting notes, transcripts, and action items" (personal meeting records) - - OBSIDIAN_CONNECTOR: "Obsidian vault notes and markdown files" (personal notes and knowledge management) - - NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`. - - Args: - query: The search query - be specific and include key terms - top_k: Number of results to retrieve (default: 10) - start_date: Optional ISO date/datetime (e.g. "2025-12-12" or "2025-12-12T00:00:00+00:00") - end_date: Optional ISO date/datetime (e.g. "2025-12-19" or "2025-12-19T23:59:59+00:00") - connectors_to_search: Optional list of connector enums to search. If omitted, searches all. 
- - Returns: - Formatted string with relevant documents and their content - """ + """Implementation function for knowledge base search.""" from app.agents.new_chat.utils import parse_date_or_datetime parsed_start: datetime | None = None @@ -640,6 +806,16 @@ def create_search_knowledge_base_tool( top_k=top_k, start_date=parsed_start, end_date=parsed_end, + available_connectors=_available_connectors, ) - return search_knowledge_base + # Create StructuredTool with dynamic description + # This properly sets the description that the LLM sees + tool = StructuredTool( + name="search_knowledge_base", + description=dynamic_description, + coroutine=_search_knowledge_base_impl, + args_schema=SearchKnowledgeBaseInput, + ) + + return tool diff --git a/surfsense_backend/app/agents/new_chat/tools/registry.py b/surfsense_backend/app/agents/new_chat/tools/registry.py index e4ce7a6b7..968e51445 100644 --- a/surfsense_backend/app/agents/new_chat/tools/registry.py +++ b/surfsense_backend/app/agents/new_chat/tools/registry.py @@ -85,6 +85,7 @@ class ToolDefinition: # Contributors: Add your new tools here! BUILTIN_TOOLS: list[ToolDefinition] = [ # Core tool - searches the user's knowledge base + # Now supports dynamic connector/document type discovery ToolDefinition( name="search_knowledge_base", description="Search the user's personal knowledge base for relevant information", @@ -92,8 +93,12 @@ BUILTIN_TOOLS: list[ToolDefinition] = [ search_space_id=deps["search_space_id"], db_session=deps["db_session"], connector_service=deps["connector_service"], + # Optional: dynamically discovered connectors/document types + available_connectors=deps.get("available_connectors"), + available_document_types=deps.get("available_document_types"), ), requires=["search_space_id", "db_session", "connector_service"], + # Note: available_connectors and available_document_types are optional ), # Podcast generation tool ToolDefinition( diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py index dc43697e7..4c5599815 100644 --- a/surfsense_backend/app/services/connector_service.py +++ b/surfsense_backend/app/services/connector_service.py @@ -2871,3 +2871,350 @@ class ConnectorService: } return result_object, obsidian_docs + + # ========================================================================= + # Composio Connector Search Methods + # ========================================================================= + + async def search_composio_google_drive( + self, + user_query: str, + search_space_id: int, + top_k: int = 20, + start_date: datetime | None = None, + end_date: datetime | None = None, + ) -> tuple: + """ + Search for Composio Google Drive files and return both the source information + and langchain documents. + + Uses combined chunk-level and document-level hybrid search with RRF fusion. 
+ + Args: + user_query: The user's query + search_space_id: The search space ID to search in + top_k: Maximum number of results to return + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at + + Returns: + tuple: (sources_info, langchain_documents) + """ + composio_drive_docs = await self._combined_rrf_search( + query_text=user_query, + search_space_id=search_space_id, + document_type="COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + top_k=top_k, + start_date=start_date, + end_date=end_date, + ) + + # Early return if no results + if not composio_drive_docs: + return { + "id": 54, + "name": "Google Drive (Composio)", + "type": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "sources": [], + }, [] + + def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return ( + doc_info.get("title") + or metadata.get("title") + or metadata.get("file_name") + or "Untitled Document" + ) + + def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return metadata.get("url") or metadata.get("web_view_link") or "" + + def _description_fn( + chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> str: + description = self._chunk_preview(chunk.get("content", ""), limit=200) + info_parts = [] + mime_type = metadata.get("mime_type") + modified_time = metadata.get("modified_time") + if mime_type: + info_parts.append(f"Type: {mime_type}") + if modified_time: + info_parts.append(f"Modified: {modified_time}") + if info_parts: + description = (description + " | " + " | ".join(info_parts)).strip(" |") + return description + + def _extra_fields_fn( + _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> dict[str, Any]: + return { + "mime_type": metadata.get("mime_type", ""), + "file_id": metadata.get("file_id", ""), + "modified_time": metadata.get("modified_time", ""), + } + + sources_list = self._build_chunk_sources_from_documents( + composio_drive_docs, + title_fn=_title_fn, + url_fn=_url_fn, + description_fn=_description_fn, + extra_fields_fn=_extra_fields_fn, + ) + + # Create result object + result_object = { + "id": 54, + "name": "Google Drive (Composio)", + "type": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", + "sources": sources_list, + } + + return result_object, composio_drive_docs + + async def search_composio_gmail( + self, + user_query: str, + search_space_id: int, + top_k: int = 20, + start_date: datetime | None = None, + end_date: datetime | None = None, + ) -> tuple: + """ + Search for Composio Gmail messages and return both the source information + and langchain documents. + + Uses combined chunk-level and document-level hybrid search with RRF fusion. 
+ + Args: + user_query: The user's query + search_space_id: The search space ID to search in + top_k: Maximum number of results to return + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at + + Returns: + tuple: (sources_info, langchain_documents) + """ + composio_gmail_docs = await self._combined_rrf_search( + query_text=user_query, + search_space_id=search_space_id, + document_type="COMPOSIO_GMAIL_CONNECTOR", + top_k=top_k, + start_date=start_date, + end_date=end_date, + ) + + # Early return if no results + if not composio_gmail_docs: + return { + "id": 55, + "name": "Gmail (Composio)", + "type": "COMPOSIO_GMAIL_CONNECTOR", + "sources": [], + }, [] + + def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return ( + doc_info.get("title") + or metadata.get("subject") + or metadata.get("title") + or "Untitled Email" + ) + + def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return metadata.get("url") or "" + + def _description_fn( + chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> str: + description = self._chunk_preview(chunk.get("content", ""), limit=200) + info_parts = [] + sender = metadata.get("from") or metadata.get("sender") + date = metadata.get("date") or metadata.get("received_at") + if sender: + info_parts.append(f"From: {sender}") + if date: + info_parts.append(f"Date: {date}") + if info_parts: + description = (description + " | " + " | ".join(info_parts)).strip(" |") + return description + + def _extra_fields_fn( + _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> dict[str, Any]: + return { + "message_id": metadata.get("message_id", ""), + "thread_id": metadata.get("thread_id", ""), + "from": metadata.get("from", ""), + "to": metadata.get("to", ""), + "date": metadata.get("date", ""), + } + + sources_list = self._build_chunk_sources_from_documents( + composio_gmail_docs, + title_fn=_title_fn, + url_fn=_url_fn, + description_fn=_description_fn, + extra_fields_fn=_extra_fields_fn, + ) + + # Create result object + result_object = { + "id": 55, + "name": "Gmail (Composio)", + "type": "COMPOSIO_GMAIL_CONNECTOR", + "sources": sources_list, + } + + return result_object, composio_gmail_docs + + async def search_composio_google_calendar( + self, + user_query: str, + search_space_id: int, + top_k: int = 20, + start_date: datetime | None = None, + end_date: datetime | None = None, + ) -> tuple: + """ + Search for Composio Google Calendar events and return both the source information + and langchain documents. + + Uses combined chunk-level and document-level hybrid search with RRF fusion. 
+ + Args: + user_query: The user's query + search_space_id: The search space ID to search in + top_k: Maximum number of results to return + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at + + Returns: + tuple: (sources_info, langchain_documents) + """ + composio_calendar_docs = await self._combined_rrf_search( + query_text=user_query, + search_space_id=search_space_id, + document_type="COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", + top_k=top_k, + start_date=start_date, + end_date=end_date, + ) + + # Early return if no results + if not composio_calendar_docs: + return { + "id": 56, + "name": "Google Calendar (Composio)", + "type": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", + "sources": [], + }, [] + + def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return ( + doc_info.get("title") + or metadata.get("summary") + or metadata.get("title") + or "Untitled Event" + ) + + def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: + return metadata.get("url") or metadata.get("html_link") or "" + + def _description_fn( + chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> str: + description = self._chunk_preview(chunk.get("content", ""), limit=200) + info_parts = [] + start_time = metadata.get("start_time") or metadata.get("start") + end_time = metadata.get("end_time") or metadata.get("end") + if start_time: + info_parts.append(f"Start: {start_time}") + if end_time: + info_parts.append(f"End: {end_time}") + if info_parts: + description = (description + " | " + " | ".join(info_parts)).strip(" |") + return description + + def _extra_fields_fn( + _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] + ) -> dict[str, Any]: + return { + "event_id": metadata.get("event_id", ""), + "calendar_id": metadata.get("calendar_id", ""), + "start_time": metadata.get("start_time", ""), + "end_time": metadata.get("end_time", ""), + "location": metadata.get("location", ""), + } + + sources_list = self._build_chunk_sources_from_documents( + composio_calendar_docs, + title_fn=_title_fn, + url_fn=_url_fn, + description_fn=_description_fn, + extra_fields_fn=_extra_fields_fn, + ) + + # Create result object + result_object = { + "id": 56, + "name": "Google Calendar (Composio)", + "type": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", + "sources": sources_list, + } + + return result_object, composio_calendar_docs + + # ========================================================================= + # Utility Methods for Connector Discovery + # ========================================================================= + + async def get_available_connectors( + self, + search_space_id: int, + ) -> list[SearchSourceConnectorType]: + """ + Get all available (enabled) connector types for a search space. + + Args: + search_space_id: The search space ID + + Returns: + List of SearchSourceConnectorType enums for enabled connectors + """ + query = ( + select(SearchSourceConnector.connector_type) + .filter( + SearchSourceConnector.search_space_id == search_space_id, + ) + .distinct() + ) + + result = await self.session.execute(query) + connector_types = result.scalars().all() + return list(connector_types) + + async def get_available_document_types( + self, + search_space_id: int, + ) -> list[str]: + """ + Get all document types that have at least one document in the search space. 
+ + Args: + search_space_id: The search space ID + + Returns: + List of document type strings that have documents indexed + """ + from sqlalchemy import distinct + + from app.db import Document + + query = select(distinct(Document.document_type)).filter( + Document.search_space_id == search_space_id, + ) + + result = await self.session.execute(query) + doc_types = result.scalars().all() + return [str(dt) for dt in doc_types] From 555df90c842c52ac50a708a38f424287e1fc88b5 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Sat, 24 Jan 2026 17:47:18 -0800 Subject: [PATCH 27/28] chore: New connector statuses for Composio and GitHub --- .../config/connector-status-config.json | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json b/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json index b729c3f8b..2c1010b1c 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json +++ b/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json @@ -24,6 +24,16 @@ "enabled": true, "status": "warning", "statusMessage": "Some requests may be blocked if not using Firecrawl." + }, + "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": { + "enabled": false, + "status": "disabled", + "statusMessage": "Not available yet." + }, + "GITHUB_CONNECTOR": { + "enabled": false, + "status": "warning", + "statusMessage": "Some issues with indexing repositories." } }, "globalSettings": { From 09162ad5cad4d627aa070f881830f9ca95b9d2ee Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Sat, 24 Jan 2026 17:53:57 -0800 Subject: [PATCH 28/28] release: 0.0.12 --- surfsense_backend/pyproject.toml | 2 +- surfsense_backend/uv.lock | 2 +- surfsense_browser_extension/package.json | 2 +- surfsense_web/package.json | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml index ffe9e5232..57dbdc7b5 100644 --- a/surfsense_backend/pyproject.toml +++ b/surfsense_backend/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "surf-new-backend" -version = "0.0.11" +version = "0.0.12" description = "SurfSense Backend" requires-python = ">=3.12" dependencies = [ diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock index 18f04288e..16b77a7b2 100644 --- a/surfsense_backend/uv.lock +++ b/surfsense_backend/uv.lock @@ -6545,7 +6545,7 @@ wheels = [ [[package]] name = "surf-new-backend" -version = "0.0.11" +version = "0.0.12" source = { editable = "." } dependencies = [ { name = "alembic" }, diff --git a/surfsense_browser_extension/package.json b/surfsense_browser_extension/package.json index b225bc206..bf926d09f 100644 --- a/surfsense_browser_extension/package.json +++ b/surfsense_browser_extension/package.json @@ -1,7 +1,7 @@ { "name": "surfsense_browser_extension", "displayName": "Surfsense Browser Extension", - "version": "0.0.11", + "version": "0.0.12", "description": "Extension to collect Browsing History for SurfSense.", "author": "https://github.com/MODSetter", "engines": { diff --git a/surfsense_web/package.json b/surfsense_web/package.json index 7ec05c95d..235f4b9db 100644 --- a/surfsense_web/package.json +++ b/surfsense_web/package.json @@ -1,6 +1,6 @@ { "name": "surfsense_web", - "version": "0.0.11", + "version": "0.0.12", "private": true, "description": "SurfSense Frontend", "scripts": {
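
A note for readers tracing PATCH 26: the discovery flow runs once at agent-creation time. ConnectorService.get_available_connectors pulls the distinct connector types for the search space, _map_connectors_to_searchable_types translates them into the document/connector types the knowledge_base tool can actually query, and the result is threaded through the tools registry as available_connectors. Below is a minimal, self-contained sketch of the mapping step; the table is abbreviated here and the function is renamed for standalone use, so treat it as an illustration rather than the shipped code.

# Abbreviated sketch of _map_connectors_to_searchable_types (PATCH 26).
_CONNECTOR_TYPE_TO_SEARCHABLE = {
    "SLACK_CONNECTOR": "SLACK_CONNECTOR",  # direct mapping
    "GOOGLE_DRIVE_CONNECTOR": "GOOGLE_DRIVE_FILE",  # connector type != document type
    "WEBCRAWLER_CONNECTOR": "CRAWLED_URL",  # maps to a document type
    "COMPOSIO_GMAIL_CONNECTOR": "COMPOSIO_GMAIL_CONNECTOR",
}

_ALWAYS_AVAILABLE_DOC_TYPES = ["EXTENSION", "FILE", "NOTE", "YOUTUBE_VIDEO"]


def map_connectors_to_searchable_types(connector_types: list) -> list[str]:
    """Always-available doc types first, then mapped types, deduped in order."""
    seen: set[str] = set()
    out: list[str] = []
    candidates = _ALWAYS_AVAILABLE_DOC_TYPES + [
        _CONNECTOR_TYPE_TO_SEARCHABLE.get(
            ct.value if hasattr(ct, "value") else str(ct), ""
        )
        for ct in connector_types
    ]
    for value in candidates:
        if value and value not in seen:  # unknown types map to "" and are dropped
            seen.add(value)
            out.append(value)
    return out


print(map_connectors_to_searchable_types(
    ["GOOGLE_DRIVE_CONNECTOR", "SLACK_CONNECTOR", "GOOGLE_DRIVE_CONNECTOR"]
))
# ['EXTENSION', 'FILE', 'NOTE', 'YOUTUBE_VIDEO', 'GOOGLE_DRIVE_FILE', 'SLACK_CONNECTOR']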
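
The other load-bearing change in knowledge_base.py is the move from the @tool decorator to a directly constructed StructuredTool: the tool description is assembled at factory time from the discovered connectors, so the LLM sees only enums that can actually return results for this search space. A hedged, standalone sketch of that pattern follows; the description text and the stub implementation are simplified placeholders, not the patch's real ones.

from langchain_core.tools import StructuredTool
from pydantic import BaseModel, Field


class SearchInput(BaseModel):
    query: str = Field(description="The search query")


def make_search_tool(available_connectors: list[str]) -> StructuredTool:
    # The description is computed per search space instead of living in a
    # static docstring, mirroring the dynamic_description in PATCH 26.
    description = (
        "Search the user's knowledge base.\n\nAvailable connector enums:\n"
        + "\n".join(f"- {c}" for c in available_connectors)
    )

    async def _impl(query: str) -> str:
        # Stub: the real implementation fans out to ConnectorService searches.
        return f"searched {available_connectors} for {query!r}"

    return StructuredTool(
        name="search_knowledge_base",
        description=description,
        coroutine=_impl,
        args_schema=SearchInput,
    )


print(make_search_tool(["NOTE", "SLACK_CONNECTOR"]).description)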
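
Each new search_composio_* method in connector_service.py follows one shape: run _combined_rrf_search against a single document_type, return early with an empty sources list when nothing matches, then build per-chunk sources through _build_chunk_sources_from_documents using small connector-specific callbacks for title, URL, description, and extra fields. The sketch below shows only the callback idea; build_sources is a toy stand-in whose signature is assumed, not the internal helper's.

from collections.abc import Callable
from typing import Any


def build_sources(
    docs: list[dict[str, Any]],
    title_fn: Callable[[dict[str, Any], dict[str, Any]], str],
    url_fn: Callable[[dict[str, Any], dict[str, Any]], str],
) -> list[dict[str, str]]:
    # Toy version of _build_chunk_sources_from_documents: one source per doc.
    sources = []
    for doc in docs:
        metadata = doc.get("metadata", {})
        sources.append({"title": title_fn(doc, metadata), "url": url_fn(doc, metadata)})
    return sources


def gmail_title(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
    # Mirrors the patch's fallback chain: document title -> subject -> placeholder.
    return doc_info.get("title") or metadata.get("subject") or "Untitled Email"


docs = [{"title": None, "metadata": {"subject": "Q1 planning", "url": ""}}]
print(build_sources(docs, gmail_title, lambda _d, m: m.get("url") or ""))
# [{'title': 'Q1 planning', 'url': ''}]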