-        This connection uses Composio's managed OAuth, which means you don't need to
-        wait for app verification. Your data is securely accessed through Composio.
-        Connect these services for future indexing support. Currently available for connection only.
-        {nonIndexableToolkits.map((toolkit) => (
-          {getToolkitIcon(toolkit.id, "size-5")}
-          Soon
           {toolkit.name}
-          {toolkit.description}
-        ))}
-        {/* Info footer */}
           Why use Composio?
-          Composio provides pre-verified OAuth apps, so you don't need to wait for Google app verification.
-          Your data is securely processed through Composio's managed authentication.
-	);
-};
From 4cbf80d73a74170a532cf1b531d7a9d670cc4663 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Fri, 23 Jan 2026 04:44:37 +0530
Subject: [PATCH 03/28] feat: enhance Composio integration with pagination and
improved error handling
- Updated the list_gmail_messages method to support pagination with page tokens, allowing for more efficient message retrieval.
- Modified the return structure to include next_page_token and result_size_estimate for better client-side handling.
- Improved error handling and logging throughout the Gmail indexing process, ensuring better visibility into failures.
- Implemented batch processing for Gmail messages, committing changes incrementally to prevent data loss.
- Ensured consistent timestamp updates for connectors, even when no documents are indexed, to maintain accurate UI states.
- Refactored the indexing logic to streamline message processing and enhance overall performance.
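For reference, a minimal sketch (not the shipped code) of how a caller drains the paginated API this patch introduces; the `fetch_all_messages` helper is hypothetical, while the `list_gmail_messages` 4-tuple return and the 50-message page size match the diff below.

```python
# Hypothetical caller sketch: drain list_gmail_messages() page by page.
# Assumes the return shape added in this patch:
#   (messages, next_page_token, result_size_estimate, error)
async def fetch_all_messages(connector, query: str, max_items: int = 1000) -> list[dict]:
    fetched: list[dict] = []
    page_token: str | None = None
    while len(fetched) < max_items:
        messages, page_token, _estimate, error = await connector.list_gmail_messages(
            query=query,
            max_results=min(50, max_items - len(fetched)),  # 50 per page avoids 413 payload errors
            page_token=page_token,
        )
        if error:
            raise RuntimeError(f"Gmail fetch failed: {error}")
        if not messages:
            break
        fetched.extend(messages)
        if not page_token:  # no next-page token means we are done
            break
    return fetched
```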
---
.../app/connectors/composio_connector.py | 15 +-
.../routes/search_source_connectors_routes.py | 16 +-
.../app/services/composio_service.py | 54 +-
.../app/tasks/composio_indexer.py | 579 ++++++++++++------
4 files changed, 451 insertions(+), 213 deletions(-)
diff --git a/surfsense_backend/app/connectors/composio_connector.py b/surfsense_backend/app/connectors/composio_connector.py
index 18fd9564c..21e339d12 100644
--- a/surfsense_backend/app/connectors/composio_connector.py
+++ b/surfsense_backend/app/connectors/composio_connector.py
@@ -151,21 +151,23 @@ class ComposioConnector:
async def list_gmail_messages(
self,
query: str = "",
- max_results: int = 100,
- ) -> tuple[list[dict[str, Any]], str | None]:
+ max_results: int = 50,
+ page_token: str | None = None,
+ ) -> tuple[list[dict[str, Any]], str | None, int | None, str | None]:
"""
- List Gmail messages via Composio.
+ List Gmail messages via Composio with pagination support.
Args:
query: Gmail search query.
- max_results: Maximum number of messages.
+ max_results: Maximum number of messages per page (default: 50).
+ page_token: Optional pagination token for next page.
Returns:
- Tuple of (messages list, error message).
+ Tuple of (messages list, next_page_token, result_size_estimate, error message).
"""
connected_account_id = await self.get_connected_account_id()
if not connected_account_id:
- return [], "No connected account ID found"
+ return [], None, None, "No connected account ID found"
entity_id = await self.get_entity_id()
service = await self._get_service()
@@ -174,6 +176,7 @@ class ComposioConnector:
entity_id=entity_id,
query=query,
max_results=max_results,
+ page_token=page_token,
)
async def get_gmail_message_detail(
diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py
index 9ad03fba8..1578ad0d5 100644
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@@ -957,7 +957,7 @@ async def _update_connector_timestamp_by_id(session: AsyncSession, connector_id:
connector = result.scalars().first()
if connector:
- connector.last_indexed_at = datetime.now()
+ connector.last_indexed_at = datetime.now(UTC) # Use UTC for timezone consistency
await session.commit()
logger.info(f"Updated last_indexed_at for connector {connector_id}")
except Exception as e:
@@ -1097,18 +1097,22 @@ async def _run_indexing_with_notifications(
)
await update_timestamp_func(session, connector_id)
+ await session.commit() # Commit timestamp update
logger.info(
f"Indexing completed successfully: {documents_processed} documents processed"
)
# Update notification on success
if notification:
+ # Refresh notification to ensure it's not stale after timestamp update commit
+ await session.refresh(notification)
await NotificationService.connector_indexing.notify_indexing_completed(
session=session,
notification=notification,
indexed_count=documents_processed,
error_message=None,
)
+ await session.commit() # Commit to ensure Electric SQL syncs the notification update
elif documents_processed > 0:
# Update notification to storing stage
if notification:
@@ -1124,24 +1128,30 @@ async def _run_indexing_with_notifications(
f"Indexing completed successfully: {documents_processed} documents processed"
)
if notification:
+ # Refresh notification to ensure it's not stale after indexing function commits
+ await session.refresh(notification)
await NotificationService.connector_indexing.notify_indexing_completed(
session=session,
notification=notification,
indexed_count=documents_processed,
error_message=None,
)
+ await session.commit() # Commit to ensure Electric SQL syncs the notification update
else:
# No new documents processed - check if this is an error or just no changes
if error_or_warning:
# Actual failure
logger.error(f"Indexing failed: {error_or_warning}")
if notification:
+ # Refresh notification to ensure it's not stale after indexing function commits
+ await session.refresh(notification)
await NotificationService.connector_indexing.notify_indexing_completed(
session=session,
notification=notification,
indexed_count=0,
error_message=error_or_warning,
)
+ await session.commit() # Commit to ensure Electric SQL syncs the notification update
else:
# Success - just no new documents to index (all skipped/unchanged)
logger.info(
@@ -1150,13 +1160,17 @@ async def _run_indexing_with_notifications(
# Still update timestamp so ElectricSQL syncs and clears "Syncing" UI
if update_timestamp_func:
await update_timestamp_func(session, connector_id)
+ await session.commit() # Commit timestamp update
if notification:
+ # Refresh notification to ensure it's not stale after timestamp update commit
+ await session.refresh(notification)
await NotificationService.connector_indexing.notify_indexing_completed(
session=session,
notification=notification,
indexed_count=0,
error_message=None, # No error - sync succeeded
)
+ await session.commit() # Commit to ensure Electric SQL syncs the notification update
except Exception as e:
logger.error(f"Error in indexing task: {e!s}", exc_info=True)
diff --git a/surfsense_backend/app/services/composio_service.py b/surfsense_backend/app/services/composio_service.py
index 17fbd64e0..e32cbf8a0 100644
--- a/surfsense_backend/app/services/composio_service.py
+++ b/surfsense_backend/app/services/composio_service.py
@@ -256,7 +256,6 @@ class ComposioService:
"user_id": getattr(acc, "user_id", None),
})
- logger.info(f"DEBUG: Found {len(result)} TOTAL connections in Composio")
return result
except Exception as e:
logger.error(f"Failed to list all connections: {e!s}")
@@ -273,7 +272,6 @@ class ComposioService:
List of connected account details.
"""
try:
- logger.info(f"DEBUG: Calling connected_accounts.list(user_id='{user_id}')")
accounts_response = self.client.connected_accounts.list(user_id=user_id)
# Handle paginated response (may have .items attribute) or direct list
@@ -358,7 +356,6 @@ class ComposioService:
# - connected_account_id: for authentication
# - user_id: user identifier (SDK uses user_id, not entity_id)
# - dangerously_skip_version_check: skip version check for manual execution
- logger.info(f"DEBUG: Executing tool {tool_name} with params: {params}")
result = self.client.tools.execute(
slug=tool_name,
connected_account_id=connected_account_id,
@@ -366,8 +363,6 @@ class ComposioService:
arguments=params or {},
dangerously_skip_version_check=True,
)
- logger.info(f"DEBUG: Tool {tool_name} raw result type: {type(result)}")
- logger.info(f"DEBUG: Tool {tool_name} raw result: {result}")
return {"success": True, "data": result}
except Exception as e:
logger.error(f"Failed to execute tool {tool_name}: {e!s}")
@@ -417,7 +412,6 @@ class ComposioService:
return [], None, result.get("error", "Unknown error")
data = result.get("data", {})
- logger.info(f"DEBUG: Drive data type: {type(data)}, keys: {data.keys() if isinstance(data, dict) else 'N/A'}")
# Handle nested response structure from Composio
files = []
@@ -429,7 +423,6 @@ class ComposioService:
elif isinstance(data, list):
files = data
- logger.info(f"DEBUG: Extracted {len(files)} drive files")
return files, next_token, None
except Exception as e:
@@ -478,25 +471,30 @@ class ComposioService:
connected_account_id: str,
entity_id: str,
query: str = "",
- max_results: int = 100,
- ) -> tuple[list[dict[str, Any]], str | None]:
+ max_results: int = 50,
+ page_token: str | None = None,
+ ) -> tuple[list[dict[str, Any]], str | None, int | None, str | None]:
"""
- List Gmail messages via Composio.
+ List Gmail messages via Composio with pagination support.
Args:
connected_account_id: Composio connected account ID.
entity_id: The entity/user ID that owns the connected account.
query: Gmail search query.
- max_results: Maximum number of messages to return.
+ max_results: Maximum number of messages to return per page (default: 50 to avoid payload size issues).
+ page_token: Optional pagination token for next page.
Returns:
- Tuple of (messages list, error message).
+ Tuple of (messages list, next_page_token, result_size_estimate, error message).
"""
try:
- # Composio uses snake_case for parameters, max is 500
- params = {"max_results": min(max_results, 500)}
+ # Use smaller batch size to avoid 413 payload too large errors
+ # Composio uses snake_case for parameters
+ params = {"max_results": min(max_results, 50)} # Reduced from 500 to 50
if query:
params["query"] = query # Composio uses 'query' not 'q'
+ if page_token:
+ params["page_token"] = page_token
result = await self.execute_tool(
connected_account_id=connected_account_id,
@@ -506,25 +504,38 @@ class ComposioService:
)
if not result.get("success"):
- return [], result.get("error", "Unknown error")
+            return [], None, None, result.get("error", "Unknown error")
data = result.get("data", {})
- logger.info(f"DEBUG: Gmail data type: {type(data)}, keys: {data.keys() if isinstance(data, dict) else 'N/A'}")
- logger.info(f"DEBUG: Gmail full data: {data}")
# Try different possible response structures
messages = []
+ next_token = None
+ result_size_estimate = None
if isinstance(data, dict):
messages = data.get("messages", []) or data.get("data", {}).get("messages", []) or data.get("emails", [])
+ # Check for pagination token in various possible locations
+ next_token = (
+ data.get("nextPageToken")
+ or data.get("next_page_token")
+ or data.get("data", {}).get("nextPageToken")
+ or data.get("data", {}).get("next_page_token")
+ )
+ # Extract resultSizeEstimate if available (Gmail API provides this)
+ result_size_estimate = (
+ data.get("resultSizeEstimate")
+ or data.get("result_size_estimate")
+ or data.get("data", {}).get("resultSizeEstimate")
+ or data.get("data", {}).get("result_size_estimate")
+ )
elif isinstance(data, list):
messages = data
- logger.info(f"DEBUG: Extracted {len(messages)} messages")
- return messages, None
+ return messages, next_token, result_size_estimate, None
except Exception as e:
logger.error(f"Failed to list Gmail messages: {e!s}")
- return [], str(e)
+            return [], None, None, str(e)
async def get_gmail_message_detail(
self, connected_account_id: str, entity_id: str, message_id: str
@@ -603,8 +614,6 @@ class ComposioService:
return [], result.get("error", "Unknown error")
data = result.get("data", {})
- logger.info(f"DEBUG: Calendar data type: {type(data)}, keys: {data.keys() if isinstance(data, dict) else 'N/A'}")
- logger.info(f"DEBUG: Calendar full data: {data}")
# Try different possible response structures
events = []
@@ -613,7 +622,6 @@ class ComposioService:
elif isinstance(data, list):
events = data
- logger.info(f"DEBUG: Extracted {len(events)} calendar events")
return events, None
except Exception as e:
diff --git a/surfsense_backend/app/tasks/composio_indexer.py b/surfsense_backend/app/tasks/composio_indexer.py
index 8762561ee..c9cd74234 100644
--- a/surfsense_backend/app/tasks/composio_indexer.py
+++ b/surfsense_backend/app/tasks/composio_indexer.py
@@ -9,6 +9,7 @@ to avoid circular import issues with the connector_indexers package.
import logging
from datetime import UTC, datetime
+from typing import Any
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
@@ -26,6 +27,7 @@ from app.db import (
from app.services.composio_service import INDEXABLE_TOOLKITS, TOOLKIT_TO_DOCUMENT_TYPE
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
+from app.tasks.connector_indexers.base import calculate_date_range
from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
@@ -75,7 +77,7 @@ async def update_connector_last_indexed(
) -> None:
"""Update the last_indexed_at timestamp for a connector."""
if update_last_indexed:
- connector.last_indexed_at = datetime.now()
+ connector.last_indexed_at = datetime.now(UTC) # Use UTC for timezone consistency
logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}")
@@ -287,6 +289,9 @@ async def _index_composio_google_drive(
await task_logger.log_task_success(
log_entry, success_msg, {"files_count": 0}
)
+ # CRITICAL: Update timestamp even when no files found so Electric SQL syncs and UI shows indexed status
+ await update_connector_last_indexed(session, connector, update_last_indexed)
+ await session.commit()
return 0, None # Return None (not error) when no items found - this is success with 0 items
logger.info(f"Found {len(all_files)} Google Drive files to index via Composio")
@@ -380,6 +385,13 @@ async def _index_composio_google_drive(
existing_document.updated_at = get_current_timestamp()
documents_indexed += 1
+
+ # Batch commit every 10 documents
+ if documents_indexed % 10 == 0:
+ logger.info(
+ f"Committing batch: {documents_indexed} Google Drive files processed so far"
+ )
+ await session.commit()
continue
# Create new document
@@ -425,7 +437,11 @@ async def _index_composio_google_drive(
session.add(document)
documents_indexed += 1
+ # Batch commit every 10 documents
if documents_indexed % 10 == 0:
+ logger.info(
+ f"Committing batch: {documents_indexed} Google Drive files processed so far"
+ )
await session.commit()
except Exception as e:
@@ -433,10 +449,19 @@ async def _index_composio_google_drive(
documents_skipped += 1
continue
- if documents_indexed > 0:
- await update_connector_last_indexed(session, connector, update_last_indexed)
+ # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
+ # This ensures the UI shows "Last indexed" instead of "Never indexed"
+ await update_connector_last_indexed(session, connector, update_last_indexed)
+ # Final commit to ensure all documents are persisted (safety net)
+ # This matches the pattern used in non-Composio Gmail indexer
+ logger.info(
+ f"Final commit: Total {documents_indexed} Google Drive files processed"
+ )
await session.commit()
+ logger.info(
+ "Successfully committed all Composio Google Drive document changes to database"
+ )
await task_logger.log_task_success(
log_entry,
@@ -454,154 +479,89 @@ async def _index_composio_google_drive(
return 0, f"Failed to index Google Drive via Composio: {e!s}"
-async def _index_composio_gmail(
+async def _process_gmail_message_batch(
session: AsyncSession,
- connector,
+ messages: list[dict[str, Any]],
+ composio_connector: ComposioConnector,
connector_id: int,
search_space_id: int,
user_id: str,
- start_date: str | None,
- end_date: str | None,
- task_logger: TaskLoggingService,
- log_entry,
- update_last_indexed: bool = True,
- max_items: int = 1000,
-) -> tuple[int, str]:
- """Index Gmail messages via Composio."""
- try:
- composio_connector = ComposioConnector(session, connector_id)
+ total_documents_indexed: int = 0,
+) -> tuple[int, int]:
+ """
+ Process a batch of Gmail messages and index them.
+
+ Args:
+ total_documents_indexed: Running total of documents indexed so far (for batch commits).
+
+ Returns:
+ Tuple of (documents_indexed, documents_skipped)
+ """
+ documents_indexed = 0
+ documents_skipped = 0
- await task_logger.log_task_progress(
- log_entry,
- f"Fetching Gmail messages via Composio for connector {connector_id}",
- {"stage": "fetching_messages"},
- )
+ for message in messages:
+ try:
+ # Composio uses 'messageId' (camelCase), not 'id'
+ message_id = message.get("messageId", "") or message.get("id", "")
+ if not message_id:
+ documents_skipped += 1
+ continue
- # Build query with date range
- query_parts = []
- if start_date:
- query_parts.append(f"after:{start_date.replace('-', '/')}")
- if end_date:
- query_parts.append(f"before:{end_date.replace('-', '/')}")
- query = " ".join(query_parts)
+ # Composio's GMAIL_FETCH_EMAILS already returns full message content
+ # No need for a separate detail API call
- messages, error = await composio_connector.list_gmail_messages(
- query=query,
- max_results=max_items,
- )
+ # Extract message info from Composio response
+ # Composio structure: messageId, messageText, messageTimestamp, payload.headers, labelIds
+ payload = message.get("payload", {})
+ headers = payload.get("headers", [])
- if error:
- await task_logger.log_task_failure(
- log_entry, f"Failed to fetch Gmail messages: {error}", {}
+ subject = "No Subject"
+ sender = "Unknown Sender"
+ date_str = message.get("messageTimestamp", "Unknown Date")
+
+ for header in headers:
+ name = header.get("name", "").lower()
+ value = header.get("value", "")
+ if name == "subject":
+ subject = value
+ elif name == "from":
+ sender = value
+ elif name == "date":
+ date_str = value
+
+ # Format to markdown using the full message data
+ markdown_content = composio_connector.format_gmail_message_to_markdown(message)
+
+ # Check for empty content (defensive parsing per Composio best practices)
+ if not markdown_content.strip():
+ logger.warning(f"Skipping Gmail message with no content: {subject}")
+ documents_skipped += 1
+ continue
+
+ # Generate unique identifier
+ document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"])
+ unique_identifier_hash = generate_unique_identifier_hash(
+ document_type, f"gmail_{message_id}", search_space_id
)
- return 0, f"Failed to fetch Gmail messages: {error}"
- if not messages:
- success_msg = "No Gmail messages found in the specified date range"
- await task_logger.log_task_success(
- log_entry, success_msg, {"messages_count": 0}
+ content_hash = generate_content_hash(markdown_content, search_space_id)
+
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
)
- return 0, None # Return None (not error) when no items found - this is success with 0 items
- logger.info(f"Found {len(messages)} Gmail messages to index via Composio")
+ # Get label IDs from Composio response
+ label_ids = message.get("labelIds", [])
+ # Extract thread_id if available (for consistency with non-Composio implementation)
+ thread_id = message.get("threadId", "") or message.get("thread_id", "")
- documents_indexed = 0
- documents_skipped = 0
-
- for message in messages:
- try:
- # Composio uses 'messageId' (camelCase), not 'id'
- message_id = message.get("messageId", "") or message.get("id", "")
- if not message_id:
+ if existing_document:
+ if existing_document.content_hash == content_hash:
documents_skipped += 1
continue
- # Composio's GMAIL_FETCH_EMAILS already returns full message content
- # No need for a separate detail API call
-
- # Extract message info from Composio response
- # Composio structure: messageId, messageText, messageTimestamp, payload.headers, labelIds
- payload = message.get("payload", {})
- headers = payload.get("headers", [])
-
- subject = "No Subject"
- sender = "Unknown Sender"
- date_str = message.get("messageTimestamp", "Unknown Date")
-
- for header in headers:
- name = header.get("name", "").lower()
- value = header.get("value", "")
- if name == "subject":
- subject = value
- elif name == "from":
- sender = value
- elif name == "date":
- date_str = value
-
- # Format to markdown using the full message data
- markdown_content = composio_connector.format_gmail_message_to_markdown(message)
-
- # Generate unique identifier
- document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"])
- unique_identifier_hash = generate_unique_identifier_hash(
- document_type, f"gmail_{message_id}", search_space_id
- )
-
- content_hash = generate_content_hash(markdown_content, search_space_id)
-
- existing_document = await check_document_by_unique_identifier(
- session, unique_identifier_hash
- )
-
- # Get label IDs from Composio response
- label_ids = message.get("labelIds", [])
-
- if existing_document:
- if existing_document.content_hash == content_hash:
- documents_skipped += 1
- continue
-
- # Update existing
- user_llm = await get_user_long_context_llm(
- session, user_id, search_space_id
- )
-
- if user_llm:
- document_metadata = {
- "message_id": message_id,
- "subject": subject,
- "sender": sender,
- "document_type": "Gmail Message (Composio)",
- }
- summary_content, summary_embedding = await generate_document_summary(
- markdown_content, user_llm, document_metadata
- )
- else:
- summary_content = f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}"
- summary_embedding = config.embedding_model_instance.embed(summary_content)
-
- chunks = await create_document_chunks(markdown_content)
-
- existing_document.title = f"Gmail: {subject}"
- existing_document.content = summary_content
- existing_document.content_hash = content_hash
- existing_document.embedding = summary_embedding
- existing_document.document_metadata = {
- "message_id": message_id,
- "subject": subject,
- "sender": sender,
- "date": date_str,
- "labels": label_ids,
- "connector_id": connector_id,
- "source": "composio",
- }
- existing_document.chunks = chunks
- existing_document.updated_at = get_current_timestamp()
-
- documents_indexed += 1
- continue
-
- # Create new document
+ # Update existing
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
@@ -609,6 +569,7 @@ async def _index_composio_gmail(
if user_llm:
document_metadata = {
"message_id": message_id,
+ "thread_id": thread_id,
"subject": subject,
"sender": sender,
"document_type": "Gmail Message (Composio)",
@@ -622,53 +583,276 @@ async def _index_composio_gmail(
chunks = await create_document_chunks(markdown_content)
- document = Document(
- search_space_id=search_space_id,
- title=f"Gmail: {subject}",
- document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]),
- document_metadata={
- "message_id": message_id,
- "subject": subject,
- "sender": sender,
- "date": date_str,
- "labels": label_ids,
- "connector_id": connector_id,
- "toolkit_id": "gmail",
- "source": "composio",
- },
- content=summary_content,
- content_hash=content_hash,
- unique_identifier_hash=unique_identifier_hash,
- embedding=summary_embedding,
- chunks=chunks,
- updated_at=get_current_timestamp(),
- )
- session.add(document)
+ existing_document.title = f"Gmail: {subject}"
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = {
+ "message_id": message_id,
+ "thread_id": thread_id,
+ "subject": subject,
+ "sender": sender,
+ "date": date_str,
+ "labels": label_ids,
+ "connector_id": connector_id,
+ "source": "composio",
+ }
+ existing_document.chunks = chunks
+ existing_document.updated_at = get_current_timestamp()
+
documents_indexed += 1
-
- if documents_indexed % 10 == 0:
+
+ # Batch commit every 10 documents
+ current_total = total_documents_indexed + documents_indexed
+ if current_total % 10 == 0:
+ logger.info(
+ f"Committing batch: {current_total} Gmail messages processed so far"
+ )
await session.commit()
-
- except Exception as e:
- logger.error(f"Error processing Gmail message: {e!s}", exc_info=True)
- documents_skipped += 1
continue
- if documents_indexed > 0:
- await update_connector_last_indexed(session, connector, update_last_indexed)
+ # Create new document
+ user_llm = await get_user_long_context_llm(
+ session, user_id, search_space_id
+ )
+ if user_llm:
+ document_metadata = {
+ "message_id": message_id,
+ "thread_id": thread_id,
+ "subject": subject,
+ "sender": sender,
+ "document_type": "Gmail Message (Composio)",
+ }
+ summary_content, summary_embedding = await generate_document_summary(
+ markdown_content, user_llm, document_metadata
+ )
+ else:
+ summary_content = f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}"
+ summary_embedding = config.embedding_model_instance.embed(summary_content)
+
+ chunks = await create_document_chunks(markdown_content)
+
+ document = Document(
+ search_space_id=search_space_id,
+ title=f"Gmail: {subject}",
+ document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]),
+ document_metadata={
+ "message_id": message_id,
+ "thread_id": thread_id,
+ "subject": subject,
+ "sender": sender,
+ "date": date_str,
+ "labels": label_ids,
+ "connector_id": connector_id,
+ "toolkit_id": "gmail",
+ "source": "composio",
+ },
+ content=summary_content,
+ content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
+ embedding=summary_embedding,
+ chunks=chunks,
+ updated_at=get_current_timestamp(),
+ )
+ session.add(document)
+ documents_indexed += 1
+
+ # Batch commit every 10 documents
+ current_total = total_documents_indexed + documents_indexed
+ if current_total % 10 == 0:
+ logger.info(
+ f"Committing batch: {current_total} Gmail messages processed so far"
+ )
+ await session.commit()
+
+ except Exception as e:
+ logger.error(f"Error processing Gmail message: {e!s}", exc_info=True)
+ documents_skipped += 1
+ # Rollback on error to avoid partial state (per Composio best practices)
+ try:
+ await session.rollback()
+ except Exception as rollback_error:
+ logger.error(f"Error during rollback: {rollback_error!s}", exc_info=True)
+ continue
+
+ return documents_indexed, documents_skipped
+
+
+async def _index_composio_gmail(
+ session: AsyncSession,
+ connector,
+ connector_id: int,
+ search_space_id: int,
+ user_id: str,
+ start_date: str | None,
+ end_date: str | None,
+ task_logger: TaskLoggingService,
+ log_entry,
+ update_last_indexed: bool = True,
+ max_items: int = 1000,
+) -> tuple[int, str]:
+ """Index Gmail messages via Composio with pagination and incremental processing."""
+ try:
+ composio_connector = ComposioConnector(session, connector_id)
+
+ # Normalize date values - handle "undefined" strings from frontend
+ if start_date == "undefined" or start_date == "":
+ start_date = None
+ if end_date == "undefined" or end_date == "":
+ end_date = None
+
+ # Calculate date range with defaults (uses last_indexed_at or 365 days back)
+ # This ensures indexing works even when user doesn't specify dates
+ start_date_str, end_date_str = calculate_date_range(
+ connector, start_date, end_date, default_days_back=365
+ )
+
+ # Build query with date range
+ query_parts = []
+ if start_date_str:
+ query_parts.append(f"after:{start_date_str.replace('-', '/')}")
+ if end_date_str:
+ query_parts.append(f"before:{end_date_str.replace('-', '/')}")
+ query = " ".join(query_parts) if query_parts else ""
+
+ logger.info(
+ f"Gmail query for connector {connector_id}: '{query}' "
+ f"(start_date={start_date_str}, end_date={end_date_str})"
+ )
+
+ # Use smaller batch size to avoid 413 payload too large errors
+ batch_size = 50
+ page_token = None
+ total_documents_indexed = 0
+ total_documents_skipped = 0
+ total_messages_fetched = 0
+ result_size_estimate = None # Will be set from first API response
+
+ while total_messages_fetched < max_items:
+ # Calculate how many messages to fetch in this batch
+ remaining = max_items - total_messages_fetched
+ current_batch_size = min(batch_size, remaining)
+
+ # Use result_size_estimate if available, otherwise fall back to max_items
+ estimated_total = result_size_estimate if result_size_estimate is not None else max_items
+ # Cap estimated_total at max_items to avoid showing misleading progress
+ estimated_total = min(estimated_total, max_items)
+
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Fetching Gmail messages batch via Composio for connector {connector_id} "
+ f"({total_messages_fetched}/{estimated_total} fetched, {total_documents_indexed} indexed)",
+ {
+ "stage": "fetching_messages",
+ "batch_size": current_batch_size,
+ "total_fetched": total_messages_fetched,
+ "total_indexed": total_documents_indexed,
+ "estimated_total": estimated_total,
+ },
+ )
+
+ # Fetch batch of messages
+ messages, next_token, result_size_estimate_batch, error = await composio_connector.list_gmail_messages(
+ query=query,
+ max_results=current_batch_size,
+ page_token=page_token,
+ )
+
+ if error:
+ await task_logger.log_task_failure(
+ log_entry, f"Failed to fetch Gmail messages: {error}", {}
+ )
+ return 0, f"Failed to fetch Gmail messages: {error}"
+
+ if not messages:
+ # No more messages available
+ break
+
+ # Update result_size_estimate from first response (Gmail provides this estimate)
+ if result_size_estimate is None and result_size_estimate_batch is not None:
+ result_size_estimate = result_size_estimate_batch
+ logger.info(f"Gmail API estimated {result_size_estimate} total messages for query: '{query}'")
+
+ total_messages_fetched += len(messages)
+ # Recalculate estimated_total after potentially updating result_size_estimate
+ estimated_total = result_size_estimate if result_size_estimate is not None else max_items
+ estimated_total = min(estimated_total, max_items)
+
+ logger.info(
+ f"Fetched batch of {len(messages)} Gmail messages "
+ f"(total: {total_messages_fetched}/{estimated_total})"
+ )
+
+ # Process batch incrementally
+ batch_indexed, batch_skipped = await _process_gmail_message_batch(
+ session=session,
+ messages=messages,
+ composio_connector=composio_connector,
+ connector_id=connector_id,
+ search_space_id=search_space_id,
+ user_id=user_id,
+ total_documents_indexed=total_documents_indexed,
+ )
+
+ total_documents_indexed += batch_indexed
+ total_documents_skipped += batch_skipped
+
+ logger.info(
+ f"Processed batch: {batch_indexed} indexed, {batch_skipped} skipped "
+ f"(total: {total_documents_indexed} indexed, {total_documents_skipped} skipped)"
+ )
+
+ # Batch commits happen in _process_gmail_message_batch every 10 documents
+ # This ensures progress is saved incrementally, preventing data loss on crashes
+
+ # Check if we should continue
+ if not next_token:
+ # No more pages available
+ break
+
+ if len(messages) < current_batch_size:
+ # Last page had fewer items than requested, we're done
+ break
+
+ # Continue with next page
+ page_token = next_token
+
+ if total_messages_fetched == 0:
+ success_msg = "No Gmail messages found in the specified date range"
+ await task_logger.log_task_success(
+ log_entry, success_msg, {"messages_count": 0}
+ )
+ # CRITICAL: Update timestamp even when no messages found so Electric SQL syncs and UI shows indexed status
+ await update_connector_last_indexed(session, connector, update_last_indexed)
+ await session.commit()
+ return 0, None # Return None (not error) when no items found
+
+ # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
+ # This ensures the UI shows "Last indexed" instead of "Never indexed"
+ await update_connector_last_indexed(session, connector, update_last_indexed)
+
+ # Final commit to ensure all documents are persisted (safety net)
+ # This matches the pattern used in non-Composio Gmail indexer
+ logger.info(
+ f"Final commit: Total {total_documents_indexed} Gmail messages processed"
+ )
await session.commit()
+ logger.info(
+ "Successfully committed all Composio Gmail document changes to database"
+ )
await task_logger.log_task_success(
log_entry,
f"Successfully completed Gmail indexing via Composio for connector {connector_id}",
{
- "documents_indexed": documents_indexed,
- "documents_skipped": documents_skipped,
+ "documents_indexed": total_documents_indexed,
+ "documents_skipped": total_documents_skipped,
+ "messages_fetched": total_messages_fetched,
},
)
- return documents_indexed, None
+ return total_documents_indexed, None
except Exception as e:
logger.error(f"Failed to index Gmail via Composio: {e!s}", exc_info=True)
@@ -689,8 +873,6 @@ async def _index_composio_google_calendar(
max_items: int = 2500,
) -> tuple[int, str]:
"""Index Google Calendar events via Composio."""
- from datetime import datetime, timedelta
-
try:
composio_connector = ComposioConnector(session, connector_id)
@@ -700,18 +882,26 @@ async def _index_composio_google_calendar(
{"stage": "fetching_events"},
)
- # Build time range
- if start_date:
- time_min = f"{start_date}T00:00:00Z"
- else:
- # Default to 365 days ago
- default_start = datetime.now() - timedelta(days=365)
- time_min = default_start.strftime("%Y-%m-%dT00:00:00Z")
+ # Normalize date values - handle "undefined" strings from frontend
+ if start_date == "undefined" or start_date == "":
+ start_date = None
+ if end_date == "undefined" or end_date == "":
+ end_date = None
- if end_date:
- time_max = f"{end_date}T23:59:59Z"
- else:
- time_max = datetime.now().strftime("%Y-%m-%dT23:59:59Z")
+ # Calculate date range with defaults (uses last_indexed_at or 365 days back)
+ # This ensures indexing works even when user doesn't specify dates
+ start_date_str, end_date_str = calculate_date_range(
+ connector, start_date, end_date, default_days_back=365
+ )
+
+ # Build time range for API call
+ time_min = f"{start_date_str}T00:00:00Z"
+ time_max = f"{end_date_str}T23:59:59Z"
+
+ logger.info(
+ f"Google Calendar query for connector {connector_id}: "
+ f"(start_date={start_date_str}, end_date={end_date_str})"
+ )
events, error = await composio_connector.list_calendar_events(
time_min=time_min,
@@ -730,6 +920,9 @@ async def _index_composio_google_calendar(
await task_logger.log_task_success(
log_entry, success_msg, {"events_count": 0}
)
+ # CRITICAL: Update timestamp even when no events found so Electric SQL syncs and UI shows indexed status
+ await update_connector_last_indexed(session, connector, update_last_indexed)
+ await session.commit()
return 0, None # Return None (not error) when no items found - this is success with 0 items
logger.info(f"Found {len(events)} Google Calendar events to index via Composio")
@@ -814,6 +1007,13 @@ async def _index_composio_google_calendar(
existing_document.updated_at = get_current_timestamp()
documents_indexed += 1
+
+ # Batch commit every 10 documents
+ if documents_indexed % 10 == 0:
+ logger.info(
+ f"Committing batch: {documents_indexed} Google Calendar events processed so far"
+ )
+ await session.commit()
continue
# Create new document
@@ -863,7 +1063,11 @@ async def _index_composio_google_calendar(
session.add(document)
documents_indexed += 1
+ # Batch commit every 10 documents
if documents_indexed % 10 == 0:
+ logger.info(
+ f"Committing batch: {documents_indexed} Google Calendar events processed so far"
+ )
await session.commit()
except Exception as e:
@@ -871,10 +1075,19 @@ async def _index_composio_google_calendar(
documents_skipped += 1
continue
- if documents_indexed > 0:
- await update_connector_last_indexed(session, connector, update_last_indexed)
+ # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
+ # This ensures the UI shows "Last indexed" instead of "Never indexed"
+ await update_connector_last_indexed(session, connector, update_last_indexed)
+ # Final commit to ensure all documents are persisted (safety net)
+ # This matches the pattern used in non-Composio Gmail indexer
+ logger.info(
+ f"Final commit: Total {documents_indexed} Google Calendar events processed"
+ )
await session.commit()
+ logger.info(
+ "Successfully committed all Composio Google Calendar document changes to database"
+ )
await task_logger.log_task_success(
log_entry,
From e6a4ac7c9cd14c3bcae4bbeb91b7b58abd538b80 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Fri, 23 Jan 2026 04:56:15 +0530
Subject: [PATCH 04/28] fix: change animation from spring to tween for sliding
---
.../components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx | 2 +-
.../components/layout/ui/sidebar/AllSharedChatsSidebar.tsx | 2 +-
surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx
index 39f1b95bc..c094ff44a 100644
--- a/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx
@@ -231,7 +231,7 @@ export function AllPrivateChatsSidebar({
initial={{ x: "-100%" }}
animate={{ x: 0 }}
exit={{ x: "-100%" }}
- transition={{ type: "spring", damping: 25, stiffness: 300 }}
+ transition={{ type: "tween", duration: 0.3, ease: "easeOut" }}
className="fixed inset-y-0 left-0 z-70 w-80 bg-background shadow-xl flex flex-col pointer-events-auto isolate"
role="dialog"
aria-modal="true"
diff --git a/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx
index 8dd593945..76dbf1aad 100644
--- a/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx
@@ -231,7 +231,7 @@ export function AllSharedChatsSidebar({
initial={{ x: "-100%" }}
animate={{ x: 0 }}
exit={{ x: "-100%" }}
- transition={{ type: "spring", damping: 25, stiffness: 300 }}
+ transition={{ type: "tween", duration: 0.3, ease: "easeOut" }}
className="fixed inset-y-0 left-0 z-70 w-80 bg-background shadow-xl flex flex-col pointer-events-auto isolate"
role="dialog"
aria-modal="true"
diff --git a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx
index 166d77eca..a3fd3ea14 100644
--- a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx
@@ -446,7 +446,7 @@ export function InboxSidebar({
initial={{ x: "-100%" }}
animate={{ x: 0 }}
exit={{ x: "-100%" }}
- transition={{ type: "spring", damping: 25, stiffness: 300 }}
+ transition={{ type: "tween", duration: 0.3, ease: "easeOut" }}
className="fixed inset-y-0 left-0 z-70 w-90 bg-background shadow-xl flex flex-col pointer-events-auto isolate"
role="dialog"
aria-modal="true"
From 7ec7ed5c3b6dde85127e8809d7c07c47fe62fd87 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Fri, 23 Jan 2026 05:17:28 +0530
Subject: [PATCH 05/28] feat: enhance Composio Google Drive integration with
folder and file selection
- Added a new endpoint to list folders and files in a user's Composio Google Drive, supporting hierarchical structure.
- Implemented UI components for selecting specific folders and files to index, improving user control over indexing options.
- Introduced indexing options for maximum files per folder and inclusion of subfolders, allowing for customizable indexing behavior.
- Enhanced error handling and logging for Composio Drive operations, ensuring better visibility into issues during file retrieval and indexing.
- Updated the Composio configuration component to reflect new selection capabilities and indexing options.
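As a reference for the subfolder handling, here is a minimal sketch of the depth-bounded recursive traversal used when include_subfolders is enabled; it assumes an async `list_drive_files(folder_id, page_token, page_size)` returning `(files, next_token, error)`, as in the diff below, and is a simplification of `_fetch_folder_files_recursively`, not the shipped code.

```python
# Sketch: depth-bounded recursion over Drive folders via the Composio connector.
FOLDER_MIME = "application/vnd.google-apps.folder"

async def collect_files(connector, folder_id: str, max_files: int = 100,
                        depth: int = 0, max_depth: int = 10) -> list[dict]:
    if depth >= max_depth or max_files <= 0:
        return []  # stop on depth limit or when the file budget is spent
    collected: list[dict] = []
    page_token = None
    while len(collected) < max_files:
        files, page_token, error = await connector.list_drive_files(
            folder_id=folder_id,
            page_token=page_token,
            page_size=min(100, max_files - len(collected)),
        )
        if error:
            break  # skip this folder on errors instead of failing the whole run
        for item in files:
            if item.get("mimeType") == FOLDER_MIME:
                # Recurse into the subfolder with the remaining file budget
                collected.extend(await collect_files(
                    connector, item.get("id"),
                    max_files=max_files - len(collected),
                    depth=depth + 1, max_depth=max_depth,
                ))
            else:
                collected.append(item)
            if len(collected) >= max_files:
                break
        if not page_token:
            break  # no more pages in this folder
    return collected[:max_files]
```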
---
.../app/routes/composio_routes.py | 122 ++++++
.../routes/search_source_connectors_routes.py | 40 +-
.../app/services/composio_service.py | 6 +-
.../app/tasks/composio_indexer.py | 195 +++++++++-
.../components/composio-config.tsx | 294 +++++++++++++-
.../views/connector-edit-view.tsx | 7 +-
.../hooks/use-connector-dialog.ts | 8 +-
.../connectors/composio-drive-folder-tree.tsx | 365 ++++++++++++++++++
.../hooks/use-composio-drive-folders.ts | 29 ++
.../lib/apis/connectors-api.service.ts | 23 ++
surfsense_web/lib/query-client/cache-keys.ts | 4 +
11 files changed, 1069 insertions(+), 24 deletions(-)
create mode 100644 surfsense_web/components/connectors/composio-drive-folder-tree.tsx
create mode 100644 surfsense_web/hooks/use-composio-drive-folders.ts
diff --git a/surfsense_backend/app/routes/composio_routes.py b/surfsense_backend/app/routes/composio_routes.py
index 77891fc88..25e545dfb 100644
--- a/surfsense_backend/app/routes/composio_routes.py
+++ b/surfsense_backend/app/routes/composio_routes.py
@@ -8,6 +8,7 @@ Endpoints:
- GET /composio/toolkits - List available Composio toolkits
- GET /auth/composio/connector/add - Initiate OAuth for a specific toolkit
- GET /auth/composio/connector/callback - Handle OAuth callback
+- GET /connectors/{connector_id}/composio-drive/folders - List folders/files for Composio Google Drive
"""
import asyncio
@@ -369,3 +370,124 @@ async def composio_callback(
raise HTTPException(
status_code=500, detail=f"Failed to complete Composio OAuth: {e!s}"
) from e
+
+
+@router.get("/connectors/{connector_id}/composio-drive/folders")
+async def list_composio_drive_folders(
+ connector_id: int,
+ parent_id: str | None = None,
+ session: AsyncSession = Depends(get_async_session),
+ user: User = Depends(current_active_user),
+):
+ """
+ List folders AND files in user's Google Drive via Composio with hierarchical support.
+
+ This is called at index time from the manage connector page to display
+ the complete file system (folders and files). Only folders are selectable.
+
+ Args:
+ connector_id: ID of the Composio Google Drive connector
+ parent_id: Optional parent folder ID to list contents (None for root)
+
+ Returns:
+ JSON with list of items: {
+ "items": [
+ {"id": str, "name": str, "mimeType": str, "isFolder": bool, ...},
+ ...
+ ]
+ }
+ """
+ if not ComposioService.is_enabled():
+ raise HTTPException(
+ status_code=503,
+ detail="Composio integration is not enabled.",
+ )
+
+ try:
+ # Get connector and verify ownership
+ result = await session.execute(
+ select(SearchSourceConnector).filter(
+ SearchSourceConnector.id == connector_id,
+ SearchSourceConnector.user_id == user.id,
+ SearchSourceConnector.connector_type
+ == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
+ )
+ )
+ connector = result.scalars().first()
+
+ if not connector:
+ raise HTTPException(
+ status_code=404,
+ detail="Composio Google Drive connector not found or access denied",
+ )
+
+ # Get Composio connected account ID from config
+ composio_connected_account_id = connector.config.get("composio_connected_account_id")
+ if not composio_connected_account_id:
+ raise HTTPException(
+ status_code=400,
+ detail="Composio connected account not found. Please reconnect the connector.",
+ )
+
+ # Initialize Composio service and fetch files
+ service = ComposioService()
+ entity_id = f"surfsense_{user.id}"
+
+ # Fetch files/folders from Composio Google Drive
+ files, next_token, error = await service.get_drive_files(
+ connected_account_id=composio_connected_account_id,
+ entity_id=entity_id,
+ folder_id=parent_id,
+ page_size=100,
+ )
+
+ if error:
+ logger.error(f"Failed to list Composio Drive files: {error}")
+ raise HTTPException(
+ status_code=500, detail=f"Failed to list folder contents: {error}"
+ )
+
+ # Transform files to match the expected format with isFolder field
+ items = []
+ for file_info in files:
+ file_id = file_info.get("id", "") or file_info.get("fileId", "")
+ file_name = file_info.get("name", "") or file_info.get("fileName", "") or "Untitled"
+ mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "")
+
+ if not file_id:
+ continue
+
+ is_folder = mime_type == "application/vnd.google-apps.folder"
+
+ items.append({
+ "id": file_id,
+ "name": file_name,
+ "mimeType": mime_type,
+ "isFolder": is_folder,
+ "parents": file_info.get("parents", []),
+ "size": file_info.get("size"),
+ "iconLink": file_info.get("iconLink"),
+ })
+
+ # Sort: folders first, then files, both alphabetically
+ folders = sorted([item for item in items if item["isFolder"]], key=lambda x: x["name"].lower())
+ files_list = sorted([item for item in items if not item["isFolder"]], key=lambda x: x["name"].lower())
+ items = folders + files_list
+
+ folder_count = len(folders)
+ file_count = len(files_list)
+
+ logger.info(
+ f"✅ Listed {len(items)} total items ({folder_count} folders, {file_count} files) for Composio connector {connector_id}"
+ + (f" in folder {parent_id}" if parent_id else " in ROOT")
+ )
+
+ return {"items": items}
+
+ except HTTPException:
+ raise
+ except Exception as e:
+ logger.error(f"Error listing Composio Drive contents: {e!s}", exc_info=True)
+ raise HTTPException(
+ status_code=500, detail=f"Failed to list Drive contents: {e!s}"
+ ) from e
diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py
index 1578ad0d5..89cdd9f95 100644
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@@ -897,8 +897,46 @@ async def index_connector_content(
)
response_message = "Web page indexing started in the background."
+ elif connector.connector_type == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR:
+ from app.tasks.celery_tasks.connector_tasks import (
+ index_composio_connector_task,
+ )
+
+ # For Composio Google Drive, if drive_items is provided, update connector config
+ # This allows the UI to pass folder/file selection like the regular Google Drive connector
+ if drive_items and drive_items.has_items():
+ # Update connector config with the selected folders/files
+ config = connector.config or {}
+ config["selected_folders"] = [{"id": f.id, "name": f.name} for f in drive_items.folders]
+ config["selected_files"] = [{"id": f.id, "name": f.name} for f in drive_items.files]
+ if drive_items.indexing_options:
+ config["indexing_options"] = {
+ "max_files_per_folder": drive_items.indexing_options.max_files_per_folder,
+ "incremental_sync": drive_items.indexing_options.incremental_sync,
+ "include_subfolders": drive_items.indexing_options.include_subfolders,
+ }
+ connector.config = config
+ from sqlalchemy.orm.attributes import flag_modified
+ flag_modified(connector, "config")
+ await session.commit()
+ await session.refresh(connector)
+
+ logger.info(
+ f"Triggering Composio Google Drive indexing for connector {connector_id} into search space {search_space_id}, "
+ f"folders: {len(drive_items.folders)}, files: {len(drive_items.files)}"
+ )
+ else:
+ logger.info(
+ f"Triggering Composio Google Drive indexing for connector {connector_id} into search space {search_space_id} "
+ f"using existing config (from {indexing_from} to {indexing_to})"
+ )
+
+ index_composio_connector_task.delay(
+ connector_id, search_space_id, str(user.id), indexing_from, indexing_to
+ )
+ response_message = "Composio Google Drive indexing started in the background."
+
elif connector.connector_type in [
- SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
]:
diff --git a/surfsense_backend/app/services/composio_service.py b/surfsense_backend/app/services/composio_service.py
index e32cbf8a0..5a6148533 100644
--- a/surfsense_backend/app/services/composio_service.py
+++ b/surfsense_backend/app/services/composio_service.py
@@ -397,7 +397,11 @@ class ComposioService:
"page_size": min(page_size, 100),
}
if folder_id:
- params["folder_id"] = folder_id
+ # List contents of a specific folder (exclude shortcuts - we don't have access to them)
+ params["q"] = f"'{folder_id}' in parents and trashed = false and mimeType != 'application/vnd.google-apps.shortcut'"
+ else:
+ # List root-level items only (My Drive root), exclude shortcuts
+ params["q"] = "'root' in parents and trashed = false and mimeType != 'application/vnd.google-apps.shortcut'"
if page_token:
params["page_token"] = page_token
diff --git a/surfsense_backend/app/tasks/composio_indexer.py b/surfsense_backend/app/tasks/composio_indexer.py
index c9cd74234..f568d4134 100644
--- a/surfsense_backend/app/tasks/composio_indexer.py
+++ b/surfsense_backend/app/tasks/composio_indexer.py
@@ -252,37 +252,123 @@ async def _index_composio_google_drive(
update_last_indexed: bool = True,
max_items: int = 1000,
) -> tuple[int, str]:
- """Index Google Drive files via Composio."""
+ """Index Google Drive files via Composio.
+
+ Supports folder/file selection via connector config:
+ - selected_folders: List of {id, name} for folders to index
+ - selected_files: List of {id, name} for individual files to index
+ - indexing_options: {max_files_per_folder, incremental_sync, include_subfolders}
+ """
try:
composio_connector = ComposioConnector(session, connector_id)
+ connector_config = await composio_connector.get_config()
+
+ # Get folder/file selection configuration
+ selected_folders = connector_config.get("selected_folders", [])
+ selected_files = connector_config.get("selected_files", [])
+ indexing_options = connector_config.get("indexing_options", {})
+
+ max_files_per_folder = indexing_options.get("max_files_per_folder", 100)
+ include_subfolders = indexing_options.get("include_subfolders", True)
await task_logger.log_task_progress(
log_entry,
f"Fetching Google Drive files via Composio for connector {connector_id}",
- {"stage": "fetching_files"},
+ {"stage": "fetching_files", "selected_folders": len(selected_folders), "selected_files": len(selected_files)},
)
- # Fetch files
all_files = []
- page_token = None
- while len(all_files) < max_items:
- files, next_token, error = await composio_connector.list_drive_files(
- page_token=page_token,
- page_size=min(100, max_items - len(all_files)),
- )
+ # If specific folders/files are selected, fetch from those
+ if selected_folders or selected_files:
+ # Fetch files from selected folders
+ for folder in selected_folders:
+ folder_id = folder.get("id")
+ folder_name = folder.get("name", "Unknown")
+
+ if not folder_id:
+ continue
+
+ # Handle special case for "root" folder
+ actual_folder_id = None if folder_id == "root" else folder_id
+
+ logger.info(f"Fetching files from folder: {folder_name} ({folder_id})")
+
+ # Fetch files from this folder
+ folder_files = []
+ page_token = None
+
+ while len(folder_files) < max_files_per_folder:
+ files, next_token, error = await composio_connector.list_drive_files(
+ folder_id=actual_folder_id,
+ page_token=page_token,
+ page_size=min(100, max_files_per_folder - len(folder_files)),
+ )
- if error:
- await task_logger.log_task_failure(
- log_entry, f"Failed to fetch Drive files: {error}", {}
+ if error:
+ logger.warning(f"Failed to fetch files from folder {folder_name}: {error}")
+ break
+
+ # Process files
+ for file_info in files:
+ mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "")
+
+ # If it's a folder and include_subfolders is enabled, recursively fetch
+ if mime_type == "application/vnd.google-apps.folder":
+ if include_subfolders:
+ # Add subfolder files recursively
+ subfolder_files = await _fetch_folder_files_recursively(
+ composio_connector,
+ file_info.get("id"),
+ max_files=max_files_per_folder,
+ current_count=len(folder_files),
+ )
+ folder_files.extend(subfolder_files)
+ else:
+ folder_files.append(file_info)
+
+ if not next_token:
+ break
+ page_token = next_token
+
+ all_files.extend(folder_files[:max_files_per_folder])
+ logger.info(f"Found {len(folder_files)} files in folder {folder_name}")
+
+ # Add specifically selected files
+ for selected_file in selected_files:
+ file_id = selected_file.get("id")
+ file_name = selected_file.get("name", "Unknown")
+
+ if not file_id:
+ continue
+
+ # Add file info (we'll fetch content later during indexing)
+ all_files.append({
+ "id": file_id,
+ "name": file_name,
+ "mimeType": "", # Will be determined later
+ })
+ else:
+ # No selection specified - fetch all files (original behavior)
+ page_token = None
+
+ while len(all_files) < max_items:
+ files, next_token, error = await composio_connector.list_drive_files(
+ page_token=page_token,
+ page_size=min(100, max_items - len(all_files)),
)
- return 0, f"Failed to fetch Drive files: {error}"
- all_files.extend(files)
+ if error:
+ await task_logger.log_task_failure(
+ log_entry, f"Failed to fetch Drive files: {error}", {}
+ )
+ return 0, f"Failed to fetch Drive files: {error}"
- if not next_token:
- break
- page_token = next_token
+ all_files.extend(files)
+
+ if not next_token:
+ break
+ page_token = next_token
if not all_files:
success_msg = "No Google Drive files found"
@@ -479,6 +565,81 @@ async def _index_composio_google_drive(
return 0, f"Failed to index Google Drive via Composio: {e!s}"
+async def _fetch_folder_files_recursively(
+ composio_connector: ComposioConnector,
+ folder_id: str,
+ max_files: int = 100,
+ current_count: int = 0,
+ depth: int = 0,
+ max_depth: int = 10,
+) -> list[dict[str, Any]]:
+ """
+ Recursively fetch files from a Google Drive folder via Composio.
+
+ Args:
+ composio_connector: The Composio connector instance
+ folder_id: Google Drive folder ID
+ max_files: Maximum number of files to fetch
+ current_count: Current number of files already fetched
+ depth: Current recursion depth
+ max_depth: Maximum recursion depth to prevent infinite loops
+
+ Returns:
+ List of file info dictionaries
+ """
+ if depth >= max_depth:
+ logger.warning(f"Max recursion depth reached for folder {folder_id}")
+ return []
+
+ if current_count >= max_files:
+ return []
+
+ all_files = []
+ page_token = None
+
+ try:
+ while len(all_files) + current_count < max_files:
+ files, next_token, error = await composio_connector.list_drive_files(
+ folder_id=folder_id,
+ page_token=page_token,
+ page_size=min(100, max_files - len(all_files) - current_count),
+ )
+
+ if error:
+ logger.warning(f"Error fetching files from subfolder {folder_id}: {error}")
+ break
+
+ for file_info in files:
+ mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "")
+
+ if mime_type == "application/vnd.google-apps.folder":
+ # Recursively fetch from subfolders
+ subfolder_files = await _fetch_folder_files_recursively(
+ composio_connector,
+ file_info.get("id"),
+ max_files=max_files,
+ current_count=current_count + len(all_files),
+ depth=depth + 1,
+ max_depth=max_depth,
+ )
+ all_files.extend(subfolder_files)
+ else:
+ all_files.append(file_info)
+
+ if len(all_files) + current_count >= max_files:
+ break
+
+ if not next_token:
+ break
+ page_token = next_token
+
+ return all_files[:max_files - current_count]
+
+ except Exception as e:
+ logger.error(f"Error in recursive folder fetch: {e!s}")
+ return all_files
+
+
async def _process_gmail_message_batch(
session: AsyncSession,
messages: list[dict[str, Any]],
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx
index a96f906fe..255d0cef4 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-config.tsx
@@ -1,7 +1,20 @@
"use client";
+import { File, FileSpreadsheet, FileText, FolderClosed, Image, Presentation } from "lucide-react";
import type { FC } from "react";
+import { useEffect, useState } from "react";
+import { ComposioDriveFolderTree } from "@/components/connectors/composio-drive-folder-tree";
import { Badge } from "@/components/ui/badge";
+import { Button } from "@/components/ui/button";
+import { Label } from "@/components/ui/label";
+import {
+ Select,
+ SelectContent,
+ SelectItem,
+ SelectTrigger,
+ SelectValue,
+} from "@/components/ui/select";
+import { Switch } from "@/components/ui/switch";
import type { SearchSourceConnector } from "@/contracts/types/connector.types";
import { cn } from "@/lib/utils";
@@ -11,11 +24,134 @@ interface ComposioConfigProps {
onNameChange?: (name: string) => void;
}
-export const ComposioConfig: FC<ComposioConfigProps> = ({ connector }) => {
+interface SelectedFolder {
+ id: string;
+ name: string;
+}
+
+interface IndexingOptions {
+ max_files_per_folder: number;
+ incremental_sync: boolean;
+ include_subfolders: boolean;
+}
+
+const DEFAULT_INDEXING_OPTIONS: IndexingOptions = {
+ max_files_per_folder: 100,
+ incremental_sync: true,
+ include_subfolders: true,
+};
+
+// Helper to get appropriate icon for file type based on file name
+function getFileIconFromName(fileName: string, className: string = "size-3.5 shrink-0") {
+ const lowerName = fileName.toLowerCase();
+ // Spreadsheets
+ if (
+ lowerName.endsWith(".xlsx") ||
+ lowerName.endsWith(".xls") ||
+ lowerName.endsWith(".csv") ||
+ lowerName.includes("spreadsheet")
+ ) {
+ return <FileSpreadsheet className={className} />;
+ }
+ // Presentations
+ if (
+ lowerName.endsWith(".pptx") ||
+ lowerName.endsWith(".ppt") ||
+ lowerName.includes("presentation")
+ ) {
+ return <Presentation className={className} />;
+ }
+ // Documents (word, text only - not PDF)
+ if (
+ lowerName.endsWith(".docx") ||
+ lowerName.endsWith(".doc") ||
+ lowerName.endsWith(".txt") ||
+ lowerName.includes("document") ||
+ lowerName.includes("word") ||
+ lowerName.includes("text")
+ ) {
+ return <FileText className={className} />;
+ }
+ // Images
+ if (
+ lowerName.endsWith(".png") ||
+ lowerName.endsWith(".jpg") ||
+ lowerName.endsWith(".jpeg") ||
+ lowerName.endsWith(".gif") ||
+ lowerName.endsWith(".webp") ||
+ lowerName.endsWith(".svg")
+ ) {
+ return <Image className={className} />;
+ }
+ // Default (including PDF)
+ return <File className={className} />;
+}
+
+export const ComposioConfig: FC<ComposioConfigProps> = ({ connector, onConfigChange }) => {
const toolkitId = connector.config?.toolkit_id as string;
const isIndexable = connector.config?.is_indexable as boolean;
const composioAccountId = connector.config?.composio_connected_account_id as string;
+ // Check if this is a Google Drive Composio connector
+ const isGoogleDrive = toolkitId === "googledrive";
+
+ // Initialize with existing selected folders and files from connector config
+ const existingFolders =
+ (connector.config?.selected_folders as SelectedFolder[] | undefined) || [];
+ const existingFiles = (connector.config?.selected_files as SelectedFolder[] | undefined) || [];
+ const existingIndexingOptions =
+ (connector.config?.indexing_options as IndexingOptions | undefined) || DEFAULT_INDEXING_OPTIONS;
+
+ const [selectedFolders, setSelectedFolders] = useState(existingFolders);
+ const [selectedFiles, setSelectedFiles] = useState(existingFiles);
+ const [showFolderSelector, setShowFolderSelector] = useState(false);
+ const [indexingOptions, setIndexingOptions] = useState(existingIndexingOptions);
+
+ // Update selected folders and files when connector config changes
+ useEffect(() => {
+ const folders = (connector.config?.selected_folders as SelectedFolder[] | undefined) || [];
+ const files = (connector.config?.selected_files as SelectedFolder[] | undefined) || [];
+ const options =
+ (connector.config?.indexing_options as IndexingOptions | undefined) ||
+ DEFAULT_INDEXING_OPTIONS;
+ setSelectedFolders(folders);
+ setSelectedFiles(files);
+ setIndexingOptions(options);
+ }, [connector.config]);
+
+ const updateConfig = (
+ folders: SelectedFolder[],
+ files: SelectedFolder[],
+ options: IndexingOptions
+ ) => {
+ if (onConfigChange) {
+ onConfigChange({
+ ...connector.config,
+ selected_folders: folders,
+ selected_files: files,
+ indexing_options: options,
+ });
+ }
+ };
+
+ const handleSelectFolders = (folders: SelectedFolder[]) => {
+ setSelectedFolders(folders);
+ updateConfig(folders, selectedFiles, indexingOptions);
+ };
+
+ const handleSelectFiles = (files: SelectedFolder[]) => {
+ setSelectedFiles(files);
+ updateConfig(selectedFolders, files, indexingOptions);
+ };
+
+ const handleIndexingOptionChange = (key: keyof IndexingOptions, value: number | boolean) => {
+ const newOptions = { ...indexingOptions, [key]: value };
+ setIndexingOptions(newOptions);
+ updateConfig(selectedFolders, selectedFiles, newOptions);
+ };
+
+ const totalSelected = selectedFolders.length + selectedFiles.length;
+
return (
Manage your connector settings and sync configuration
diff --git a/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx b/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx
index e45888bb1..2067ca9ad 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx
@@ -15,6 +15,7 @@ import { connectorsApiService } from "@/lib/apis/connectors-api.service";
import { cn } from "@/lib/utils";
import { COMPOSIO_CONNECTORS, OAUTH_CONNECTORS } from "../constants/connector-constants";
import { getDocumentCountForConnector } from "../utils/connector-document-mapping";
+import { getConnectorDisplayName } from "./all-connectors-tab";
interface ActiveConnectorsTabProps {
searchQuery: string;
@@ -263,8 +264,8 @@ export const ActiveConnectorsTab: FC = ({
-
- {connector.name}
+
+ {getConnectorDisplayName(connector.name)}
{isIndexing ? (
From 08f16b43d72edff44bcd4621a43cad79a61ed103 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Fri, 23 Jan 2026 20:36:00 +0530
Subject: [PATCH 14/28] feat: enhance Composio connector naming logic and
improve UI focus
- Updated the Composio connector naming logic to dynamically generate user-friendly names based on existing connectors.
- Introduced new utility functions for counting connectors and retrieving base names for specific connector types.
- Enhanced the UI components to improve accessibility and focus management, ensuring a better user experience when interacting with connector dialogs.
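Illustrative sketch only (not part of the patch): the naming rule described above reduces to appending a "(Composio)" suffix and a 1-based counter to the toolkit's base name. The helper below stands in for the count_connectors_of_type / get_base_name_for_type calls used in the diff; note that the diff's count == 0 branch collapses to the same count + 1 formula.

    def build_composio_connector_name(base_name: str, existing_count: int) -> str:
        # "Gmail (Composio) 1" for the first connector of a type, "Gmail (Composio) 2" for the next, ...
        return f"{base_name} (Composio) {existing_count + 1}"

    # build_composio_connector_name("Gmail", 0)        -> "Gmail (Composio) 1"
    # build_composio_connector_name("Google Drive", 1) -> "Google Drive (Composio) 2"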
---
.../app/routes/composio_routes.py | 27 +++++++++++--------
.../app/utils/connector_naming.py | 3 +++
.../assistant-ui/connector-popup.tsx | 2 +-
surfsense_web/components/ui/dialog.tsx | 2 +-
4 files changed, 21 insertions(+), 13 deletions(-)
diff --git a/surfsense_backend/app/routes/composio_routes.py b/surfsense_backend/app/routes/composio_routes.py
index 9e9b59f82..14ef9efcf 100644
--- a/surfsense_backend/app/routes/composio_routes.py
+++ b/surfsense_backend/app/routes/composio_routes.py
@@ -35,7 +35,10 @@ from app.services.composio_service import (
ComposioService,
)
from app.users import current_active_user
-from app.utils.connector_naming import generate_unique_connector_name
+from app.utils.connector_naming import (
+ count_connectors_of_type,
+ get_base_name_for_type,
+)
from app.utils.oauth_security import OAuthStateManager
# Note: We no longer use check_duplicate_connector for Composio connectors because
@@ -343,17 +346,19 @@ async def composio_callback(
)
try:
- # Generate a unique, user-friendly connector name
- # Pass just toolkit_name (without "(Composio)") to avoid redundancy
- base_name = await generate_unique_connector_name(
- session,
- connector_type,
- space_id,
- user_id,
- toolkit_name,
+ # Count existing connectors of this type to determine the number
+ count = await count_connectors_of_type(
+ session, connector_type, space_id, user_id
)
- # Append "(Composio)" suffix for identification
- connector_name = f"{base_name} (Composio)"
+
+ # Generate base name (e.g., "Gmail", "Google Drive")
+ base_name = get_base_name_for_type(connector_type)
+
+ # Format: "Gmail (Composio) 1", "Gmail (Composio) 2", etc.
+ if count == 0:
+ connector_name = f"{base_name} (Composio) 1"
+ else:
+ connector_name = f"{base_name} (Composio) {count + 1}"
db_connector = SearchSourceConnector(
name=connector_name,
diff --git a/surfsense_backend/app/utils/connector_naming.py b/surfsense_backend/app/utils/connector_naming.py
index a2b748a3a..7d3efc001 100644
--- a/surfsense_backend/app/utils/connector_naming.py
+++ b/surfsense_backend/app/utils/connector_naming.py
@@ -28,6 +28,9 @@ BASE_NAME_FOR_TYPE = {
SearchSourceConnectorType.CONFLUENCE_CONNECTOR: "Confluence",
SearchSourceConnectorType.AIRTABLE_CONNECTOR: "Airtable",
SearchSourceConnectorType.MCP_CONNECTOR: "Model Context Protocol (MCP)",
+ SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR: "Gmail",
+ SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: "Google Drive",
+ SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: "Google Calendar",
}
diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx
index 1ec8fad73..e656c06d6 100644
--- a/surfsense_web/components/assistant-ui/connector-popup.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup.tsx
@@ -184,7 +184,7 @@ export const ConnectorIndicator: FC = () => {
)}
-
+ Manage Connectors
{/* YouTube Crawler View - shown when adding YouTube videos */}
{isYouTubeView && searchSpaceId ? (
diff --git a/surfsense_web/components/ui/dialog.tsx b/surfsense_web/components/ui/dialog.tsx
index d04d76520..f3fa856d3 100644
--- a/surfsense_web/components/ui/dialog.tsx
+++ b/surfsense_web/components/ui/dialog.tsx
@@ -38,7 +38,7 @@ const DialogContent = React.forwardRef<
Date: Fri, 23 Jan 2026 10:48:43 -0500
Subject: [PATCH 15/28] Reworded README.md around LLM compatibility (Based on
discussion with Sid)
---
README.md | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index 7f50b924c..0c5f06029 100644
--- a/README.md
+++ b/README.md
@@ -52,8 +52,10 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7
- Interact in Natural Language and get cited answers.
### 📄 **Cited Answers**
- Get Cited answers just like Perplexity.
+### 🧩 **Universal Compatibility**
+- Connect virtually any inference provider via the OpenAI spec and LiteLLM.
### 🔔 **Privacy & Local LLM Support**
-- Works Flawlessly with Ollama local LLMs.
+- Works Flawlessly with local LLMs like vLLM and Ollama.
### 🏠 **Self Hostable**
- Open source and easy to deploy locally.
### 👥 **Team Collaboration with RBAC**
@@ -61,6 +63,7 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7
- Invite team members with customizable roles (Owner, Admin, Editor, Viewer)
- Granular permissions for documents, chats, connectors, and settings
- Share knowledge bases securely within your organization
+- Team chats update in real-time, with "Chat about the chat" in comment threads
### 🎙️ Podcasts
- Blazingly fast podcast generation agent. (Creates a 3-minute podcast in under 20 seconds.)
- Convert your chat conversations into engaging audio content
@@ -237,6 +240,8 @@ Before self-hosting installation, make sure to complete the [prerequisite setup
### **BackEnd**
+- **LiteLLM**: Universal LLM integration supporting 100+ models (OpenAI, Anthropic, Ollama, etc.)
+
- **FastAPI**: Modern, fast web framework for building APIs with Python
- **PostgreSQL with pgvector**: Database with vector search capabilities for similarity searches
@@ -253,8 +258,6 @@ Before self-hosting installation, make sure to complete the [prerequisite setup
- **LangChain**: Framework for developing AI-powered applications.
-- **LiteLLM**: Universal LLM integration supporting 100+ models (OpenAI, Anthropic, Ollama, etc.)
-
- **Rerankers**: Advanced result ranking for improved search relevance
- **Hybrid Search**: Combines vector similarity and full-text search for optimal results using Reciprocal Rank Fusion (RRF)
From d20bb385b5439abc1c1a0dd4e73c275970c68bea Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Fri, 23 Jan 2026 23:03:29 +0530
Subject: [PATCH 16/28] feat: enhance date handling and indexing logic across
connectors
- Added normalization for "undefined" strings to None in date parameters to prevent parsing errors.
- Improved date range validation to ensure start_date is strictly before end_date, adjusting end_date if necessary.
- Updated Google Calendar and Composio connector indexing logic to handle duplicate content more effectively, logging warnings for skipped events.
- Enhanced error handling during final commits to manage integrity errors gracefully.
- Refactored date handling in various connector indexers for consistency and reliability.
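A minimal, self-contained sketch of the two date rules described above (illustrative only, not part of the patch): "undefined" or empty strings coming from the frontend collapse to None, and an end date equal to the start date is pushed forward by one day so the range stays strictly ordered.

    from datetime import datetime, timedelta

    def normalize_date(value: str | None) -> str | None:
        # Treat the literal string "undefined" (and empty strings) from the frontend as "not provided".
        return None if value in ("undefined", "") else value

    def ensure_strict_range(start_date: str, end_date: str) -> tuple[str, str]:
        # If both bounds fall on the same calendar day, extend the end by one day.
        if start_date == end_date:
            end = datetime.strptime(end_date, "%Y-%m-%d") + timedelta(days=1)
            end_date = end.strftime("%Y-%m-%d")
        return start_date, end_date

    # normalize_date("undefined") -> None
    # ensure_strict_range("2026-01-23", "2026-01-23") -> ("2026-01-23", "2026-01-24")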
---
.../app/connectors/google_gmail_connector.py | 7 +++++
.../routes/search_source_connectors_routes.py | 26 ++++++++++++++-----
.../app/tasks/connector_indexers/base.py | 7 +++++
.../google_calendar_indexer.py | 19 ++++++++++++++
.../tasks/connector_indexers/luma_indexer.py | 7 +++++
.../assistant-ui/connector-popup.tsx | 8 +++++-
.../views/connector-edit-view.tsx | 3 +--
.../views/indexing-configuration-view.tsx | 3 +--
.../hooks/use-connector-dialog.ts | 16 ++++++++++--
9 files changed, 83 insertions(+), 13 deletions(-)
diff --git a/surfsense_backend/app/connectors/google_gmail_connector.py b/surfsense_backend/app/connectors/google_gmail_connector.py
index 8c0e4690e..c86a96413 100644
--- a/surfsense_backend/app/connectors/google_gmail_connector.py
+++ b/surfsense_backend/app/connectors/google_gmail_connector.py
@@ -285,6 +285,13 @@ class GoogleGmailConnector:
try:
from datetime import datetime, timedelta
+ # Normalize date values - handle "undefined" strings from frontend
+ # This prevents "time data 'undefined' does not match format" errors
+ if start_date == "undefined" or start_date == "":
+ start_date = None
+ if end_date == "undefined" or end_date == "":
+ end_date = None
+
# Build date query
query_parts = []
diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py
index 82f452c61..928327d9a 100644
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@@ -644,20 +644,30 @@ async def index_connector_content(
# Handle different connector types
response_message = ""
- today_str = datetime.now().strftime("%Y-%m-%d")
+ # Use UTC for consistency with last_indexed_at storage
+ today_str = datetime.now(UTC).strftime("%Y-%m-%d")
# Determine the actual date range to use
if start_date is None:
# Use last_indexed_at or default to 365 days ago
if connector.last_indexed_at:
- today = datetime.now().date()
- if connector.last_indexed_at.date() == today:
+ # Convert last_indexed_at to timezone-naive for comparison (like calculate_date_range does)
+ last_indexed_naive = (
+ connector.last_indexed_at.replace(tzinfo=None)
+ if connector.last_indexed_at.tzinfo
+ else connector.last_indexed_at
+ )
+ # Use UTC for "today" to match how last_indexed_at is stored
+ today_utc = datetime.now(UTC).replace(tzinfo=None).date()
+ last_indexed_date = last_indexed_naive.date()
+
+ if last_indexed_date == today_utc:
# If last indexed today, go back 1 day to ensure we don't miss anything
- indexing_from = (today - timedelta(days=1)).strftime("%Y-%m-%d")
+ indexing_from = (today_utc - timedelta(days=1)).strftime("%Y-%m-%d")
else:
- indexing_from = connector.last_indexed_at.strftime("%Y-%m-%d")
+ indexing_from = last_indexed_naive.strftime("%Y-%m-%d")
else:
- indexing_from = (datetime.now() - timedelta(days=365)).strftime(
+ indexing_from = (datetime.now(UTC).replace(tzinfo=None) - timedelta(days=365)).strftime(
"%Y-%m-%d"
)
else:
@@ -666,6 +676,7 @@ async def index_connector_content(
# For calendar connectors, default to today but allow future dates if explicitly provided
if connector.connector_type in [
SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
+ SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
SearchSourceConnectorType.LUMA_CONNECTOR,
]:
# Default to today if no end_date provided (users can manually select future dates)
@@ -977,6 +988,9 @@ async def index_connector_content(
index_composio_connector_task,
)
+ # For Composio Gmail and Calendar, use the same date calculation logic as normal connectors
+ # This ensures consistent behavior and uses last_indexed_at to reduce API calls
+ # (includes special case: if indexed today, go back 1 day to avoid missing data)
logger.info(
f"Triggering Composio connector indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
)
diff --git a/surfsense_backend/app/tasks/connector_indexers/base.py b/surfsense_backend/app/tasks/connector_indexers/base.py
index b9a99808e..b390937f0 100644
--- a/surfsense_backend/app/tasks/connector_indexers/base.py
+++ b/surfsense_backend/app/tasks/connector_indexers/base.py
@@ -112,6 +112,13 @@ def calculate_date_range(
Returns:
Tuple of (start_date_str, end_date_str)
"""
+ # Normalize "undefined" strings to None (from frontend)
+ # This prevents parsing errors and ensures consistent behavior across all indexers
+ if start_date == "undefined" or start_date == "":
+ start_date = None
+ if end_date == "undefined" or end_date == "":
+ end_date = None
+
if start_date is not None and end_date is not None:
return start_date, end_date
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
index 09bb8de4b..7787560fa 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
@@ -4,6 +4,8 @@ Google Calendar connector indexer.
from datetime import datetime, timedelta
+import pytz
+from dateutil.parser import isoparse
from google.oauth2.credentials import Credentials
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
@@ -205,6 +207,23 @@ async def index_google_calendar_events(
# Use provided dates (including future dates)
start_date_str = start_date
end_date_str = end_date
+
+ # If start_date and end_date are the same, adjust end_date to be one day later
+ # to ensure valid date range (start_date must be strictly before end_date)
+ if start_date_str == end_date_str:
+ # Parse the date and add one day to ensure valid range
+ dt = isoparse(end_date_str)
+ if dt.tzinfo is None:
+ dt = dt.replace(tzinfo=pytz.UTC)
+ else:
+ dt = dt.astimezone(pytz.UTC)
+ # Add one day to end_date to make it strictly after start_date
+ dt_end = dt + timedelta(days=1)
+ end_date_str = dt_end.strftime("%Y-%m-%d")
+ logger.info(
+ f"Adjusted end_date from {end_date} to {end_date_str} "
+ f"to ensure valid date range (start_date must be strictly before end_date)"
+ )
await task_logger.log_task_progress(
log_entry,
diff --git a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
index 91f81ac20..0d7a979be 100644
--- a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
@@ -116,6 +116,13 @@ async def index_luma_events(
luma_client = LumaConnector(api_key=api_key)
+ # Handle 'undefined' string from frontend (treat as None)
+ # This prevents "time data 'undefined' does not match format" errors
+ if start_date == "undefined" or start_date == "":
+ start_date = None
+ if end_date == "undefined" or end_date == "":
+ end_date = None
+
# Calculate date range
# For calendar connectors, allow future dates to index upcoming events
if start_date is None or end_date is None:
diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx
index e656c06d6..68a548409 100644
--- a/surfsense_web/components/assistant-ui/connector-popup.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup.tsx
@@ -259,7 +259,13 @@ export const ConnectorIndicator: FC = () => {
editingConnector.connector_type !== "GOOGLE_DRIVE_CONNECTOR"
? () => {
startIndexing(editingConnector.id);
- handleQuickIndexConnector(editingConnector.id, editingConnector.connector_type, stopIndexing);
+ handleQuickIndexConnector(
+ editingConnector.id,
+ editingConnector.connector_type,
+ stopIndexing,
+ startDate,
+ endDate
+ );
}
: undefined
}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
index 8951336c5..d12264fbd 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
@@ -272,8 +272,7 @@ export const ConnectorEditView: FC = ({
Re-indexing runs in the background
- You can continue using SurfSense while we sync your data. Check the Active tab
- to see progress.
+ You can continue using SurfSense while we sync your data. Check your inbox for updates.
- You can continue using SurfSense while we sync your data. Check the Active tab
- to see progress.
+ You can continue using SurfSense while we sync your data. Check your inbox for updates.
diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts
index 3e9e1d930..1bcbd4263 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts
+++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts
@@ -1400,9 +1400,15 @@ export const useConnectorDialog = () => {
[editingConnector, searchSpaceId, deleteConnector, router, cameFromMCPList]
);
- // Handle quick index (index without date picker, uses backend defaults)
+ // Handle quick index (index with selected date range, or backend defaults if none selected)
const handleQuickIndexConnector = useCallback(
- async (connectorId: number, connectorType?: string, stopIndexing?: (id: number) => void) => {
+ async (
+ connectorId: number,
+ connectorType?: string,
+ stopIndexing?: (id: number) => void,
+ startDate?: Date,
+ endDate?: Date
+ ) => {
if (!searchSpaceId) return;
// Track quick index clicked event
@@ -1411,10 +1417,16 @@ export const useConnectorDialog = () => {
}
try {
+ // Format dates if provided, otherwise pass undefined (backend will use defaults)
+ const startDateStr = startDate ? format(startDate, "yyyy-MM-dd") : undefined;
+ const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined;
+
await indexConnector({
connector_id: connectorId,
queryParams: {
search_space_id: searchSpaceId,
+ start_date: startDateStr,
+ end_date: endDateStr,
},
});
toast.success("Indexing started", {
From c48ba36fa47ccffb10f68a76231ab017321c5dbe Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Fri, 23 Jan 2026 23:36:14 +0530
Subject: [PATCH 17/28] feat: improve indexing logic and duplicate handling in
connectors
- Enhanced Google Calendar and Composio connector indexing to track and log duplicate content, preventing re-indexing of already processed events.
- Implemented robust error handling during final commits to manage integrity errors gracefully, ensuring successful indexing despite potential duplicates.
- Updated notification service to differentiate between actual errors and warnings for duplicate content, improving user feedback.
- Refactored date handling to ensure valid date ranges and adjusted end dates when necessary for better indexing accuracy.
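A minimal sketch of the tolerant final commit described above, assuming an async SQLAlchemy session (illustrative only; the patch itself catches a broader Exception and also matches on "uniqueviolationerror" in the error text):

    from sqlalchemy.exc import IntegrityError
    from sqlalchemy.ext.asyncio import AsyncSession

    async def commit_tolerating_duplicates(session: AsyncSession) -> None:
        # Commit the batch; if another connector has already written a row with the
        # same content_hash, roll back instead of failing the whole indexing task.
        try:
            await session.commit()
        except IntegrityError as exc:
            if "duplicate key value violates unique constraint" in str(exc).lower():
                await session.rollback()
            else:
                raise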
---
.../composio_google_calendar_connector.py | 59 +++++++++++++--
.../routes/search_source_connectors_routes.py | 72 +++++++++++++++----
.../app/services/notification_service.py | 28 ++++++--
.../google_calendar_indexer.py | 49 ++++++++++++-
.../views/connector-edit-view.tsx | 14 ++--
.../hooks/use-connector-dialog.ts | 11 ++-
6 files changed, 198 insertions(+), 35 deletions(-)
diff --git a/surfsense_backend/app/connectors/composio_google_calendar_connector.py b/surfsense_backend/app/connectors/composio_google_calendar_connector.py
index ab8bde53c..3ac235848 100644
--- a/surfsense_backend/app/connectors/composio_google_calendar_connector.py
+++ b/surfsense_backend/app/connectors/composio_google_calendar_connector.py
@@ -18,7 +18,10 @@ from app.db import Document, DocumentType
from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
-from app.tasks.connector_indexers.base import calculate_date_range
+from app.tasks.connector_indexers.base import (
+ calculate_date_range,
+ check_duplicate_document_by_hash,
+)
from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
@@ -256,6 +259,7 @@ async def index_composio_google_calendar(
documents_indexed = 0
documents_skipped = 0
+ duplicate_content_count = 0 # Track events skipped due to duplicate content_hash
for event in events:
try:
@@ -349,7 +353,25 @@ async def index_composio_google_calendar(
logger.info(
f"Committing batch: {documents_indexed} Google Calendar events processed so far"
)
- await session.commit()
+ await session.commit()
+ continue
+
+ # Document doesn't exist by unique_identifier_hash
+ # Check if a document with the same content_hash exists (from standard connector)
+ with session.no_autoflush:
+ duplicate_by_content = await check_duplicate_document_by_hash(
+ session, content_hash
+ )
+
+ if duplicate_by_content:
+ # A document with the same content already exists (likely from standard connector)
+ logger.info(
+ f"Event {summary} already indexed by another connector "
+ f"(existing document ID: {duplicate_by_content.id}, "
+ f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content."
+ )
+ duplicate_content_count += 1
+ documents_skipped += 1
continue
# Create new document
@@ -429,10 +451,28 @@ async def index_composio_google_calendar(
logger.info(
f"Final commit: Total {documents_indexed} Google Calendar events processed"
)
- await session.commit()
- logger.info(
- "Successfully committed all Composio Google Calendar document changes to database"
- )
+ try:
+ await session.commit()
+ logger.info(
+ "Successfully committed all Composio Google Calendar document changes to database"
+ )
+ except Exception as e:
+ # Handle any remaining integrity errors gracefully (race conditions, etc.)
+ if "duplicate key value violates unique constraint" in str(e).lower() or "uniqueviolationerror" in str(e).lower():
+ logger.warning(
+ f"Duplicate content_hash detected during final commit. "
+ f"This may occur if the same event was indexed by multiple connectors. "
+ f"Rolling back and continuing. Error: {e!s}"
+ )
+ await session.rollback()
+ # Don't fail the entire task - some documents may have been successfully indexed
+ else:
+ raise
+
+ # Build warning message if duplicates were found
+ warning_message = None
+ if duplicate_content_count > 0:
+ warning_message = f"{duplicate_content_count} skipped (duplicate)"
await task_logger.log_task_success(
log_entry,
@@ -440,10 +480,15 @@ async def index_composio_google_calendar(
{
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
+ "duplicate_content_count": duplicate_content_count,
},
)
- return documents_indexed, None
+ logger.info(
+ f"Composio Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped "
+ f"({duplicate_content_count} due to duplicate content from other connectors)"
+ )
+ return documents_indexed, warning_message
except Exception as e:
logger.error(
diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py
index 928327d9a..3b98d7d7c 100644
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@@ -22,6 +22,8 @@ import logging
from datetime import UTC, datetime, timedelta
from typing import Any
+import pytz
+from dateutil.parser import isoparse
from fastapi import APIRouter, Body, Depends, HTTPException, Query
from pydantic import BaseModel, Field, ValidationError
from sqlalchemy.exc import IntegrityError
@@ -681,6 +683,22 @@ async def index_connector_content(
]:
# Default to today if no end_date provided (users can manually select future dates)
indexing_to = today_str if end_date is None else end_date
+
+ # If start_date and end_date are the same, adjust end_date to be one day later
+ # to ensure valid date range (start_date must be strictly before end_date)
+ if indexing_from == indexing_to:
+ dt = isoparse(indexing_to)
+ if dt.tzinfo is None:
+ dt = dt.replace(tzinfo=pytz.UTC)
+ else:
+ dt = dt.astimezone(pytz.UTC)
+ # Add one day to end_date to make it strictly after start_date
+ dt_end = dt + timedelta(days=1)
+ indexing_to = dt_end.strftime("%Y-%m-%d")
+ logger.info(
+ f"Adjusted end_date from {end_date} to {indexing_to} "
+ f"to ensure valid date range (start_date must be strictly before end_date)"
+ )
else:
# For non-calendar connectors, cap at today
indexing_to = end_date if end_date else today_str
@@ -1231,20 +1249,48 @@ async def _run_indexing_with_notifications(
else:
# No new documents processed - check if this is an error or just no changes
if error_or_warning:
- # Actual failure
- logger.error(f"Indexing failed: {error_or_warning}")
- if notification:
- # Refresh notification to ensure it's not stale after indexing function commits
- await session.refresh(notification)
- await NotificationService.connector_indexing.notify_indexing_completed(
- session=session,
- notification=notification,
- indexed_count=0,
- error_message=error_or_warning,
+ # Check if this is a duplicate warning (success case) or an actual error
+ # Handle both normal and Composio calendar connectors
+ error_or_warning_lower = str(error_or_warning).lower() if error_or_warning else ""
+ is_duplicate_warning = "skipped (duplicate)" in error_or_warning_lower
+
+ if is_duplicate_warning:
+ # Duplicate warnings are success cases - sync worked, just found duplicates
+ logger.info(
+ f"Indexing completed successfully: {error_or_warning}"
)
- await (
- session.commit()
- ) # Commit to ensure Electric SQL syncs the notification update
+ # Still update timestamp so ElectricSQL syncs and clears "Syncing" UI
+ if update_timestamp_func:
+ await update_timestamp_func(session, connector_id)
+ await session.commit() # Commit timestamp update
+ if notification:
+ # Refresh notification to ensure it's not stale after timestamp update commit
+ await session.refresh(notification)
+ await NotificationService.connector_indexing.notify_indexing_completed(
+ session=session,
+ notification=notification,
+ indexed_count=0,
+ error_message=error_or_warning, # Pass as warning, not error
+ is_warning=True, # Flag to indicate this is a warning, not an error
+ )
+ await (
+ session.commit()
+ ) # Commit to ensure Electric SQL syncs the notification update
+ else:
+ # Actual failure
+ logger.error(f"Indexing failed: {error_or_warning}")
+ if notification:
+ # Refresh notification to ensure it's not stale after indexing function commits
+ await session.refresh(notification)
+ await NotificationService.connector_indexing.notify_indexing_completed(
+ session=session,
+ notification=notification,
+ indexed_count=0,
+ error_message=error_or_warning,
+ )
+ await (
+ session.commit()
+ ) # Commit to ensure Electric SQL syncs the notification update
else:
# Success - just no new documents to index (all skipped/unchanged)
logger.info(
diff --git a/surfsense_backend/app/services/notification_service.py b/surfsense_backend/app/services/notification_service.py
index 836daeb9e..9fcf807e7 100644
--- a/surfsense_backend/app/services/notification_service.py
+++ b/surfsense_backend/app/services/notification_service.py
@@ -335,6 +335,7 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
notification: Notification,
indexed_count: int,
error_message: str | None = None,
+ is_warning: bool = False,
) -> Notification:
"""
Update notification when connector indexing completes.
@@ -343,7 +344,8 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
session: Database session
notification: Notification to update
indexed_count: Total number of items indexed
- error_message: Error message if indexing failed (optional)
+ error_message: Error message if indexing failed, or warning message (optional)
+ is_warning: If True, treat error_message as a warning (success case) rather than an error
Returns:
Updated notification
@@ -352,10 +354,26 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
"connector_name", "Connector"
)
+ # If there's an error message but items were indexed, treat it as a warning (partial success)
+ # If is_warning is True, treat it as success even with 0 items (e.g., duplicates found)
+ # Otherwise, treat it as a failure
if error_message:
- title = f"Failed: {connector_name}"
- message = f"Sync failed: {error_message}"
- status = "failed"
+ if indexed_count > 0:
+ # Partial success with warnings (e.g., duplicate content from other connectors)
+ title = f"Ready: {connector_name}"
+ item_text = "item" if indexed_count == 1 else "items"
+ message = f"Now searchable! {indexed_count} {item_text} synced. Note: {error_message}"
+ status = "completed"
+ elif is_warning:
+ # Warning case (e.g., duplicates found) - treat as success
+ title = f"Ready: {connector_name}"
+ message = f"Sync completed. {error_message}"
+ status = "completed"
+ else:
+ # Complete failure
+ title = f"Failed: {connector_name}"
+ message = f"Sync failed: {error_message}"
+ status = "failed"
else:
title = f"Ready: {connector_name}"
if indexed_count == 0:
@@ -367,7 +385,7 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
metadata_updates = {
"indexed_count": indexed_count,
- "sync_stage": "completed" if not error_message else "failed",
+ "sync_stage": "completed" if (not error_message or is_warning or indexed_count > 0) else "failed",
"error_message": error_message,
}
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
index 7787560fa..5bc805549 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
@@ -23,6 +23,7 @@ from app.utils.document_converters import (
from .base import (
check_document_by_unique_identifier,
+ check_duplicate_document_by_hash,
get_connector_by_id,
get_current_timestamp,
logger,
@@ -289,6 +290,7 @@ async def index_google_calendar_events(
documents_indexed = 0
documents_skipped = 0
skipped_events = []
+ duplicate_content_count = 0 # Track events skipped due to duplicate content_hash
for event in events:
try:
@@ -409,6 +411,27 @@ async def index_google_calendar_events(
)
continue
+ # Document doesn't exist by unique_identifier_hash
+ # Check if a document with the same content_hash exists (from another connector)
+ with session.no_autoflush:
+ duplicate_by_content = await check_duplicate_document_by_hash(
+ session, content_hash
+ )
+
+ if duplicate_by_content:
+ # A document with the same content already exists (likely from Composio connector)
+ logger.info(
+ f"Event {event_summary} already indexed by another connector "
+ f"(existing document ID: {duplicate_by_content.id}, "
+ f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content."
+ )
+ duplicate_content_count += 1
+ documents_skipped += 1
+ skipped_events.append(
+ f"{event_summary} (already indexed by another connector)"
+ )
+ continue
+
# Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
@@ -501,7 +524,25 @@ async def index_google_calendar_events(
logger.info(
f"Final commit: Total {documents_indexed} Google Calendar events processed"
)
- await session.commit()
+ try:
+ await session.commit()
+ except Exception as e:
+ # Handle any remaining integrity errors gracefully (race conditions, etc.)
+ if "duplicate key value violates unique constraint" in str(e).lower() or "uniqueviolationerror" in str(e).lower():
+ logger.warning(
+ f"Duplicate content_hash detected during final commit. "
+ f"This may occur if the same event was indexed by multiple connectors. "
+ f"Rolling back and continuing. Error: {e!s}"
+ )
+ await session.rollback()
+ # Don't fail the entire task - some documents may have been successfully indexed
+ else:
+ raise
+
+ # Build warning message if duplicates were found
+ warning_message = None
+ if duplicate_content_count > 0:
+ warning_message = f"{duplicate_content_count} skipped (duplicate)"
await task_logger.log_task_success(
log_entry,
@@ -510,14 +551,16 @@ async def index_google_calendar_events(
"events_processed": total_processed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
+ "duplicate_content_count": duplicate_content_count,
"skipped_events_count": len(skipped_events),
},
)
logger.info(
- f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped"
+ f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped "
+ f"({duplicate_content_count} due to duplicate content from other connectors)"
)
- return total_processed, None
+ return total_processed, warning_message
except SQLAlchemyError as db_error:
await session.rollback()
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
index d12264fbd..8f58db542 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
@@ -100,10 +100,14 @@ export const ConnectorEditView: FC = ({
// Reset local quick indexing state when indexing completes or fails
useEffect(() => {
- if (!isIndexing) {
- setIsQuickIndexing(false);
+ if (!isIndexing && isQuickIndexing) {
+ // Small delay to ensure smooth transition
+ const timer = setTimeout(() => {
+ setIsQuickIndexing(false);
+ }, 100);
+ return () => clearTimeout(timer);
}
- }, [isIndexing]);
+ }, [isIndexing, isQuickIndexing]);
const handleDisconnectClick = () => {
setShowDisconnectConfirm(true);
@@ -119,11 +123,11 @@ export const ConnectorEditView: FC = ({
};
const handleQuickIndex = useCallback(() => {
- if (onQuickIndex) {
+ if (onQuickIndex && !isQuickIndexing && !isIndexing) {
setIsQuickIndexing(true);
onQuickIndex();
}
- }, [onQuickIndex]);
+ }, [onQuickIndex, isQuickIndexing, isIndexing]);
return (
diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts
index 1bcbd4263..9a7f15b0c 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts
+++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts
@@ -1409,7 +1409,12 @@ export const useConnectorDialog = () => {
startDate?: Date,
endDate?: Date
) => {
- if (!searchSpaceId) return;
+ if (!searchSpaceId) {
+ if (stopIndexing) {
+ stopIndexing(connectorId);
+ }
+ return;
+ }
// Track quick index clicked event
if (connectorType) {
@@ -1437,6 +1442,8 @@ export const useConnectorDialog = () => {
queryClient.invalidateQueries({
queryKey: cacheKeys.logs.summary(Number(searchSpaceId)),
});
+ // Note: Don't call stopIndexing here - let useIndexingConnectors hook
+ // detect when last_indexed_at changes via Electric SQL
} catch (error) {
console.error("Error indexing connector content:", error);
toast.error(error instanceof Error ? error.message : "Failed to start indexing");
@@ -1446,7 +1453,7 @@ export const useConnectorDialog = () => {
}
}
},
- [searchSpaceId, indexConnector]
+ [searchSpaceId, indexConnector, queryClient]
);
// Handle going back from edit view
From a7333853a283e040515188a481a8c8f935861ee6 Mon Sep 17 00:00:00 2001
From: Eric Lammertsma
Date: Fri, 23 Jan 2026 13:14:23 -0500
Subject: [PATCH 18/28] Swapped Inbox and Documents in sidebar
---
.../layout/providers/LayoutDataProvider.tsx | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx
index 52dc7196a..1761c74a1 100644
--- a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx
+++ b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx
@@ -161,12 +161,6 @@ export function LayoutDataProvider({
// Navigation items
const navItems: NavItem[] = useMemo(
() => [
- {
- title: "Documents",
- url: `/dashboard/${searchSpaceId}/documents`,
- icon: SquareLibrary,
- isActive: pathname?.includes("/documents"),
- },
{
title: "Inbox",
url: "#inbox", // Special URL to indicate this is handled differently
@@ -174,6 +168,12 @@ export function LayoutDataProvider({
isActive: isInboxSidebarOpen,
badge: unreadCount > 0 ? (unreadCount > 99 ? "99+" : unreadCount) : undefined,
},
+ {
+ title: "Documents",
+ url: `/dashboard/${searchSpaceId}/documents`,
+ icon: SquareLibrary,
+ isActive: pathname?.includes("/documents"),
+ },
],
[searchSpaceId, pathname, isInboxSidebarOpen, unreadCount]
);
From 417ff58fad6ba8221c1a561a00ec3f44a99a93cc Mon Sep 17 00:00:00 2001
From: Eric Lammertsma
Date: Fri, 23 Jan 2026 13:27:14 -0500
Subject: [PATCH 19/28] Fixed a bug where new chats weren't auto selected when
created. This additionally fixes a bug where the New Chat button wasn't
working properly after creating a new chat
---
.../layout/providers/LayoutDataProvider.tsx | 41 ++++++++++++++++---
1 file changed, 35 insertions(+), 6 deletions(-)
diff --git a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx
index 1761c74a1..37cb468ec 100644
--- a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx
+++ b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx
@@ -1,12 +1,13 @@
"use client";
import { useQuery, useQueryClient } from "@tanstack/react-query";
-import { useAtomValue } from "jotai";
+import { useAtomValue, useSetAtom } from "jotai";
import { Inbox, LogOut, SquareLibrary, Trash2 } from "lucide-react";
import { useParams, usePathname, useRouter } from "next/navigation";
import { useTranslations } from "next-intl";
import { useTheme } from "next-themes";
-import { useCallback, useMemo, useState } from "react";
+import { useCallback, useEffect, useMemo, useState } from "react";
+import { currentThreadAtom, resetCurrentThreadAtom } from "@/atoms/chat/current-thread.atom";
import { deleteSearchSpaceMutationAtom } from "@/atoms/search-spaces/search-space-mutation.atoms";
import { searchSpacesAtom } from "@/atoms/search-spaces/search-space-query.atoms";
import { currentUserAtom } from "@/atoms/user/user-query.atoms";
@@ -55,11 +56,16 @@ export function LayoutDataProvider({
const { data: user } = useAtomValue(currentUserAtom);
const { data: searchSpacesData, refetch: refetchSearchSpaces } = useAtomValue(searchSpacesAtom);
const { mutateAsync: deleteSearchSpace } = useAtomValue(deleteSearchSpaceMutationAtom);
+ const currentThreadState = useAtomValue(currentThreadAtom);
+ const resetCurrentThread = useSetAtom(resetCurrentThreadAtom);
- // Current IDs from URL
+ // State for handling new chat navigation when router is out of sync
+ const [pendingNewChat, setPendingNewChat] = useState(false);
+
+ // Current IDs from URL, with fallback to atom for replaceState updates
const currentChatId = params?.chat_id
? Number(Array.isArray(params.chat_id) ? params.chat_id[0] : params.chat_id)
- : null;
+ : currentThreadState.id;
// Fetch current search space (for caching purposes)
useQuery({
@@ -111,6 +117,17 @@ export function LayoutDataProvider({
const [isDeletingSearchSpace, setIsDeletingSearchSpace] = useState(false);
const [isLeavingSearchSpace, setIsLeavingSearchSpace] = useState(false);
+ // Effect to complete new chat navigation after router syncs
+ // This runs when handleNewChat detected an out-of-sync state and triggered a sync
+ useEffect(() => {
+ if (pendingNewChat && params?.chat_id) {
+ // Router is now synced (chat_id is in params), complete navigation to new-chat
+ resetCurrentThread();
+ router.push(`/dashboard/${searchSpaceId}/new-chat`);
+ setPendingNewChat(false);
+ }
+ }, [pendingNewChat, params?.chat_id, router, searchSpaceId, resetCurrentThread]);
+
const searchSpaces: SearchSpace[] = useMemo(() => {
if (!searchSpacesData || !Array.isArray(searchSpacesData)) return [];
return searchSpacesData.map((space) => ({
@@ -278,8 +295,20 @@ export function LayoutDataProvider({
);
const handleNewChat = useCallback(() => {
- router.push(`/dashboard/${searchSpaceId}/new-chat`);
- }, [router, searchSpaceId]);
+ // Check if router is out of sync (thread created via replaceState but params don't have chat_id)
+ const isOutOfSync = currentThreadState.id !== null && !params?.chat_id;
+
+ if (isOutOfSync) {
+ // First sync Next.js router by navigating to the current chat's actual URL
+ // This updates the router's internal state to match the browser URL
+ router.replace(`/dashboard/${searchSpaceId}/new-chat/${currentThreadState.id}`);
+ // Set flag to trigger navigation to new-chat after params update
+ setPendingNewChat(true);
+ } else {
+ // Normal navigation - router is in sync
+ router.push(`/dashboard/${searchSpaceId}/new-chat`);
+ }
+ }, [router, searchSpaceId, currentThreadState.id, params?.chat_id]);
const handleChatSelect = useCallback(
(chat: ChatItem) => {
From 6d14b49d3f4fb39994be6ba96bc93af3f1031831 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sat, 24 Jan 2026 01:20:51 +0530
Subject: [PATCH 20/28] feat: enhance indexing state management and inbox count
formatting
- Improved indexing state management by refining the logic for handling notifications, ensuring accurate updates for in-progress, completed, and failed states.
- Introduced a new utility function to format inbox counts, displaying numbers up to 999 and using "k+" for larger counts, enhancing user interface clarity.
- Updated sidebar components to utilize the new inbox count formatting, improving the overall user experience.
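The badge rule described above, written out as a short Python rendition for illustration only (the patch implements it as the formatInboxCount TSX helper shown twice in the diff below):

    def format_inbox_count(count: int) -> str:
        # Exact numbers up to 999, then whole thousands rounded down with a "k+" suffix.
        return str(count) if count <= 999 else f"{count // 1000}k+"

    # format_inbox_count(999) -> "999", format_inbox_count(1500) -> "1k+", format_inbox_count(2300) -> "2k+"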
---
.../hooks/use-indexing-connectors.ts | 87 ++++++++++---------
.../layout/providers/LayoutDataProvider.tsx | 13 ++-
.../layout/ui/sidebar/InboxSidebar.tsx | 15 +++-
.../layout/ui/sidebar/NavSection.tsx | 4 +-
4 files changed, 75 insertions(+), 44 deletions(-)
diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts
index e82a8eb29..289da475d 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts
+++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts
@@ -10,8 +10,9 @@ import { isConnectorIndexingMetadata } from "@/contracts/types/inbox.types";
*
* This provides a better UX than polling by:
* 1. Setting indexing state immediately when user triggers indexing (optimistic)
- * 2. Clearing indexing state when Electric SQL detects last_indexed_at changed
- * 3. Clearing indexing state when a failed notification is detected
+ * 2. Detecting in_progress notifications from Electric SQL to restore state after remounts
+ * 3. Clearing indexing state when notifications become completed or failed
+ * 4. Clearing indexing state when Electric SQL detects last_indexed_at changed
*
* The actual `last_indexed_at` value comes from Electric SQL/PGlite, not local state.
*/
@@ -28,65 +29,73 @@ export function useIndexingConnectors(
// Detect when last_indexed_at changes (indexing completed) via Electric SQL
useEffect(() => {
const previousValues = previousLastIndexedAtRef.current;
- const newIndexingIds = new Set(indexingConnectorIds);
- let hasChanges = false;
for (const connector of connectors) {
const previousValue = previousValues.get(connector.id);
const currentValue = connector.last_indexed_at;
- // If last_indexed_at changed and connector was in indexing state, clear it
+ // If last_indexed_at changed, clear it from indexing state
if (
previousValue !== undefined && // We've seen this connector before
- previousValue !== currentValue && // Value changed
- indexingConnectorIds.has(connector.id) // It was marked as indexing
+ previousValue !== currentValue // Value changed
) {
- newIndexingIds.delete(connector.id);
- hasChanges = true;
+ // Use functional update to access current state
+ setIndexingConnectorIds((prev) => {
+ if (prev.has(connector.id)) {
+ const next = new Set(prev);
+ next.delete(connector.id);
+ return next;
+ }
+ return prev;
+ });
}
// Update previous value tracking
previousValues.set(connector.id, currentValue);
}
+ }, [connectors]);
- if (hasChanges) {
- setIndexingConnectorIds(newIndexingIds);
- }
- }, [connectors, indexingConnectorIds]);
-
- // Detect failed notifications and stop indexing state
+ // Detect notification status changes and update indexing state accordingly
+ // This restores spinner state after component remounts and handles all status transitions
useEffect(() => {
if (!inboxItems || inboxItems.length === 0) return;
- const newIndexingIds = new Set(indexingConnectorIds);
- let hasChanges = false;
+ setIndexingConnectorIds((prev) => {
+ const newIndexingIds = new Set(prev);
+ let hasChanges = false;
- for (const item of inboxItems) {
- // Only check connector_indexing notifications
- if (item.type !== "connector_indexing") continue;
+ for (const item of inboxItems) {
+ // Only check connector_indexing notifications
+ if (item.type !== "connector_indexing") continue;
- // Check if this notification indicates a failure
- const metadata = isConnectorIndexingMetadata(item.metadata)
- ? item.metadata
- : null;
- if (!metadata) continue;
+ const metadata = isConnectorIndexingMetadata(item.metadata)
+ ? item.metadata
+ : null;
+ if (!metadata) continue;
- // Check if status is "failed" or if there's an error_message
- const isFailed =
- metadata.status === "failed" ||
- (metadata.error_message && metadata.error_message.trim().length > 0);
-
- // If failed and connector is in indexing state, clear it
- if (isFailed && indexingConnectorIds.has(metadata.connector_id)) {
- newIndexingIds.delete(metadata.connector_id);
- hasChanges = true;
+ // If status is "in_progress", add connector to indexing set
+ if (metadata.status === "in_progress") {
+ if (!newIndexingIds.has(metadata.connector_id)) {
+ newIndexingIds.add(metadata.connector_id);
+ hasChanges = true;
+ }
+ }
+ // If status is "completed" or "failed", remove connector from indexing set
+ else if (
+ metadata.status === "completed" ||
+ metadata.status === "failed" ||
+ (metadata.error_message && metadata.error_message.trim().length > 0)
+ ) {
+ if (newIndexingIds.has(metadata.connector_id)) {
+ newIndexingIds.delete(metadata.connector_id);
+ hasChanges = true;
+ }
+ }
}
- }
- if (hasChanges) {
- setIndexingConnectorIds(newIndexingIds);
- }
- }, [inboxItems, indexingConnectorIds]);
+ return hasChanges ? newIndexingIds : prev;
+ });
+ }, [inboxItems]);
// Add a connector to the indexing set (called when indexing starts)
const startIndexing = useCallback((connectorId: number) => {
diff --git a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx
index 52dc7196a..9e3f55c97 100644
--- a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx
+++ b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx
@@ -38,6 +38,17 @@ interface LayoutDataProviderProps {
breadcrumb?: React.ReactNode;
}
+/**
+ * Format count for display: shows numbers up to 999, then "1k+", "2k+", etc.
+ */
+function formatInboxCount(count: number): string {
+ if (count <= 999) {
+ return count.toString();
+ }
+ const thousands = Math.floor(count / 1000);
+ return `${thousands}k+`;
+}
+
export function LayoutDataProvider({
searchSpaceId,
children,
@@ -172,7 +183,7 @@ export function LayoutDataProvider({
url: "#inbox", // Special URL to indicate this is handled differently
icon: Inbox,
isActive: isInboxSidebarOpen,
- badge: unreadCount > 0 ? (unreadCount > 99 ? "99+" : unreadCount) : undefined,
+ badge: unreadCount > 0 ? formatInboxCount(unreadCount) : undefined,
},
],
[searchSpaceId, pathname, isInboxSidebarOpen, unreadCount]
diff --git a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx
index bb06d6a56..e80c6e62d 100644
--- a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx
@@ -70,6 +70,17 @@ function getInitials(name: string | null | undefined, email: string | null | und
return "U";
}
+/**
+ * Format count for display: shows numbers up to 999, then "1k+", "2k+", etc.
+ */
+function formatInboxCount(count: number): string {
+ if (count <= 999) {
+ return count.toString();
+ }
+ const thousands = Math.floor(count / 1000);
+ return `${thousands}k+`;
+}
+
/**
* Get display name for connector type
*/
@@ -732,7 +743,7 @@ export function InboxSidebar({
{t("mentions") || "Mentions"}
- {unreadMentionsCount}
+ {formatInboxCount(unreadMentionsCount)}
@@ -744,7 +755,7 @@ export function InboxSidebar({
{t("status") || "Status"}
- {unreadStatusCount}
+ {formatInboxCount(unreadStatusCount)}
diff --git a/surfsense_web/components/layout/ui/sidebar/NavSection.tsx b/surfsense_web/components/layout/ui/sidebar/NavSection.tsx
index d2d926de8..742a27bbc 100644
--- a/surfsense_web/components/layout/ui/sidebar/NavSection.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/NavSection.tsx
@@ -39,7 +39,7 @@ export function NavSection({ items, onItemClick, isCollapsed = false }: NavSecti
>
{item.badge && (
-
+
{item.badge}
)}
@@ -70,7 +70,7 @@ export function NavSection({ items, onItemClick, isCollapsed = false }: NavSecti
{item.title}
{item.badge && (
-
+
{item.badge}
)}
From f4b1192a063e71437bb24340342fcee2a69f6a1f Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sat, 24 Jan 2026 03:51:57 +0530
Subject: [PATCH 21/28] feat: refine indexing success case handling and
notification messaging
- Enhanced the logic for determining success cases during indexing by distinguishing between duplicate warnings and empty results.
- Updated notification messages to provide clearer feedback for empty results, improving user understanding of indexing outcomes.
- Ensured that notifications reflect accurate statuses, maintaining consistency in user feedback during the indexing process.
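Condensed to a sketch (illustrative only, not part of the patch), the success/failure decision introduced here is:

    def classify_indexing_outcome(error_or_warning: str | None) -> str:
        # Map the indexer's message to a notification status.
        if not error_or_warning:
            return "completed"
        text = error_or_warning.lower()
        if "skipped (duplicate)" in text:
            return "completed"  # duplicates found, but the sync itself succeeded
        if "no " in text and "found" in text:
            return "completed"  # empty result for the date range is also a success
        return "failed"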
---
.../app/routes/search_source_connectors_routes.py | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py
index 3b98d7d7c..487a689dc 100644
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@@ -1249,13 +1249,15 @@ async def _run_indexing_with_notifications(
else:
# No new documents processed - check if this is an error or just no changes
if error_or_warning:
- # Check if this is a duplicate warning (success case) or an actual error
+ # Check if this is a duplicate warning or empty result (success cases) or an actual error
# Handle both normal and Composio calendar connectors
error_or_warning_lower = str(error_or_warning).lower() if error_or_warning else ""
is_duplicate_warning = "skipped (duplicate)" in error_or_warning_lower
+ # "No X found" messages are success cases - sync worked, just found nothing in date range
+ is_empty_result = ("no " in error_or_warning_lower and "found" in error_or_warning_lower)
- if is_duplicate_warning:
- # Duplicate warnings are success cases - sync worked, just found duplicates
+ if is_duplicate_warning or is_empty_result:
+ # These are success cases - sync worked, just found nothing new
logger.info(
f"Indexing completed successfully: {error_or_warning}"
)
@@ -1266,11 +1268,13 @@ async def _run_indexing_with_notifications(
if notification:
# Refresh notification to ensure it's not stale after timestamp update commit
await session.refresh(notification)
+ # For empty results, use a cleaner message
+ notification_message = "No new items found in date range" if is_empty_result else error_or_warning
await NotificationService.connector_indexing.notify_indexing_completed(
session=session,
notification=notification,
indexed_count=0,
- error_message=error_or_warning, # Pass as warning, not error
+ error_message=notification_message, # Pass as warning, not error
is_warning=True, # Flag to indicate this is a warning, not an error
)
await (
From 5cf6fb15ed9c0f875c584ac4af216d279ae9eb36 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sat, 24 Jan 2026 03:59:17 +0530
Subject: [PATCH 22/28] fix: improve error logging for indexing tasks across
multiple connectors
- Updated error handling in the indexing functions for BookStack, Confluence, Google Calendar, Jira, Linear, and Luma connectors to log specific error messages when failures occur.
- Enhanced logging for cases where no pages or events are found, providing clearer informational messages instead of treating them as critical errors.
- Ensured consistent error reporting across all connector indexers, improving debugging and user feedback during indexing operations.
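
The same pattern is applied in each indexer. A minimal sketch with a generic logger and placeholder messages; the real code uses per-connector wording and also reports failures to the task logger:

    def handle_fetch_error(error: str, logger) -> tuple[int, str | None]:
        """Shared shape of the new error handling (placeholder messages)."""
        if "No pages found" in error:
            # An empty result is informational, not a failure
            logger.info(f"No pages found: {error}")
            logger.info("No pages found is not a critical error, continuing with update")
            return 0, None
        # Only genuine API failures are logged as errors
        logger.error(f"Failed to get pages: {error}")
        return 0, f"Failed to get pages: {error}"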
---
.../app/tasks/connector_indexers/bookstack_indexer.py | 4 ++--
.../app/tasks/connector_indexers/confluence_indexer.py | 4 ++--
.../app/tasks/connector_indexers/google_calendar_indexer.py | 4 ++--
.../app/tasks/connector_indexers/jira_indexer.py | 4 ++--
.../app/tasks/connector_indexers/linear_indexer.py | 4 ++--
.../app/tasks/connector_indexers/luma_indexer.py | 4 ++--
6 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py
index 2793f78db..a1067255d 100644
--- a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py
@@ -136,10 +136,9 @@ async def index_bookstack_pages(
)
if error:
- logger.error(f"Failed to get BookStack pages: {error}")
-
# Don't treat "No pages found" as an error that should stop indexing
if "No pages found" in error:
+ logger.info(f"No BookStack pages found: {error}")
logger.info(
"No pages found is not a critical error, continuing with update"
)
@@ -159,6 +158,7 @@ async def index_bookstack_pages(
)
return 0, None
else:
+ logger.error(f"Failed to get BookStack pages: {error}")
await task_logger.log_task_failure(
log_entry,
f"Failed to get BookStack pages: {error}",
diff --git a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
index 7289b0ccd..ddbefafb9 100644
--- a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
@@ -120,10 +120,9 @@ async def index_confluence_pages(
)
if error:
- logger.error(f"Failed to get Confluence pages: {error}")
-
# Don't treat "No pages found" as an error that should stop indexing
if "No pages found" in error:
+ logger.info(f"No Confluence pages found: {error}")
logger.info(
"No pages found is not a critical error, continuing with update"
)
@@ -147,6 +146,7 @@ async def index_confluence_pages(
await confluence_client.close()
return 0, None
else:
+ logger.error(f"Failed to get Confluence pages: {error}")
await task_logger.log_task_failure(
log_entry,
f"Failed to get Confluence pages: {error}",
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
index 5bc805549..ef1f821d2 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
@@ -243,10 +243,9 @@ async def index_google_calendar_events(
)
if error:
- logger.error(f"Failed to get Google Calendar events: {error}")
-
# Don't treat "No events found" as an error that should stop indexing
if "No events found" in error:
+ logger.info(f"No Google Calendar events found: {error}")
logger.info(
"No events found is not a critical error, continuing with update"
)
@@ -266,6 +265,7 @@ async def index_google_calendar_events(
)
return 0, None
else:
+ logger.error(f"Failed to get Google Calendar events: {error}")
# Check if this is an authentication error that requires re-authentication
error_message = error
error_type = "APIError"
diff --git a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py
index fdbeb93b0..4851a6466 100644
--- a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py
@@ -126,10 +126,9 @@ async def index_jira_issues(
)
if error:
- logger.error(f"Failed to get Jira issues: {error}")
-
# Don't treat "No issues found" as an error that should stop indexing
if "No issues found" in error:
+ logger.info(f"No Jira issues found: {error}")
logger.info(
"No issues found is not a critical error, continuing with update"
)
@@ -149,6 +148,7 @@ async def index_jira_issues(
)
return 0, None
else:
+ logger.error(f"Failed to get Jira issues: {error}")
await task_logger.log_task_failure(
log_entry,
f"Failed to get Jira issues: {error}",
diff --git a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py
index f1bfd42e8..7d8e0c30e 100644
--- a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py
@@ -145,10 +145,9 @@ async def index_linear_issues(
)
if error:
- logger.error(f"Failed to get Linear issues: {error}")
-
# Don't treat "No issues found" as an error that should stop indexing
if "No issues found" in error:
+ logger.info(f"No Linear issues found: {error}")
logger.info(
"No issues found is not a critical error, continuing with update"
)
@@ -162,6 +161,7 @@ async def index_linear_issues(
)
return 0, None
else:
+ logger.error(f"Failed to get Linear issues: {error}")
return 0, f"Failed to get Linear issues: {error}"
logger.info(f"Retrieved {len(issues)} issues from Linear API")
diff --git a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
index 0d7a979be..ead259a44 100644
--- a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
@@ -179,10 +179,9 @@ async def index_luma_events(
)
if error:
- logger.error(f"Failed to get Luma events: {error}")
-
# Don't treat "No events found" as an error that should stop indexing
if "No events found" in error or "no events" in error.lower():
+ logger.info(f"No Luma events found: {error}")
logger.info(
"No events found is not a critical error, continuing with update"
)
@@ -202,6 +201,7 @@ async def index_luma_events(
)
return 0, None
else:
+ logger.error(f"Failed to get Luma events: {error}")
await task_logger.log_task_failure(
log_entry,
f"Failed to get Luma events: {error}",
From 97d7207bd4e76a5c76b1d6ed88a0784ea76f0445 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sat, 24 Jan 2026 04:33:10 +0530
Subject: [PATCH 23/28] fix: update Google Drive indexer to use SQLAlchemy
casting for metadata queries
- Modified the Google Drive indexer to use SQLAlchemy's cast() when querying document metadata, ensuring consistent type handling for file IDs (sketched after this list).
- Improved the consistency of metadata queries across the indexing functions, enhancing reliability in document retrieval and processing.
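
A minimal sketch of the query shape after this change, assuming the same Document model, metadata key, and in-scope session, search_space_id, and file_id used in the diff below:

    from sqlalchemy import String, cast, select

    stmt = select(Document).where(
        Document.search_space_id == search_space_id,
        Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
        # Previously: Document.document_metadata["google_drive_file_id"].astext == file_id
        cast(Document.document_metadata["google_drive_file_id"], String) == file_id,
    )
    existing_document = (await session.execute(stmt)).scalar_one_or_none()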
---
.../app/tasks/connector_indexers/google_drive_indexer.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
index 48282a1af..af180c36b 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
@@ -578,7 +578,7 @@ async def _check_rename_only_update(
- (True, message): Only filename changed, document was updated
- (False, None): Content changed or new file, needs full processing
"""
- from sqlalchemy import select
+ from sqlalchemy import cast, select, String
from sqlalchemy.orm.attributes import flag_modified
from app.db import Document
@@ -603,7 +603,7 @@ async def _check_rename_only_update(
select(Document).where(
Document.search_space_id == search_space_id,
Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
- Document.document_metadata["google_drive_file_id"].astext == file_id,
+ cast(Document.document_metadata["google_drive_file_id"], String) == file_id,
)
)
existing_document = result.scalar_one_or_none()
@@ -755,7 +755,7 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id:
Handles both new (file_id-based) and legacy (filename-based) hash schemes.
"""
- from sqlalchemy import select
+ from sqlalchemy import cast, select, String
from app.db import Document
@@ -774,7 +774,7 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id:
select(Document).where(
Document.search_space_id == search_space_id,
Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
- Document.document_metadata["google_drive_file_id"].astext == file_id,
+ cast(Document.document_metadata["google_drive_file_id"], String) == file_id,
)
)
existing_document = result.scalar_one_or_none()
From a5103da3d74fded873e311108b601d8b36740fce Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sat, 24 Jan 2026 04:36:34 +0530
Subject: [PATCH 24/28] chore: ran linting
---
.../connectors/composio_gmail_connector.py | 1 -
.../composio_google_calendar_connector.py | 14 +-
.../composio_google_drive_connector.py | 83 +++++-----
.../connectors/google_calendar_connector.py | 17 ++-
.../app/connectors/google_gmail_connector.py | 17 ++-
.../app/routes/composio_routes.py | 4 +-
.../routes/search_source_connectors_routes.py | 31 ++--
.../app/services/composio_service.py | 142 ++++++++++--------
.../app/services/notification_service.py | 4 +-
.../google_calendar_indexer.py | 21 ++-
.../google_drive_indexer.py | 10 +-
.../google_gmail_indexer.py | 13 +-
.../assistant-ui/connector-popup.tsx | 5 +-
.../components/composio-calendar-config.tsx | 1 -
.../components/composio-drive-config.tsx | 24 +--
.../components/composio-gmail-config.tsx | 1 -
.../components/google-drive-config.tsx | 10 +-
.../views/connector-edit-view.tsx | 3 +-
.../views/indexing-configuration-view.tsx | 17 ++-
.../hooks/use-connector-dialog.ts | 18 +--
.../hooks/use-indexing-connectors.ts | 4 +-
21 files changed, 259 insertions(+), 181 deletions(-)
diff --git a/surfsense_backend/app/connectors/composio_gmail_connector.py b/surfsense_backend/app/connectors/composio_gmail_connector.py
index 5a9645a66..953e2e8fc 100644
--- a/surfsense_backend/app/connectors/composio_gmail_connector.py
+++ b/surfsense_backend/app/connectors/composio_gmail_connector.py
@@ -611,4 +611,3 @@ async def index_composio_gmail(
except Exception as e:
logger.error(f"Failed to index Gmail via Composio: {e!s}", exc_info=True)
return 0, f"Failed to index Gmail via Composio: {e!s}"
-
diff --git a/surfsense_backend/app/connectors/composio_google_calendar_connector.py b/surfsense_backend/app/connectors/composio_google_calendar_connector.py
index 3ac235848..ec5b22b7f 100644
--- a/surfsense_backend/app/connectors/composio_google_calendar_connector.py
+++ b/surfsense_backend/app/connectors/composio_google_calendar_connector.py
@@ -259,7 +259,9 @@ async def index_composio_google_calendar(
documents_indexed = 0
documents_skipped = 0
- duplicate_content_count = 0 # Track events skipped due to duplicate content_hash
+ duplicate_content_count = (
+ 0 # Track events skipped due to duplicate content_hash
+ )
for event in events:
try:
@@ -353,7 +355,7 @@ async def index_composio_google_calendar(
logger.info(
f"Committing batch: {documents_indexed} Google Calendar events processed so far"
)
- await session.commit( )
+ await session.commit()
continue
# Document doesn't exist by unique_identifier_hash
@@ -362,7 +364,7 @@ async def index_composio_google_calendar(
duplicate_by_content = await check_duplicate_document_by_hash(
session, content_hash
)
-
+
if duplicate_by_content:
# A document with the same content already exists (likely from standard connector)
logger.info(
@@ -458,7 +460,10 @@ async def index_composio_google_calendar(
)
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
- if "duplicate key value violates unique constraint" in str(e).lower() or "uniqueviolationerror" in str(e).lower():
+ if (
+ "duplicate key value violates unique constraint" in str(e).lower()
+ or "uniqueviolationerror" in str(e).lower()
+ ):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"This may occur if the same event was indexed by multiple connectors. "
@@ -495,4 +500,3 @@ async def index_composio_google_calendar(
f"Failed to index Google Calendar via Composio: {e!s}", exc_info=True
)
return 0, f"Failed to index Google Calendar via Composio: {e!s}"
-
diff --git a/surfsense_backend/app/connectors/composio_google_drive_connector.py b/surfsense_backend/app/connectors/composio_google_drive_connector.py
index e19436611..e3b988676 100644
--- a/surfsense_backend/app/connectors/composio_google_drive_connector.py
+++ b/surfsense_backend/app/connectors/composio_google_drive_connector.py
@@ -453,8 +453,8 @@ async def check_document_by_unique_identifier(
session: AsyncSession, unique_identifier_hash: str
) -> Document | None:
"""Check if a document with the given unique identifier hash already exists."""
- from sqlalchemy.orm import selectinload
from sqlalchemy.future import select
+ from sqlalchemy.orm import selectinload
existing_doc_result = await session.execute(
select(Document)
@@ -517,14 +517,20 @@ async def index_composio_google_drive(
# Route to delta sync or full scan
if use_delta_sync:
- logger.info(f"Using delta sync for Composio Google Drive connector {connector_id}")
+ logger.info(
+ f"Using delta sync for Composio Google Drive connector {connector_id}"
+ )
await task_logger.log_task_progress(
log_entry,
f"Starting delta sync for Google Drive via Composio (connector {connector_id})",
{"stage": "delta_sync", "token": stored_page_token[:20] + "..."},
)
- documents_indexed, documents_skipped, processing_errors = await _index_composio_drive_delta_sync(
+ (
+ documents_indexed,
+ documents_skipped,
+ processing_errors,
+ ) = await _index_composio_drive_delta_sync(
session=session,
composio_connector=composio_connector,
connector_id=connector_id,
@@ -536,7 +542,9 @@ async def index_composio_google_drive(
log_entry=log_entry,
)
else:
- logger.info(f"Using full scan for Composio Google Drive connector {connector_id} (first sync or no token)")
+ logger.info(
+ f"Using full scan for Composio Google Drive connector {connector_id} (first sync or no token)"
+ )
await task_logger.log_task_progress(
log_entry,
f"Fetching Google Drive files via Composio for connector {connector_id}",
@@ -547,7 +555,11 @@ async def index_composio_google_drive(
},
)
- documents_indexed, documents_skipped, processing_errors = await _index_composio_drive_full_scan(
+ (
+ documents_indexed,
+ documents_skipped,
+ processing_errors,
+ ) = await _index_composio_drive_full_scan(
session=session,
composio_connector=composio_connector,
connector_id=connector_id,
@@ -580,9 +592,13 @@ async def index_composio_google_drive(
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit
- logger.info(f"Final commit: Total {documents_indexed} Google Drive files processed")
+ logger.info(
+ f"Final commit: Total {documents_indexed} Google Drive files processed"
+ )
await session.commit()
- logger.info("Successfully committed all Composio Google Drive document changes to database")
+ logger.info(
+ "Successfully committed all Composio Google Drive document changes to database"
+ )
# Handle processing errors
error_message = None
@@ -731,7 +747,9 @@ async def _index_composio_drive_delta_sync(
processing_errors.append(error_msg)
documents_skipped += 1
- logger.info(f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped")
+ logger.info(
+ f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped"
+ )
return documents_indexed, documents_skipped, processing_errors
@@ -858,20 +876,18 @@ async def _index_composio_drive_full_scan(
logger.info("No Google Drive files found")
return 0, 0, []
- logger.info(f"Found {len(all_files)} Google Drive files to index via Composio (full scan)")
+ logger.info(
+ f"Found {len(all_files)} Google Drive files to index via Composio (full scan)"
+ )
for file_info in all_files:
try:
# Handle both standard Google API and potential Composio variations
file_id = file_info.get("id", "") or file_info.get("fileId", "")
file_name = (
- file_info.get("name", "")
- or file_info.get("fileName", "")
- or "Untitled"
- )
- mime_type = file_info.get("mimeType", "") or file_info.get(
- "mime_type", ""
+ file_info.get("name", "") or file_info.get("fileName", "") or "Untitled"
)
+ mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "")
if not file_id:
documents_skipped += 1
@@ -901,7 +917,9 @@ async def _index_composio_drive_full_scan(
# Batch commit every 10 documents
if documents_indexed > 0 and documents_indexed % 10 == 0:
- logger.info(f"Committing batch: {documents_indexed} Google Drive files processed so far")
+ logger.info(
+ f"Committing batch: {documents_indexed} Google Drive files processed so far"
+ )
await session.commit()
except Exception as e:
@@ -910,7 +928,9 @@ async def _index_composio_drive_full_scan(
processing_errors.append(error_msg)
documents_skipped += 1
- logger.info(f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped")
+ logger.info(
+ f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped"
+ )
return documents_indexed, documents_skipped, processing_errors
@@ -948,9 +968,7 @@ async def _process_single_drive_file(
content, content_error = await composio_connector.get_drive_file_content(file_id)
if content_error or not content:
- logger.warning(
- f"Could not get content for file {file_name}: {content_error}"
- )
+ logger.warning(f"Could not get content for file {file_name}: {content_error}")
# Use metadata as content fallback
markdown_content = f"# {file_name}\n\n"
markdown_content += f"**File ID:** {file_id}\n"
@@ -985,9 +1003,7 @@ async def _process_single_drive_file(
return 0, 1, processing_errors # Skipped
# Update existing document
- user_llm = await get_user_long_context_llm(
- session, user_id, search_space_id
- )
+ user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
if user_llm:
document_metadata = {
@@ -1003,12 +1019,8 @@ async def _process_single_drive_file(
markdown_content, user_llm, document_metadata
)
else:
- summary_content = (
- f"Google Drive File: {file_name}\n\nType: {mime_type}"
- )
- summary_embedding = config.embedding_model_instance.embed(
- summary_content
- )
+ summary_content = f"Google Drive File: {file_name}\n\nType: {mime_type}"
+ summary_embedding = config.embedding_model_instance.embed(summary_content)
chunks = await create_document_chunks(markdown_content)
@@ -1030,9 +1042,7 @@ async def _process_single_drive_file(
return 1, 0, processing_errors # Indexed
# Create new document
- user_llm = await get_user_long_context_llm(
- session, user_id, search_space_id
- )
+ user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
if user_llm:
document_metadata = {
@@ -1048,12 +1058,8 @@ async def _process_single_drive_file(
markdown_content, user_llm, document_metadata
)
else:
- summary_content = (
- f"Google Drive File: {file_name}\n\nType: {mime_type}"
- )
- summary_embedding = config.embedding_model_instance.embed(
- summary_content
- )
+ summary_content = f"Google Drive File: {file_name}\n\nType: {mime_type}"
+ summary_embedding = config.embedding_model_instance.embed(summary_content)
chunks = await create_document_chunks(markdown_content)
@@ -1159,4 +1165,3 @@ async def _fetch_folder_files_recursively(
except Exception as e:
logger.error(f"Error in recursive folder fetch: {e!s}")
return all_files
-
diff --git a/surfsense_backend/app/connectors/google_calendar_connector.py b/surfsense_backend/app/connectors/google_calendar_connector.py
index ac60b02a8..d8160cf25 100644
--- a/surfsense_backend/app/connectors/google_calendar_connector.py
+++ b/surfsense_backend/app/connectors/google_calendar_connector.py
@@ -144,7 +144,10 @@ class GoogleCalendarConnector:
except Exception as e:
error_str = str(e)
# Check if this is an invalid_grant error (token expired/revoked)
- if "invalid_grant" in error_str.lower() or "token has been expired or revoked" in error_str.lower():
+ if (
+ "invalid_grant" in error_str.lower()
+ or "token has been expired or revoked" in error_str.lower()
+ ):
raise Exception(
"Google Calendar authentication failed. Please re-authenticate."
) from e
@@ -173,7 +176,11 @@ class GoogleCalendarConnector:
except Exception as e:
error_str = str(e)
# If the error already contains a user-friendly re-authentication message, preserve it
- if "re-authenticate" in error_str.lower() or "expired or been revoked" in error_str.lower() or "authentication failed" in error_str.lower():
+ if (
+ "re-authenticate" in error_str.lower()
+ or "expired or been revoked" in error_str.lower()
+ or "authentication failed" in error_str.lower()
+ ):
raise Exception(error_str) from e
raise Exception(f"Failed to create Google Calendar service: {e!s}") from e
@@ -283,7 +290,11 @@ class GoogleCalendarConnector:
except Exception as e:
error_str = str(e)
# If the error already contains a user-friendly re-authentication message, preserve it
- if "re-authenticate" in error_str.lower() or "expired or been revoked" in error_str.lower() or "authentication failed" in error_str.lower():
+ if (
+ "re-authenticate" in error_str.lower()
+ or "expired or been revoked" in error_str.lower()
+ or "authentication failed" in error_str.lower()
+ ):
return [], error_str
return [], f"Error fetching events: {e!s}"
diff --git a/surfsense_backend/app/connectors/google_gmail_connector.py b/surfsense_backend/app/connectors/google_gmail_connector.py
index c86a96413..7c7262bff 100644
--- a/surfsense_backend/app/connectors/google_gmail_connector.py
+++ b/surfsense_backend/app/connectors/google_gmail_connector.py
@@ -143,7 +143,10 @@ class GoogleGmailConnector:
except Exception as e:
error_str = str(e)
# Check if this is an invalid_grant error (token expired/revoked)
- if "invalid_grant" in error_str.lower() or "token has been expired or revoked" in error_str.lower():
+ if (
+ "invalid_grant" in error_str.lower()
+ or "token has been expired or revoked" in error_str.lower()
+ ):
raise Exception(
"Gmail authentication failed. Please re-authenticate."
) from e
@@ -172,7 +175,11 @@ class GoogleGmailConnector:
except Exception as e:
error_str = str(e)
# If the error already contains a user-friendly re-authentication message, preserve it
- if "re-authenticate" in error_str.lower() or "expired or been revoked" in error_str.lower() or "authentication failed" in error_str.lower():
+ if (
+ "re-authenticate" in error_str.lower()
+ or "expired or been revoked" in error_str.lower()
+ or "authentication failed" in error_str.lower()
+ ):
raise Exception(error_str) from e
raise Exception(f"Failed to create Gmail service: {e!s}") from e
@@ -237,7 +244,11 @@ class GoogleGmailConnector:
except Exception as e:
error_str = str(e)
# If the error already contains a user-friendly re-authentication message, preserve it
- if "re-authenticate" in error_str.lower() or "expired or been revoked" in error_str.lower() or "authentication failed" in error_str.lower():
+ if (
+ "re-authenticate" in error_str.lower()
+ or "expired or been revoked" in error_str.lower()
+ or "authentication failed" in error_str.lower()
+ ):
return [], error_str
return [], f"Error fetching messages list: {e!s}"
diff --git a/surfsense_backend/app/routes/composio_routes.py b/surfsense_backend/app/routes/composio_routes.py
index 14ef9efcf..a28361132 100644
--- a/surfsense_backend/app/routes/composio_routes.py
+++ b/surfsense_backend/app/routes/composio_routes.py
@@ -350,10 +350,10 @@ async def composio_callback(
count = await count_connectors_of_type(
session, connector_type, space_id, user_id
)
-
+
# Generate base name (e.g., "Gmail", "Google Drive")
base_name = get_base_name_for_type(connector_type)
-
+
# Format: "Gmail (Composio) 1", "Gmail (Composio) 2", etc.
if count == 0:
connector_name = f"{base_name} (Composio) 1"
diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py
index 487a689dc..191c6f954 100644
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@@ -662,16 +662,16 @@ async def index_connector_content(
# Use UTC for "today" to match how last_indexed_at is stored
today_utc = datetime.now(UTC).replace(tzinfo=None).date()
last_indexed_date = last_indexed_naive.date()
-
+
if last_indexed_date == today_utc:
# If last indexed today, go back 1 day to ensure we don't miss anything
indexing_from = (today_utc - timedelta(days=1)).strftime("%Y-%m-%d")
else:
indexing_from = last_indexed_naive.strftime("%Y-%m-%d")
else:
- indexing_from = (datetime.now(UTC).replace(tzinfo=None) - timedelta(days=365)).strftime(
- "%Y-%m-%d"
- )
+ indexing_from = (
+ datetime.now(UTC).replace(tzinfo=None) - timedelta(days=365)
+ ).strftime("%Y-%m-%d")
else:
indexing_from = start_date
@@ -683,7 +683,7 @@ async def index_connector_content(
]:
# Default to today if no end_date provided (users can manually select future dates)
indexing_to = today_str if end_date is None else end_date
-
+
# If start_date and end_date are the same, adjust end_date to be one day later
# to ensure valid date range (start_date must be strictly before end_date)
if indexing_from == indexing_to:
@@ -1251,16 +1251,19 @@ async def _run_indexing_with_notifications(
if error_or_warning:
# Check if this is a duplicate warning or empty result (success cases) or an actual error
# Handle both normal and Composio calendar connectors
- error_or_warning_lower = str(error_or_warning).lower() if error_or_warning else ""
+ error_or_warning_lower = (
+ str(error_or_warning).lower() if error_or_warning else ""
+ )
is_duplicate_warning = "skipped (duplicate)" in error_or_warning_lower
# "No X found" messages are success cases - sync worked, just found nothing in date range
- is_empty_result = ("no " in error_or_warning_lower and "found" in error_or_warning_lower)
-
+ is_empty_result = (
+ "no " in error_or_warning_lower
+ and "found" in error_or_warning_lower
+ )
+
if is_duplicate_warning or is_empty_result:
# These are success cases - sync worked, just found nothing new
- logger.info(
- f"Indexing completed successfully: {error_or_warning}"
- )
+ logger.info(f"Indexing completed successfully: {error_or_warning}")
# Still update timestamp so ElectricSQL syncs and clears "Syncing" UI
if update_timestamp_func:
await update_timestamp_func(session, connector_id)
@@ -1269,7 +1272,11 @@ async def _run_indexing_with_notifications(
# Refresh notification to ensure it's not stale after timestamp update commit
await session.refresh(notification)
# For empty results, use a cleaner message
- notification_message = "No new items found in date range" if is_empty_result else error_or_warning
+ notification_message = (
+ "No new items found in date range"
+ if is_empty_result
+ else error_or_warning
+ )
await NotificationService.connector_indexing.notify_indexing_completed(
session=session,
notification=notification,
diff --git a/surfsense_backend/app/services/composio_service.py b/surfsense_backend/app/services/composio_service.py
index 3ea2d1bf2..ad7841a8b 100644
--- a/surfsense_backend/app/services/composio_service.py
+++ b/surfsense_backend/app/services/composio_service.py
@@ -81,7 +81,9 @@ class ComposioService:
# Default download directory for files from Composio
DEFAULT_DOWNLOAD_DIR = "/tmp/composio_downloads"
- def __init__(self, api_key: str | None = None, file_download_dir: str | None = None):
+ def __init__(
+ self, api_key: str | None = None, file_download_dir: str | None = None
+ ):
"""
Initialize the Composio service.
@@ -90,18 +92,20 @@ class ComposioService:
file_download_dir: Directory for downloaded files. Defaults to /tmp/composio_downloads.
"""
import os
-
+
self.api_key = api_key or config.COMPOSIO_API_KEY
if not self.api_key:
raise ValueError("COMPOSIO_API_KEY is required but not configured")
-
+
# Set up download directory
self.file_download_dir = file_download_dir or self.DEFAULT_DOWNLOAD_DIR
os.makedirs(self.file_download_dir, exist_ok=True)
-
+
# Initialize Composio client with download directory
# Per docs: file_download_dir configures where files are downloaded
- self.client = Composio(api_key=self.api_key, file_download_dir=self.file_download_dir)
+ self.client = Composio(
+ api_key=self.api_key, file_download_dir=self.file_download_dir
+ )
@staticmethod
def is_enabled() -> bool:
@@ -512,7 +516,7 @@ class ComposioService:
Tuple of (file content bytes, error message).
"""
from pathlib import Path
-
+
try:
result = await self.execute_tool(
connected_account_id=connected_account_id,
@@ -532,35 +536,37 @@ class ComposioService:
# Response structure: {data: {...}, error: ..., successful: ...}
# The actual file info is nested inside data["data"]
file_path = None
-
+
if isinstance(data, dict):
# Handle nested response structure: data contains {data, error, successful}
# The actual file info is in data["data"]
inner_data = data
if "data" in data and isinstance(data["data"], dict):
inner_data = data["data"]
- logger.debug(f"Found nested data structure. Inner keys: {list(inner_data.keys())}")
+ logger.debug(
+ f"Found nested data structure. Inner keys: {list(inner_data.keys())}"
+ )
elif "successful" in data and "data" in data:
# Standard Composio response wrapper
inner_data = data["data"] if data["data"] else data
-
+
# Try documented fields: file_path, downloaded_file_content, path, uri
file_path = (
- inner_data.get("file_path") or
- inner_data.get("downloaded_file_content") or
- inner_data.get("path") or
- inner_data.get("uri")
+ inner_data.get("file_path")
+ or inner_data.get("downloaded_file_content")
+ or inner_data.get("path")
+ or inner_data.get("uri")
)
-
+
# Handle nested dict case where downloaded_file_content contains the path
if isinstance(file_path, dict):
file_path = (
- file_path.get("file_path") or
- file_path.get("downloaded_file_content") or
- file_path.get("path") or
- file_path.get("uri")
+ file_path.get("file_path")
+ or file_path.get("downloaded_file_content")
+ or file_path.get("path")
+ or file_path.get("uri")
)
-
+
# If still no path, check if inner_data itself has the nested structure
if not file_path and isinstance(inner_data, dict):
for key in ["downloaded_file_content", "file_path", "path", "uri"]:
@@ -572,15 +578,17 @@ class ComposioService:
elif isinstance(val, dict):
# One more level of nesting
file_path = (
- val.get("file_path") or
- val.get("downloaded_file_content") or
- val.get("path") or
- val.get("uri")
+ val.get("file_path")
+ or val.get("downloaded_file_content")
+ or val.get("path")
+ or val.get("uri")
)
if file_path:
break
-
- logger.debug(f"Composio response keys: {list(data.keys())}, inner keys: {list(inner_data.keys()) if isinstance(inner_data, dict) else 'N/A'}, extracted path: {file_path}")
+
+ logger.debug(
+ f"Composio response keys: {list(data.keys())}, inner keys: {list(inner_data.keys()) if isinstance(inner_data, dict) else 'N/A'}, extracted path: {file_path}"
+ )
elif isinstance(data, str):
# Direct string response (could be path or content)
file_path = data
@@ -591,24 +599,31 @@ class ComposioService:
# Read file from the path
if file_path and isinstance(file_path, str):
path_obj = Path(file_path)
-
+
# Check if it's a valid file path (absolute or in .composio directory)
- if path_obj.is_absolute() or '.composio' in str(path_obj):
+ if path_obj.is_absolute() or ".composio" in str(path_obj):
try:
if path_obj.exists():
content = path_obj.read_bytes()
- logger.info(f"Successfully read {len(content)} bytes from Composio file: {file_path}")
+ logger.info(
+ f"Successfully read {len(content)} bytes from Composio file: {file_path}"
+ )
return content, None
else:
- logger.warning(f"File path from Composio does not exist: {file_path}")
+ logger.warning(
+ f"File path from Composio does not exist: {file_path}"
+ )
return None, f"File not found at path: {file_path}"
except Exception as e:
- logger.error(f"Failed to read file from Composio path {file_path}: {e!s}")
+ logger.error(
+ f"Failed to read file from Composio path {file_path}: {e!s}"
+ )
return None, f"Failed to read file: {e!s}"
else:
# Not a file path - might be base64 encoded content
try:
import base64
+
content = base64.b64decode(file_path)
return content, None
except Exception:
@@ -625,8 +640,11 @@ class ComposioService:
f"Inner data keys: {list(inner_data.keys()) if isinstance(inner_data, dict) else type(inner_data).__name__}, "
f"Full inner data: {inner_data}"
)
- return None, f"No file path in Composio response. Keys: {list(data.keys())}, inner: {list(inner_data.keys()) if isinstance(inner_data, dict) else 'N/A'}"
-
+ return (
+ None,
+ f"No file path in Composio response. Keys: {list(data.keys())}, inner: {list(inner_data.keys()) if isinstance(inner_data, dict) else 'N/A'}",
+ )
+
return None, f"Unexpected data type from Composio: {type(data).__name__}"
except Exception as e:
@@ -638,14 +656,14 @@ class ComposioService:
) -> tuple[str | None, str | None]:
"""
Get the starting page token for Google Drive change tracking.
-
+
This token represents the current state and is used for future delta syncs.
Per Composio docs: Use GOOGLEDRIVE_GET_CHANGES_START_PAGE_TOKEN to get initial token.
-
+
Args:
connected_account_id: Composio connected account ID.
entity_id: The entity/user ID that owns the connected account.
-
+
Returns:
Tuple of (start_page_token, error message).
"""
@@ -656,27 +674,27 @@ class ComposioService:
params={},
entity_id=entity_id,
)
-
+
if not result.get("success"):
return None, result.get("error", "Unknown error")
-
+
data = result.get("data", {})
# Handle nested response: {data: {startPageToken: ...}, successful: ...}
if isinstance(data, dict):
inner_data = data.get("data", data)
token = (
- inner_data.get("startPageToken") or
- inner_data.get("start_page_token") or
- data.get("startPageToken") or
- data.get("start_page_token")
+ inner_data.get("startPageToken")
+ or inner_data.get("start_page_token")
+ or data.get("startPageToken")
+ or data.get("start_page_token")
)
if token:
logger.info(f"Got Drive start page token: {token}")
return token, None
-
+
logger.warning(f"Could not extract start page token from response: {data}")
return None, "No start page token in response"
-
+
except Exception as e:
logger.error(f"Failed to get Drive start page token: {e!s}")
return None, str(e)
@@ -691,18 +709,18 @@ class ComposioService:
) -> tuple[list[dict[str, Any]], str | None, str | None]:
"""
List changes in Google Drive since the given page token.
-
+
Per Composio docs: GOOGLEDRIVE_LIST_CHANGES tracks modifications to files/folders.
If pageToken is not provided, it auto-fetches the current start page token.
Response includes nextPageToken for pagination and newStartPageToken for future syncs.
-
+
Args:
connected_account_id: Composio connected account ID.
entity_id: The entity/user ID that owns the connected account.
page_token: Page token from previous sync (optional - will auto-fetch if not provided).
page_size: Number of changes per page.
include_removed: Whether to include removed items in the response.
-
+
Returns:
Tuple of (changes list, new_start_page_token, error message).
"""
@@ -713,42 +731,44 @@ class ComposioService:
}
if page_token:
params["pageToken"] = page_token
-
+
result = await self.execute_tool(
connected_account_id=connected_account_id,
tool_name="GOOGLEDRIVE_LIST_CHANGES",
params=params,
entity_id=entity_id,
)
-
+
if not result.get("success"):
return [], None, result.get("error", "Unknown error")
-
+
data = result.get("data", {})
-
+
# Handle nested response structure
changes = []
new_start_token = None
-
+
if isinstance(data, dict):
inner_data = data.get("data", data)
changes = inner_data.get("changes", []) or data.get("changes", [])
-
+
# Get the token for next sync
# newStartPageToken is returned when all changes have been fetched
# nextPageToken is for pagination within the current fetch
new_start_token = (
- inner_data.get("newStartPageToken") or
- inner_data.get("new_start_page_token") or
- inner_data.get("nextPageToken") or
- inner_data.get("next_page_token") or
- data.get("newStartPageToken") or
- data.get("nextPageToken")
+ inner_data.get("newStartPageToken")
+ or inner_data.get("new_start_page_token")
+ or inner_data.get("nextPageToken")
+ or inner_data.get("next_page_token")
+ or data.get("newStartPageToken")
+ or data.get("nextPageToken")
)
-
- logger.info(f"Got {len(changes)} Drive changes, new token: {new_start_token[:20] if new_start_token else 'None'}...")
+
+ logger.info(
+ f"Got {len(changes)} Drive changes, new token: {new_start_token[:20] if new_start_token else 'None'}..."
+ )
return changes, new_start_token, None
-
+
except Exception as e:
logger.error(f"Failed to list Drive changes: {e!s}")
return [], None, str(e)
diff --git a/surfsense_backend/app/services/notification_service.py b/surfsense_backend/app/services/notification_service.py
index 9fcf807e7..04f39d8ef 100644
--- a/surfsense_backend/app/services/notification_service.py
+++ b/surfsense_backend/app/services/notification_service.py
@@ -385,7 +385,9 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
metadata_updates = {
"indexed_count": indexed_count,
- "sync_stage": "completed" if (not error_message or is_warning or indexed_count > 0) else "failed",
+ "sync_stage": "completed"
+ if (not error_message or is_warning or indexed_count > 0)
+ else "failed",
"error_message": error_message,
}
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
index ef1f821d2..2365ff984 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
@@ -208,7 +208,7 @@ async def index_google_calendar_events(
# Use provided dates (including future dates)
start_date_str = start_date
end_date_str = end_date
-
+
# If start_date and end_date are the same, adjust end_date to be one day later
# to ensure valid date range (start_date must be strictly before end_date)
if start_date_str == end_date_str:
@@ -269,10 +269,14 @@ async def index_google_calendar_events(
# Check if this is an authentication error that requires re-authentication
error_message = error
error_type = "APIError"
- if "re-authenticate" in error.lower() or "expired or been revoked" in error.lower() or "authentication failed" in error.lower():
+ if (
+ "re-authenticate" in error.lower()
+ or "expired or been revoked" in error.lower()
+ or "authentication failed" in error.lower()
+ ):
error_message = "Google Calendar authentication failed. Please re-authenticate."
error_type = "AuthenticationError"
-
+
await task_logger.log_task_failure(
log_entry,
error_message,
@@ -290,7 +294,9 @@ async def index_google_calendar_events(
documents_indexed = 0
documents_skipped = 0
skipped_events = []
- duplicate_content_count = 0 # Track events skipped due to duplicate content_hash
+ duplicate_content_count = (
+ 0 # Track events skipped due to duplicate content_hash
+ )
for event in events:
try:
@@ -417,7 +423,7 @@ async def index_google_calendar_events(
duplicate_by_content = await check_duplicate_document_by_hash(
session, content_hash
)
-
+
if duplicate_by_content:
# A document with the same content already exists (likely from Composio connector)
logger.info(
@@ -528,7 +534,10 @@ async def index_google_calendar_events(
await session.commit()
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
- if "duplicate key value violates unique constraint" in str(e).lower() or "uniqueviolationerror" in str(e).lower():
+ if (
+ "duplicate key value violates unique constraint" in str(e).lower()
+ or "uniqueviolationerror" in str(e).lower()
+ ):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"This may occur if the same event was indexed by multiple connectors. "
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
index af180c36b..f50e149d3 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
@@ -578,7 +578,7 @@ async def _check_rename_only_update(
- (True, message): Only filename changed, document was updated
- (False, None): Content changed or new file, needs full processing
"""
- from sqlalchemy import cast, select, String
+ from sqlalchemy import String, cast, select
from sqlalchemy.orm.attributes import flag_modified
from app.db import Document
@@ -603,7 +603,8 @@ async def _check_rename_only_update(
select(Document).where(
Document.search_space_id == search_space_id,
Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
- cast(Document.document_metadata["google_drive_file_id"], String) == file_id,
+ cast(Document.document_metadata["google_drive_file_id"], String)
+ == file_id,
)
)
existing_document = result.scalar_one_or_none()
@@ -755,7 +756,7 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id:
Handles both new (file_id-based) and legacy (filename-based) hash schemes.
"""
- from sqlalchemy import cast, select, String
+ from sqlalchemy import String, cast, select
from app.db import Document
@@ -774,7 +775,8 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id:
select(Document).where(
Document.search_space_id == search_space_id,
Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
- cast(Document.document_metadata["google_drive_file_id"], String) == file_id,
+ cast(Document.document_metadata["google_drive_file_id"], String)
+ == file_id,
)
)
existing_document = result.scalar_one_or_none()
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
index 6a3057437..08d2904d6 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
@@ -173,15 +173,16 @@ async def index_google_gmail_messages(
# Check if this is an authentication error that requires re-authentication
error_message = error
error_type = "APIError"
- if "re-authenticate" in error.lower() or "expired or been revoked" in error.lower() or "authentication failed" in error.lower():
+ if (
+ "re-authenticate" in error.lower()
+ or "expired or been revoked" in error.lower()
+ or "authentication failed" in error.lower()
+ ):
error_message = "Gmail authentication failed. Please re-authenticate."
error_type = "AuthenticationError"
-
+
await task_logger.log_task_failure(
- log_entry,
- error_message,
- error,
- {"error_type": error_type}
+ log_entry, error_message, error, {"error_type": error_type}
)
return 0, error_message
diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx
index 68a548409..293d4a243 100644
--- a/surfsense_web/components/assistant-ui/connector-popup.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup.tsx
@@ -18,7 +18,10 @@ import { ConnectorDialogHeader } from "./connector-popup/components/connector-di
import { ConnectorConnectView } from "./connector-popup/connector-configs/views/connector-connect-view";
import { ConnectorEditView } from "./connector-popup/connector-configs/views/connector-edit-view";
import { IndexingConfigurationView } from "./connector-popup/connector-configs/views/indexing-configuration-view";
-import { COMPOSIO_CONNECTORS, OAUTH_CONNECTORS } from "./connector-popup/constants/connector-constants";
+import {
+ COMPOSIO_CONNECTORS,
+ OAUTH_CONNECTORS,
+} from "./connector-popup/constants/connector-constants";
import { useConnectorDialog } from "./connector-popup/hooks/use-connector-dialog";
import { useIndexingConnectors } from "./connector-popup/hooks/use-indexing-connectors";
import { ActiveConnectorsTab } from "./connector-popup/tabs/active-connectors-tab";
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx
index ce5133a9d..6f282d892 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-calendar-config.tsx
@@ -12,4 +12,3 @@ interface ComposioCalendarConfigProps {
export const ComposioCalendarConfig: FC = () => {
return ;
};
-
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx
index 0ab0869ff..239125565 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx
@@ -1,6 +1,14 @@
"use client";
-import { File, FileSpreadsheet, FileText, FolderClosed, Image, Presentation, X } from "lucide-react";
+import {
+ File,
+ FileSpreadsheet,
+ FileText,
+ FolderClosed,
+ Image,
+ Presentation,
+ X,
+} from "lucide-react";
import type { FC } from "react";
import { useEffect, useState } from "react";
import { ComposioDriveFolderTree } from "@/components/connectors/composio-drive-folder-tree";
@@ -85,7 +93,10 @@ function getFileIconFromName(fileName: string, className: string = "size-3.5 shr
return ;
}
-export const ComposioDriveConfig: FC = ({ connector, onConfigChange }) => {
+export const ComposioDriveConfig: FC = ({
+ connector,
+ onConfigChange,
+}) => {
const isIndexable = connector.config?.is_indexable as boolean;
// Initialize with existing selected folders and files from connector config
@@ -184,9 +195,7 @@ export const ComposioDriveConfig: FC = ({ connector, o
);
}
if (selectedFiles.length > 0) {
- parts.push(
- `${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}`
- );
+ parts.push(`${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}`);
}
return parts.length > 0 ? `(${parts.join(", ")})` : "";
})()}
@@ -329,13 +338,10 @@ export const ComposioDriveConfig: FC = ({ connector, o
- handleIndexingOptionChange("include_subfolders", checked)
- }
+ onCheckedChange={(checked) => handleIndexingOptionChange("include_subfolders", checked)}
/>
);
};
-
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx
index 4664e3e64..494e1362f 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-gmail-config.tsx
@@ -12,4 +12,3 @@ interface ComposioGmailConfigProps {
export const ComposioGmailConfig: FC = () => {
return ;
};
-
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx
index b6cfb39ae..383f6ce0e 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx
@@ -1,6 +1,14 @@
"use client";
-import { File, FileSpreadsheet, FileText, FolderClosed, Image, Presentation, X } from "lucide-react";
+import {
+ File,
+ FileSpreadsheet,
+ FileText,
+ FolderClosed,
+ Image,
+ Presentation,
+ X,
+} from "lucide-react";
import type { FC } from "react";
import { useEffect, useState } from "react";
import { GoogleDriveFolderTree } from "@/components/connectors/google-drive-folder-tree";
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
index 8f58db542..5668d398e 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
@@ -276,7 +276,8 @@ export const ConnectorEditView: FC = ({
Re-indexing runs in the background
- You can continue using SurfSense while we sync your data. Check inbox for updates.
+ You can continue using SurfSense while we sync your data. Check inbox for
+ updates.
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx
index 019e6b37f..684f03252 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx
@@ -170,13 +170,13 @@ export const IndexingConfigurationView: FC = ({
{/* Periodic sync - not shown for Google Drive (regular and Composio) */}
{config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && (
-
- )}
+
+ )}
>
)}
@@ -189,7 +189,8 @@ export const IndexingConfigurationView: FC = ({
Indexing runs in the background
- You can continue using SurfSense while we sync your data. Check inbox for updates.
+ You can continue using SurfSense while we sync your data. Check inbox for
+ updates.
diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts
index 9a7f15b0c..639d0f7ed 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts
+++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts
@@ -328,11 +328,7 @@ export const useConnectorDialog = () => {
return;
}
- if (
- params.success === "true" &&
- searchSpaceId &&
- params.modal === "connectors"
- ) {
+ if (params.success === "true" && searchSpaceId && params.modal === "connectors") {
refetchAllConnectors().then((result) => {
if (!result.data) return;
@@ -346,16 +342,12 @@ export const useConnectorDialog = () => {
if (params.connectorId) {
const connectorId = parseInt(params.connectorId, 10);
newConnector = result.data.find((c: SearchSourceConnector) => c.id === connectorId);
-
+
// If we found the connector, find the matching OAuth/Composio connector by type
if (newConnector) {
oauthConnector =
- OAUTH_CONNECTORS.find(
- (c) => c.connectorType === newConnector!.connector_type
- ) ||
- COMPOSIO_CONNECTORS.find(
- (c) => c.connectorType === newConnector!.connector_type
- );
+ OAUTH_CONNECTORS.find((c) => c.connectorType === newConnector!.connector_type) ||
+ COMPOSIO_CONNECTORS.find((c) => c.connectorType === newConnector!.connector_type);
}
}
@@ -364,7 +356,7 @@ export const useConnectorDialog = () => {
oauthConnector =
OAUTH_CONNECTORS.find((c) => c.id === params.connector) ||
COMPOSIO_CONNECTORS.find((c) => c.id === params.connector);
-
+
if (oauthConnector) {
newConnector = result.data.find(
(c: SearchSourceConnector) => c.connector_type === oauthConnector!.connectorType
diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts
index 289da475d..19741e020 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts
+++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-indexing-connectors.ts
@@ -68,9 +68,7 @@ export function useIndexingConnectors(
// Only check connector_indexing notifications
if (item.type !== "connector_indexing") continue;
- const metadata = isConnectorIndexingMetadata(item.metadata)
- ? item.metadata
- : null;
+ const metadata = isConnectorIndexingMetadata(item.metadata) ? item.metadata : null;
if (!metadata) continue;
// If status is "in_progress", add connector to indexing set
From 3368a65b0c5ab714e7673128bdb4782d31734b63 Mon Sep 17 00:00:00 2001
From: Rohan Verma <122026167+MODSetter@users.noreply.github.com>
Date: Sat, 24 Jan 2026 16:11:27 -0800
Subject: [PATCH 25/28] Change video link in README
Updated video link in README.
---
README.md | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 0c5f06029..4dd368c04 100644
--- a/README.md
+++ b/README.md
@@ -29,8 +29,7 @@ SurfSense is a highly customizable AI research agent, connected to external sour
# Video
-https://github.com/user-attachments/assets/42a29ea1-d4d8-4213-9c69-972b5b806d58
-
+https://github.com/user-attachments/assets/cc0c84d3-1f2f-4f7a-b519-2ecce22310b1
## Podcast Sample
From 20efc63f3003971a0db6c62c1c34cfdbf756cc3c Mon Sep 17 00:00:00 2001
From: "DESKTOP-RTLN3BA\\$punk"
Date: Sat, 24 Jan 2026 17:42:44 -0800
Subject: [PATCH 26/28] feat: implement dynamic connector and document type
discovery for knowledge base tool
- Added functionality to dynamically discover available connectors and document types for the knowledge base tool, enhancing its flexibility and usability.
- Introduced a connector-type-to-searchable-type mapping and updated existing search methods to accommodate Composio connectors (example usage after this list).
- Enhanced error handling and logging for connector discovery processes, ensuring better feedback during failures.
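
Example of how the new mapping is used when building the agent's tools. The names come from the diff below, and the connector list shown in the comment is illustrative only:

    connector_types = await connector_service.get_available_connectors(search_space_id)
    available_connectors = _map_connectors_to_searchable_types(connector_types)
    # Always-available document types come first, then the mapped connector types, e.g.
    # ["EXTENSION", "FILE", "NOTE", "YOUTUBE_VIDEO", "GOOGLE_DRIVE_FILE", "COMPOSIO_GMAIL_CONNECTOR"]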
---
.../app/agents/new_chat/chat_deepagent.py | 112 ++++++
.../app/agents/new_chat/tools/__init__.py | 4 +-
.../agents/new_chat/tools/knowledge_base.py | 306 +++++++++++----
.../app/agents/new_chat/tools/registry.py | 5 +
.../app/services/connector_service.py | 347 ++++++++++++++++++
5 files changed, 708 insertions(+), 66 deletions(-)
diff --git a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
index 5bc6ac2e2..53e1b14bd 100644
--- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py
+++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
@@ -7,6 +7,7 @@ via NewLLMConfig.
"""
from collections.abc import Sequence
+from typing import Any
from deepagents import create_deep_agent
from langchain_core.tools import BaseTool
@@ -23,6 +24,90 @@ from app.agents.new_chat.system_prompt import (
from app.agents.new_chat.tools.registry import build_tools_async
from app.services.connector_service import ConnectorService
+# =============================================================================
+# Connector Type Mapping
+# =============================================================================
+
+# Maps SearchSourceConnectorType enum values to the searchable document/connector types
+# used by the knowledge_base tool. Some connectors map to different document types.
+_CONNECTOR_TYPE_TO_SEARCHABLE: dict[str, str] = {
+ # Direct mappings (connector type == searchable type)
+ "TAVILY_API": "TAVILY_API",
+ "SEARXNG_API": "SEARXNG_API",
+ "LINKUP_API": "LINKUP_API",
+ "BAIDU_SEARCH_API": "BAIDU_SEARCH_API",
+ "SLACK_CONNECTOR": "SLACK_CONNECTOR",
+ "TEAMS_CONNECTOR": "TEAMS_CONNECTOR",
+ "NOTION_CONNECTOR": "NOTION_CONNECTOR",
+ "GITHUB_CONNECTOR": "GITHUB_CONNECTOR",
+ "LINEAR_CONNECTOR": "LINEAR_CONNECTOR",
+ "DISCORD_CONNECTOR": "DISCORD_CONNECTOR",
+ "JIRA_CONNECTOR": "JIRA_CONNECTOR",
+ "CONFLUENCE_CONNECTOR": "CONFLUENCE_CONNECTOR",
+ "CLICKUP_CONNECTOR": "CLICKUP_CONNECTOR",
+ "GOOGLE_CALENDAR_CONNECTOR": "GOOGLE_CALENDAR_CONNECTOR",
+ "GOOGLE_GMAIL_CONNECTOR": "GOOGLE_GMAIL_CONNECTOR",
+ "GOOGLE_DRIVE_CONNECTOR": "GOOGLE_DRIVE_FILE", # Connector type differs from document type
+ "AIRTABLE_CONNECTOR": "AIRTABLE_CONNECTOR",
+ "LUMA_CONNECTOR": "LUMA_CONNECTOR",
+ "ELASTICSEARCH_CONNECTOR": "ELASTICSEARCH_CONNECTOR",
+ "WEBCRAWLER_CONNECTOR": "CRAWLED_URL", # Maps to document type
+ "BOOKSTACK_CONNECTOR": "BOOKSTACK_CONNECTOR",
+ "CIRCLEBACK_CONNECTOR": "CIRCLEBACK", # Connector type differs from document type
+ "OBSIDIAN_CONNECTOR": "OBSIDIAN_CONNECTOR",
+ # Composio connectors
+ "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
+ "COMPOSIO_GMAIL_CONNECTOR": "COMPOSIO_GMAIL_CONNECTOR",
+ "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
+}
+
+# Document types that don't come from SearchSourceConnector but should always be searchable
+_ALWAYS_AVAILABLE_DOC_TYPES: list[str] = [
+ "EXTENSION", # Browser extension data
+ "FILE", # Uploaded files
+ "NOTE", # User notes
+ "YOUTUBE_VIDEO", # YouTube videos
+]
+
+
+def _map_connectors_to_searchable_types(
+ connector_types: list[Any],
+) -> list[str]:
+ """
+ Map SearchSourceConnectorType enums to searchable document/connector types.
+
+ This function:
+ 1. Converts connector type enums to their searchable counterparts
+ 2. Includes always-available document types (EXTENSION, FILE, NOTE, YOUTUBE_VIDEO)
+ 3. Deduplicates while preserving order
+
+ Args:
+ connector_types: List of SearchSourceConnectorType enum values
+
+ Returns:
+ List of searchable connector/document type strings
+ """
+ result_set: set[str] = set()
+ result_list: list[str] = []
+
+ # Add always-available document types first
+ for doc_type in _ALWAYS_AVAILABLE_DOC_TYPES:
+ if doc_type not in result_set:
+ result_set.add(doc_type)
+ result_list.append(doc_type)
+
+ # Map each connector type to its searchable equivalent
+ for ct in connector_types:
+ # Handle both enum and string types
+ ct_str = ct.value if hasattr(ct, "value") else str(ct)
+ searchable = _CONNECTOR_TYPE_TO_SEARCHABLE.get(ct_str)
+ if searchable and searchable not in result_set:
+ result_set.add(searchable)
+ result_list.append(searchable)
+
+ return result_list
+
+
# =============================================================================
# Deep Agent Factory
# =============================================================================
@@ -116,6 +201,30 @@ async def create_surfsense_deep_agent(
additional_tools=[my_custom_tool]
)
"""
+ # Discover available connectors and document types for this search space
+ # This enables dynamic tool docstrings that inform the LLM about what's actually available
+ available_connectors: list[str] | None = None
+ available_document_types: list[str] | None = None
+
+ try:
+ # Get enabled search source connectors for this search space
+ connector_types = await connector_service.get_available_connectors(
+ search_space_id
+ )
+ if connector_types:
+ # Convert enum values to strings and also include mapped document types
+ available_connectors = _map_connectors_to_searchable_types(connector_types)
+
+ # Get document types that have at least one document indexed
+ available_document_types = await connector_service.get_available_document_types(
+ search_space_id
+ )
+ except Exception as e:
+ # Log but don't fail - fall back to all connectors if discovery fails
+ import logging
+
+ logging.warning(f"Failed to discover available connectors/document types: {e}")
+
# Build dependencies dict for the tools registry
dependencies = {
"search_space_id": search_space_id,
@@ -123,6 +232,9 @@ async def create_surfsense_deep_agent(
"connector_service": connector_service,
"firecrawl_api_key": firecrawl_api_key,
"user_id": user_id, # Required for memory tools
+ # Dynamic connector/document type discovery for knowledge base tool
+ "available_connectors": available_connectors,
+ "available_document_types": available_document_types,
}
# Build tools using the async registry (includes MCP tools)
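
To make the mapping concrete, here is a minimal sketch of how the new helper behaves. It passes plain strings, which the helper accepts because it falls back to `str(ct)` when the value has no `.value` attribute; the import path mirrors the file touched above.

```python
# Sanity-check sketch for the mapping helper added in this patch.
# Plain strings work here: the helper only reads .value when it exists.
from app.agents.new_chat.chat_deepagent import _map_connectors_to_searchable_types

searchable = _map_connectors_to_searchable_types(
    ["GOOGLE_DRIVE_CONNECTOR", "WEBCRAWLER_CONNECTOR", "COMPOSIO_GMAIL_CONNECTOR"]
)
print(searchable)
# ['EXTENSION', 'FILE', 'NOTE', 'YOUTUBE_VIDEO',
#  'GOOGLE_DRIVE_FILE', 'CRAWLED_URL', 'COMPOSIO_GMAIL_CONNECTOR']
```

The always-available document types are emitted first, then each connector type is translated to its searchable counterpart with duplicates dropped.
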
diff --git a/surfsense_backend/app/agents/new_chat/tools/__init__.py b/surfsense_backend/app/agents/new_chat/tools/__init__.py
index acbdbcb3a..9e1a4f19c 100644
--- a/surfsense_backend/app/agents/new_chat/tools/__init__.py
+++ b/surfsense_backend/app/agents/new_chat/tools/__init__.py
@@ -19,6 +19,7 @@ Available tools:
# Tool factory exports (for direct use)
from .display_image import create_display_image_tool
from .knowledge_base import (
+ CONNECTOR_DESCRIPTIONS,
create_search_knowledge_base_tool,
format_documents_for_context,
search_knowledge_base_async,
@@ -40,6 +41,8 @@ from .user_memory import create_recall_memory_tool, create_save_memory_tool
__all__ = [
# Registry
"BUILTIN_TOOLS",
+ # Knowledge base utilities
+ "CONNECTOR_DESCRIPTIONS",
"ToolDefinition",
"build_tools",
# Tool factories
@@ -51,7 +54,6 @@ __all__ = [
"create_scrape_webpage_tool",
"create_search_knowledge_base_tool",
"create_search_surfsense_docs_tool",
- # Knowledge base utilities
"format_documents_for_context",
"get_all_tool_names",
"get_default_enabled_tools",
diff --git a/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py b/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py
index 552019dda..a11e4ac38 100644
--- a/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py
+++ b/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py
@@ -12,7 +12,8 @@ import json
from datetime import datetime
from typing import Any
-from langchain_core.tools import tool
+from langchain_core.tools import StructuredTool
+from pydantic import BaseModel, Field
from sqlalchemy.ext.asyncio import AsyncSession
from app.services.connector_service import ConnectorService
@@ -22,6 +23,7 @@ from app.services.connector_service import ConnectorService
# =============================================================================
# Canonical connector values used internally by ConnectorService
+# Includes all document types and search source connectors
_ALL_CONNECTORS: list[str] = [
"EXTENSION",
"FILE",
@@ -50,41 +52,117 @@ _ALL_CONNECTORS: list[str] = [
"CRAWLED_URL",
"CIRCLEBACK",
"OBSIDIAN_CONNECTOR",
+ # Composio connectors
+ "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
+ "COMPOSIO_GMAIL_CONNECTOR",
+ "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
]
+# Human-readable descriptions for each connector type
+# Used for generating dynamic docstrings and informing the LLM
+CONNECTOR_DESCRIPTIONS: dict[str, str] = {
+ "EXTENSION": "Web content saved via SurfSense browser extension (personal browsing history)",
+ "FILE": "User-uploaded documents (PDFs, Word, etc.) (personal files)",
+ "NOTE": "SurfSense Notes (notes created inside SurfSense)",
+ "SLACK_CONNECTOR": "Slack conversations and shared content (personal workspace communications)",
+ "TEAMS_CONNECTOR": "Microsoft Teams messages and conversations (personal Teams communications)",
+ "NOTION_CONNECTOR": "Notion workspace pages and databases (personal knowledge management)",
+ "YOUTUBE_VIDEO": "YouTube video transcripts and metadata (personally saved videos)",
+ "GITHUB_CONNECTOR": "GitHub repository content and issues (personal repositories and interactions)",
+ "ELASTICSEARCH_CONNECTOR": "Elasticsearch indexed documents and data (personal Elasticsearch instances)",
+ "LINEAR_CONNECTOR": "Linear project issues and discussions (personal project management)",
+ "JIRA_CONNECTOR": "Jira project issues, tickets, and comments (personal project tracking)",
+ "CONFLUENCE_CONNECTOR": "Confluence pages and comments (personal project documentation)",
+ "CLICKUP_CONNECTOR": "ClickUp tasks and project data (personal task management)",
+ "GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events, meetings, and schedules (personal calendar)",
+ "GOOGLE_GMAIL_CONNECTOR": "Google Gmail emails and conversations (personal emails)",
+ "GOOGLE_DRIVE_FILE": "Google Drive files and documents (personal cloud storage)",
+ "DISCORD_CONNECTOR": "Discord server conversations and shared content (personal community)",
+ "AIRTABLE_CONNECTOR": "Airtable records, tables, and database content (personal data)",
+ "TAVILY_API": "Tavily web search API results (real-time web search)",
+ "SEARXNG_API": "SearxNG search API results (privacy-focused web search)",
+ "LINKUP_API": "Linkup search API results (web search)",
+ "BAIDU_SEARCH_API": "Baidu search API results (Chinese web search)",
+ "LUMA_CONNECTOR": "Luma events and meetings",
+ "WEBCRAWLER_CONNECTOR": "Webpages indexed by SurfSense (personally selected websites)",
+ "CRAWLED_URL": "Webpages indexed by SurfSense (personally selected websites)",
+ "BOOKSTACK_CONNECTOR": "BookStack pages (personal documentation)",
+ "CIRCLEBACK": "Circleback meeting notes, transcripts, and action items",
+ "OBSIDIAN_CONNECTOR": "Obsidian vault notes and markdown files (personal notes)",
+ # Composio connectors
+ "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "Google Drive files via Composio (personal cloud storage)",
+ "COMPOSIO_GMAIL_CONNECTOR": "Gmail emails via Composio (personal emails)",
+ "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events via Composio (personal calendar)",
+}
-def _normalize_connectors(connectors_to_search: list[str] | None) -> list[str]:
+
+def _normalize_connectors(
+ connectors_to_search: list[str] | None,
+ available_connectors: list[str] | None = None,
+) -> list[str]:
"""
Normalize connectors provided by the model.
- Accepts user-facing enums like WEBCRAWLER_CONNECTOR and maps them to canonical
ConnectorService types.
- Drops unknown values.
- - If None/empty, defaults to searching across all known connectors.
+ - If available_connectors is provided, only includes connectors from that list.
+ - If connectors_to_search is None/empty, defaults to available_connectors or all.
+
+ Args:
+ connectors_to_search: List of connectors requested by the model
+ available_connectors: List of connectors actually available in the search space
+
+ Returns:
+ List of normalized connector strings to search
"""
+ # Determine the set of valid connectors to consider
+ valid_set = (
+ set(available_connectors) if available_connectors else set(_ALL_CONNECTORS)
+ )
+
if not connectors_to_search:
- return list(_ALL_CONNECTORS)
+ # Search all available connectors if none specified
+ return (
+ list(available_connectors)
+ if available_connectors
+ else list(_ALL_CONNECTORS)
+ )
normalized: list[str] = []
for raw in connectors_to_search:
c = (raw or "").strip().upper()
if not c:
continue
+ # Map user-facing aliases to canonical names
if c == "WEBCRAWLER_CONNECTOR":
c = "CRAWLED_URL"
normalized.append(c)
- # de-dupe while preserving order + filter unknown
+ # de-dupe while preserving order + filter to valid connectors
seen: set[str] = set()
out: list[str] = []
for c in normalized:
if c in seen:
continue
+ # Only include if it's a known connector AND available
if c not in _ALL_CONNECTORS:
continue
+ if c not in valid_set:
+ continue
seen.add(c)
out.append(c)
- return out if out else list(_ALL_CONNECTORS)
+
+ # Fallback to all available if nothing matched
+ return (
+ out
+ if out
+ else (
+ list(available_connectors)
+ if available_connectors
+ else list(_ALL_CONNECTORS)
+ )
+ )
# =============================================================================
@@ -233,6 +311,7 @@ async def search_knowledge_base_async(
top_k: int = 10,
start_date: datetime | None = None,
end_date: datetime | None = None,
+ available_connectors: list[str] | None = None,
) -> str:
"""
Search the user's knowledge base for relevant documents.
@@ -248,6 +327,8 @@ async def search_knowledge_base_async(
top_k: Number of results per connector
start_date: Optional start datetime (UTC) for filtering documents
end_date: Optional end datetime (UTC) for filtering documents
+ available_connectors: Optional list of connectors actually available in the search space.
+ If provided, only these connectors will be searched.
Returns:
Formatted string with search results
@@ -262,7 +343,7 @@ async def search_knowledge_base_async(
end_date=end_date,
)
- connectors = _normalize_connectors(connectors_to_search)
+ connectors = _normalize_connectors(connectors_to_search, available_connectors)
for connector in connectors:
try:
@@ -316,6 +397,16 @@ async def search_knowledge_base_async(
)
all_documents.extend(chunks)
+ elif connector == "TEAMS_CONNECTOR":
+ _, chunks = await connector_service.search_teams(
+ user_query=query,
+ search_space_id=search_space_id,
+ top_k=top_k,
+ start_date=resolved_start_date,
+ end_date=resolved_end_date,
+ )
+ all_documents.extend(chunks)
+
elif connector == "NOTION_CONNECTOR":
_, chunks = await connector_service.search_notion(
user_query=query,
@@ -519,6 +610,39 @@ async def search_knowledge_base_async(
)
all_documents.extend(chunks)
+ # =========================================================
+ # Composio Connectors
+ # =========================================================
+ elif connector == "COMPOSIO_GOOGLE_DRIVE_CONNECTOR":
+ _, chunks = await connector_service.search_composio_google_drive(
+ user_query=query,
+ search_space_id=search_space_id,
+ top_k=top_k,
+ start_date=resolved_start_date,
+ end_date=resolved_end_date,
+ )
+ all_documents.extend(chunks)
+
+ elif connector == "COMPOSIO_GMAIL_CONNECTOR":
+ _, chunks = await connector_service.search_composio_gmail(
+ user_query=query,
+ search_space_id=search_space_id,
+ top_k=top_k,
+ start_date=resolved_start_date,
+ end_date=resolved_end_date,
+ )
+ all_documents.extend(chunks)
+
+ elif connector == "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR":
+ _, chunks = await connector_service.search_composio_google_calendar(
+ user_query=query,
+ search_space_id=search_space_id,
+ top_k=top_k,
+ start_date=resolved_start_date,
+ end_date=resolved_end_date,
+ )
+ all_documents.extend(chunks)
+
except Exception as e:
print(f"Error searching connector {connector}: {e}")
continue
@@ -543,11 +667,68 @@ async def search_knowledge_base_async(
return format_documents_for_context(deduplicated)
+def _build_connector_docstring(available_connectors: list[str] | None) -> str:
+ """
+ Build the connector documentation section for the tool docstring.
+
+ Args:
+ available_connectors: List of available connector types, or None for all
+
+ Returns:
+ Formatted docstring section listing available connectors
+ """
+ connectors = available_connectors if available_connectors else list(_ALL_CONNECTORS)
+
+ lines = []
+ for connector in connectors:
+ # Skip internal names, prefer user-facing aliases
+ if connector == "CRAWLED_URL":
+ # Show as WEBCRAWLER_CONNECTOR for user-facing docs
+ description = CONNECTOR_DESCRIPTIONS.get(connector, connector)
+ lines.append(f"- WEBCRAWLER_CONNECTOR: {description}")
+ else:
+ description = CONNECTOR_DESCRIPTIONS.get(connector, connector)
+ lines.append(f"- {connector}: {description}")
+
+ return "\n".join(lines)
+
+
+# =============================================================================
+# Tool Input Schema
+# =============================================================================
+
+
+class SearchKnowledgeBaseInput(BaseModel):
+ """Input schema for the search_knowledge_base tool."""
+
+ query: str = Field(
+ description="The search query - be specific and include key terms"
+ )
+ top_k: int = Field(
+ default=10,
+ description="Number of results to retrieve (default: 10)",
+ )
+ start_date: str | None = Field(
+ default=None,
+ description="Optional ISO date/datetime (e.g. '2025-12-12' or '2025-12-12T00:00:00+00:00')",
+ )
+ end_date: str | None = Field(
+ default=None,
+ description="Optional ISO date/datetime (e.g. '2025-12-19' or '2025-12-19T23:59:59+00:00')",
+ )
+ connectors_to_search: list[str] | None = Field(
+ default=None,
+ description="Optional list of connector enums to search. If omitted, searches all available.",
+ )
+
+
def create_search_knowledge_base_tool(
search_space_id: int,
db_session: AsyncSession,
connector_service: ConnectorService,
-):
+ available_connectors: list[str] | None = None,
+ available_document_types: list[str] | None = None,
+) -> StructuredTool:
"""
Factory function to create the search_knowledge_base tool with injected dependencies.
@@ -555,72 +736,57 @@ def create_search_knowledge_base_tool(
search_space_id: The user's search space ID
db_session: Database session
connector_service: Initialized connector service
+ available_connectors: Optional list of connector types available in the search space.
+ Used to dynamically generate the tool docstring.
+ available_document_types: Optional list of document types that have data in the search space.
+ Used to inform the LLM about what data exists.
Returns:
- A configured tool function
+ A configured StructuredTool instance
"""
+ # Build connector documentation dynamically
+ connector_docs = _build_connector_docstring(available_connectors)
- @tool
- async def search_knowledge_base(
+ # Build context about available document types
+ doc_types_info = ""
+ if available_document_types:
+ doc_types_info = f"""
+
+## Document types with indexed content in this search space
+
+The following document types have content available for search:
+{", ".join(available_document_types)}
+
+Focus searches on these types for best results."""
+
+ # Build the dynamic description for the tool
+ # This is what the LLM sees when deciding whether/how to use the tool
+ dynamic_description = f"""Search the user's personal knowledge base for relevant information.
+
+Use this tool to find documents, notes, files, web pages, and other content that may help answer the user's question.
+
+IMPORTANT:
+- If the user requests a specific source type (e.g. "my notes", "Slack messages"), pass `connectors_to_search=[...]` using the enums below.
+- If `connectors_to_search` is omitted/empty, the system will search broadly.
+- Only connectors that are enabled/configured for this search space are available.{doc_types_info}
+
+## Available connector enums for `connectors_to_search`
+
+{connector_docs}
+
+NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`."""
+
+ # Capture for closure
+ _available_connectors = available_connectors
+
+ async def _search_knowledge_base_impl(
query: str,
top_k: int = 10,
start_date: str | None = None,
end_date: str | None = None,
connectors_to_search: list[str] | None = None,
) -> str:
- """
- Search the user's personal knowledge base for relevant information.
-
- Use this tool to find documents, notes, files, web pages, and other content
- that may help answer the user's question.
-
- IMPORTANT:
- - If the user requests a specific source type (e.g. "my notes", "Slack messages"),
- pass `connectors_to_search=[...]` using the enums below.
- - If `connectors_to_search` is omitted/empty, the system will search broadly.
-
- ## Available connector enums for `connectors_to_search`
-
- - EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
- - FILE: "User-uploaded documents (PDFs, Word, etc.)" (personal files)
- - NOTE: "SurfSense Notes" (notes created inside SurfSense)
- - SLACK_CONNECTOR: "Slack conversations and shared content" (personal workspace communications)
- - TEAMS_CONNECTOR: "Microsoft Teams messages and conversations" (personal Teams communications)
- - NOTION_CONNECTOR: "Notion workspace pages and databases" (personal knowledge management)
- - YOUTUBE_VIDEO: "YouTube video transcripts and metadata" (personally saved videos)
- - GITHUB_CONNECTOR: "GitHub repository content and issues" (personal repositories and interactions)
- - ELASTICSEARCH_CONNECTOR: "Elasticsearch indexed documents and data" (personal Elasticsearch instances and custom data sources)
- - LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project management)
- - JIRA_CONNECTOR: "Jira project issues, tickets, and comments" (personal project tracking)
- - CONFLUENCE_CONNECTOR: "Confluence pages and comments" (personal project documentation)
- - CLICKUP_CONNECTOR: "ClickUp tasks and project data" (personal task management)
- - GOOGLE_CALENDAR_CONNECTOR: "Google Calendar events, meetings, and schedules" (personal calendar and time management)
- - GOOGLE_GMAIL_CONNECTOR: "Google Gmail emails and conversations" (personal emails and communications)
- - GOOGLE_DRIVE_FILE: "Google Drive files and documents" (personal cloud storage and file management)
- - DISCORD_CONNECTOR: "Discord server conversations and shared content" (personal community communications)
- - AIRTABLE_CONNECTOR: "Airtable records, tables, and database content" (personal data management and organization)
- - TAVILY_API: "Tavily search API results" (personalized search results)
- - SEARXNG_API: "SearxNG search API results" (personalized search results)
- - LINKUP_API: "Linkup search API results" (personalized search results)
- - BAIDU_SEARCH_API: "Baidu search API results" (personalized search results)
- - LUMA_CONNECTOR: "Luma events"
- - WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense" (personally selected websites)
- - BOOKSTACK_CONNECTOR: "BookStack pages" (personal documentation)
- - CIRCLEBACK: "Circleback meeting notes, transcripts, and action items" (personal meeting records)
- - OBSIDIAN_CONNECTOR: "Obsidian vault notes and markdown files" (personal notes and knowledge management)
-
- NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`.
-
- Args:
- query: The search query - be specific and include key terms
- top_k: Number of results to retrieve (default: 10)
- start_date: Optional ISO date/datetime (e.g. "2025-12-12" or "2025-12-12T00:00:00+00:00")
- end_date: Optional ISO date/datetime (e.g. "2025-12-19" or "2025-12-19T23:59:59+00:00")
- connectors_to_search: Optional list of connector enums to search. If omitted, searches all.
-
- Returns:
- Formatted string with relevant documents and their content
- """
+ """Implementation function for knowledge base search."""
from app.agents.new_chat.utils import parse_date_or_datetime
parsed_start: datetime | None = None
@@ -640,6 +806,16 @@ def create_search_knowledge_base_tool(
top_k=top_k,
start_date=parsed_start,
end_date=parsed_end,
+ available_connectors=_available_connectors,
)
- return search_knowledge_base
+ # Create StructuredTool with dynamic description
+ # This properly sets the description that the LLM sees
+ tool = StructuredTool(
+ name="search_knowledge_base",
+ description=dynamic_description,
+ coroutine=_search_knowledge_base_impl,
+ args_schema=SearchKnowledgeBaseInput,
+ )
+
+ return tool
diff --git a/surfsense_backend/app/agents/new_chat/tools/registry.py b/surfsense_backend/app/agents/new_chat/tools/registry.py
index e4ce7a6b7..968e51445 100644
--- a/surfsense_backend/app/agents/new_chat/tools/registry.py
+++ b/surfsense_backend/app/agents/new_chat/tools/registry.py
@@ -85,6 +85,7 @@ class ToolDefinition:
# Contributors: Add your new tools here!
BUILTIN_TOOLS: list[ToolDefinition] = [
# Core tool - searches the user's knowledge base
+ # Now supports dynamic connector/document type discovery
ToolDefinition(
name="search_knowledge_base",
description="Search the user's personal knowledge base for relevant information",
@@ -92,8 +93,12 @@ BUILTIN_TOOLS: list[ToolDefinition] = [
search_space_id=deps["search_space_id"],
db_session=deps["db_session"],
connector_service=deps["connector_service"],
+ # Optional: dynamically discovered connectors/document types
+ available_connectors=deps.get("available_connectors"),
+ available_document_types=deps.get("available_document_types"),
),
requires=["search_space_id", "db_session", "connector_service"],
+ # Note: available_connectors and available_document_types are optional
),
# Podcast generation tool
ToolDefinition(
diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py
index dc43697e7..4c5599815 100644
--- a/surfsense_backend/app/services/connector_service.py
+++ b/surfsense_backend/app/services/connector_service.py
@@ -2871,3 +2871,350 @@ class ConnectorService:
}
return result_object, obsidian_docs
+
+ # =========================================================================
+ # Composio Connector Search Methods
+ # =========================================================================
+
+ async def search_composio_google_drive(
+ self,
+ user_query: str,
+ search_space_id: int,
+ top_k: int = 20,
+ start_date: datetime | None = None,
+ end_date: datetime | None = None,
+ ) -> tuple:
+ """
+ Search for Composio Google Drive files and return both the source information
+ and langchain documents.
+
+ Uses combined chunk-level and document-level hybrid search with RRF fusion.
+
+ Args:
+ user_query: The user's query
+ search_space_id: The search space ID to search in
+ top_k: Maximum number of results to return
+ start_date: Optional start date for filtering documents by updated_at
+ end_date: Optional end date for filtering documents by updated_at
+
+ Returns:
+ tuple: (sources_info, langchain_documents)
+ """
+ composio_drive_docs = await self._combined_rrf_search(
+ query_text=user_query,
+ search_space_id=search_space_id,
+ document_type="COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
+ top_k=top_k,
+ start_date=start_date,
+ end_date=end_date,
+ )
+
+ # Early return if no results
+ if not composio_drive_docs:
+ return {
+ "id": 54,
+ "name": "Google Drive (Composio)",
+ "type": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
+ "sources": [],
+ }, []
+
+ def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
+ return (
+ doc_info.get("title")
+ or metadata.get("title")
+ or metadata.get("file_name")
+ or "Untitled Document"
+ )
+
+ def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
+ return metadata.get("url") or metadata.get("web_view_link") or ""
+
+ def _description_fn(
+ chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
+ ) -> str:
+ description = self._chunk_preview(chunk.get("content", ""), limit=200)
+ info_parts = []
+ mime_type = metadata.get("mime_type")
+ modified_time = metadata.get("modified_time")
+ if mime_type:
+ info_parts.append(f"Type: {mime_type}")
+ if modified_time:
+ info_parts.append(f"Modified: {modified_time}")
+ if info_parts:
+ description = (description + " | " + " | ".join(info_parts)).strip(" |")
+ return description
+
+ def _extra_fields_fn(
+ _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
+ ) -> dict[str, Any]:
+ return {
+ "mime_type": metadata.get("mime_type", ""),
+ "file_id": metadata.get("file_id", ""),
+ "modified_time": metadata.get("modified_time", ""),
+ }
+
+ sources_list = self._build_chunk_sources_from_documents(
+ composio_drive_docs,
+ title_fn=_title_fn,
+ url_fn=_url_fn,
+ description_fn=_description_fn,
+ extra_fields_fn=_extra_fields_fn,
+ )
+
+ # Create result object
+ result_object = {
+ "id": 54,
+ "name": "Google Drive (Composio)",
+ "type": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
+ "sources": sources_list,
+ }
+
+ return result_object, composio_drive_docs
+
+ async def search_composio_gmail(
+ self,
+ user_query: str,
+ search_space_id: int,
+ top_k: int = 20,
+ start_date: datetime | None = None,
+ end_date: datetime | None = None,
+ ) -> tuple:
+ """
+ Search for Composio Gmail messages and return both the source information
+ and langchain documents.
+
+ Uses combined chunk-level and document-level hybrid search with RRF fusion.
+
+ Args:
+ user_query: The user's query
+ search_space_id: The search space ID to search in
+ top_k: Maximum number of results to return
+ start_date: Optional start date for filtering documents by updated_at
+ end_date: Optional end date for filtering documents by updated_at
+
+ Returns:
+ tuple: (sources_info, langchain_documents)
+ """
+ composio_gmail_docs = await self._combined_rrf_search(
+ query_text=user_query,
+ search_space_id=search_space_id,
+ document_type="COMPOSIO_GMAIL_CONNECTOR",
+ top_k=top_k,
+ start_date=start_date,
+ end_date=end_date,
+ )
+
+ # Early return if no results
+ if not composio_gmail_docs:
+ return {
+ "id": 55,
+ "name": "Gmail (Composio)",
+ "type": "COMPOSIO_GMAIL_CONNECTOR",
+ "sources": [],
+ }, []
+
+ def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
+ return (
+ doc_info.get("title")
+ or metadata.get("subject")
+ or metadata.get("title")
+ or "Untitled Email"
+ )
+
+ def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
+ return metadata.get("url") or ""
+
+ def _description_fn(
+ chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
+ ) -> str:
+ description = self._chunk_preview(chunk.get("content", ""), limit=200)
+ info_parts = []
+ sender = metadata.get("from") or metadata.get("sender")
+ date = metadata.get("date") or metadata.get("received_at")
+ if sender:
+ info_parts.append(f"From: {sender}")
+ if date:
+ info_parts.append(f"Date: {date}")
+ if info_parts:
+ description = (description + " | " + " | ".join(info_parts)).strip(" |")
+ return description
+
+ def _extra_fields_fn(
+ _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
+ ) -> dict[str, Any]:
+ return {
+ "message_id": metadata.get("message_id", ""),
+ "thread_id": metadata.get("thread_id", ""),
+ "from": metadata.get("from", ""),
+ "to": metadata.get("to", ""),
+ "date": metadata.get("date", ""),
+ }
+
+ sources_list = self._build_chunk_sources_from_documents(
+ composio_gmail_docs,
+ title_fn=_title_fn,
+ url_fn=_url_fn,
+ description_fn=_description_fn,
+ extra_fields_fn=_extra_fields_fn,
+ )
+
+ # Create result object
+ result_object = {
+ "id": 55,
+ "name": "Gmail (Composio)",
+ "type": "COMPOSIO_GMAIL_CONNECTOR",
+ "sources": sources_list,
+ }
+
+ return result_object, composio_gmail_docs
+
+ async def search_composio_google_calendar(
+ self,
+ user_query: str,
+ search_space_id: int,
+ top_k: int = 20,
+ start_date: datetime | None = None,
+ end_date: datetime | None = None,
+ ) -> tuple:
+ """
+ Search for Composio Google Calendar events and return both the source information
+ and langchain documents.
+
+ Uses combined chunk-level and document-level hybrid search with RRF fusion.
+
+ Args:
+ user_query: The user's query
+ search_space_id: The search space ID to search in
+ top_k: Maximum number of results to return
+ start_date: Optional start date for filtering documents by updated_at
+ end_date: Optional end date for filtering documents by updated_at
+
+ Returns:
+ tuple: (sources_info, langchain_documents)
+ """
+ composio_calendar_docs = await self._combined_rrf_search(
+ query_text=user_query,
+ search_space_id=search_space_id,
+ document_type="COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
+ top_k=top_k,
+ start_date=start_date,
+ end_date=end_date,
+ )
+
+ # Early return if no results
+ if not composio_calendar_docs:
+ return {
+ "id": 56,
+ "name": "Google Calendar (Composio)",
+ "type": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
+ "sources": [],
+ }, []
+
+ def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
+ return (
+ doc_info.get("title")
+ or metadata.get("summary")
+ or metadata.get("title")
+ or "Untitled Event"
+ )
+
+ def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
+ return metadata.get("url") or metadata.get("html_link") or ""
+
+ def _description_fn(
+ chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
+ ) -> str:
+ description = self._chunk_preview(chunk.get("content", ""), limit=200)
+ info_parts = []
+ start_time = metadata.get("start_time") or metadata.get("start")
+ end_time = metadata.get("end_time") or metadata.get("end")
+ if start_time:
+ info_parts.append(f"Start: {start_time}")
+ if end_time:
+ info_parts.append(f"End: {end_time}")
+ if info_parts:
+ description = (description + " | " + " | ".join(info_parts)).strip(" |")
+ return description
+
+ def _extra_fields_fn(
+ _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
+ ) -> dict[str, Any]:
+ return {
+ "event_id": metadata.get("event_id", ""),
+ "calendar_id": metadata.get("calendar_id", ""),
+ "start_time": metadata.get("start_time", ""),
+ "end_time": metadata.get("end_time", ""),
+ "location": metadata.get("location", ""),
+ }
+
+ sources_list = self._build_chunk_sources_from_documents(
+ composio_calendar_docs,
+ title_fn=_title_fn,
+ url_fn=_url_fn,
+ description_fn=_description_fn,
+ extra_fields_fn=_extra_fields_fn,
+ )
+
+ # Create result object
+ result_object = {
+ "id": 56,
+ "name": "Google Calendar (Composio)",
+ "type": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
+ "sources": sources_list,
+ }
+
+ return result_object, composio_calendar_docs
+
+ # =========================================================================
+ # Utility Methods for Connector Discovery
+ # =========================================================================
+
+ async def get_available_connectors(
+ self,
+ search_space_id: int,
+ ) -> list[SearchSourceConnectorType]:
+ """
+ Get all available (enabled) connector types for a search space.
+
+ Args:
+ search_space_id: The search space ID
+
+ Returns:
+ List of SearchSourceConnectorType enums for enabled connectors
+ """
+ query = (
+ select(SearchSourceConnector.connector_type)
+ .filter(
+ SearchSourceConnector.search_space_id == search_space_id,
+ )
+ .distinct()
+ )
+
+ result = await self.session.execute(query)
+ connector_types = result.scalars().all()
+ return list(connector_types)
+
+ async def get_available_document_types(
+ self,
+ search_space_id: int,
+ ) -> list[str]:
+ """
+ Get all document types that have at least one document in the search space.
+
+ Args:
+ search_space_id: The search space ID
+
+ Returns:
+ List of document type strings that have documents indexed
+ """
+ from sqlalchemy import distinct
+
+ from app.db import Document
+
+ query = select(distinct(Document.document_type)).filter(
+ Document.search_space_id == search_space_id,
+ )
+
+ result = await self.session.execute(query)
+ doc_types = result.scalars().all()
+ return [str(dt) for dt in doc_types]
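
Putting the new service methods together: the deep agent factory calls the two discovery helpers during setup, and the Composio search methods follow the same `(sources, documents)` contract as the existing connector searches. A hedged sketch of a caller; how the `ConnectorService` instance and `search_space_id` are obtained is left to the surrounding application code:

```python
from app.services.connector_service import ConnectorService


async def summarize_search_space(service: ConnectorService, search_space_id: int) -> None:
    # Which connectors are configured, and which document types already have data
    connector_types = await service.get_available_connectors(search_space_id)
    doc_types = await service.get_available_document_types(search_space_id)
    print("enabled connectors:", [getattr(ct, "value", str(ct)) for ct in connector_types])
    print("document types with indexed content:", doc_types)

    # Same (sources_info, langchain_documents) shape as the other search methods
    sources, documents = await service.search_composio_gmail(
        user_query="invoices from last week",
        search_space_id=search_space_id,
        top_k=5,
    )
    print(sources["name"], "returned", len(documents), "results")
```
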
From 555df90c842c52ac50a708a38f424287e1fc88b5 Mon Sep 17 00:00:00 2001
From: "DESKTOP-RTLN3BA\\$punk"
Date: Sat, 24 Jan 2026 17:47:18 -0800
Subject: [PATCH 27/28] chore: New connector statuses for Composio and GitHub
---
.../config/connector-status-config.json | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json b/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json
index b729c3f8b..2c1010b1c 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json
+++ b/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json
@@ -24,6 +24,16 @@
"enabled": true,
"status": "warning",
"statusMessage": "Some requests may be blocked if not using Firecrawl."
+ },
+ "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": {
+ "enabled": false,
+ "status": "disabled",
+ "statusMessage": "Not available yet."
+ },
+ "GITHUB_CONNECTOR": {
+ "enabled": false,
+ "status": "warning",
+ "statusMessage": "Some issues with indexing repositories."
}
},
"globalSettings": {
From 09162ad5cad4d627aa070f881830f9ca95b9d2ee Mon Sep 17 00:00:00 2001
From: "DESKTOP-RTLN3BA\\$punk"
Date: Sat, 24 Jan 2026 17:53:57 -0800
Subject: [PATCH 28/28] release: 0.0.12
---
surfsense_backend/pyproject.toml | 2 +-
surfsense_backend/uv.lock | 2 +-
surfsense_browser_extension/package.json | 2 +-
surfsense_web/package.json | 2 +-
4 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml
index ffe9e5232..57dbdc7b5 100644
--- a/surfsense_backend/pyproject.toml
+++ b/surfsense_backend/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "surf-new-backend"
-version = "0.0.11"
+version = "0.0.12"
description = "SurfSense Backend"
requires-python = ">=3.12"
dependencies = [
diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock
index 18f04288e..16b77a7b2 100644
--- a/surfsense_backend/uv.lock
+++ b/surfsense_backend/uv.lock
@@ -6545,7 +6545,7 @@ wheels = [
[[package]]
name = "surf-new-backend"
-version = "0.0.11"
+version = "0.0.12"
source = { editable = "." }
dependencies = [
{ name = "alembic" },
diff --git a/surfsense_browser_extension/package.json b/surfsense_browser_extension/package.json
index b225bc206..bf926d09f 100644
--- a/surfsense_browser_extension/package.json
+++ b/surfsense_browser_extension/package.json
@@ -1,7 +1,7 @@
{
"name": "surfsense_browser_extension",
"displayName": "Surfsense Browser Extension",
- "version": "0.0.11",
+ "version": "0.0.12",
"description": "Extension to collect Browsing History for SurfSense.",
"author": "https://github.com/MODSetter",
"engines": {
diff --git a/surfsense_web/package.json b/surfsense_web/package.json
index 7ec05c95d..235f4b9db 100644
--- a/surfsense_web/package.json
+++ b/surfsense_web/package.json
@@ -1,6 +1,6 @@
{
"name": "surfsense_web",
- "version": "0.0.11",
+ "version": "0.0.12",
"private": true,
"description": "SurfSense Frontend",
"scripts": {