From f7b52470eb4d1adcf77d8fd36e7416f0f0b5e850 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Wed, 25 Mar 2026 18:33:44 +0530 Subject: [PATCH 01/71] feat: enhance Google connectors indexing with content extraction and document migration - Added `download_and_extract_content` function to extract content from Google Drive files as markdown. - Updated Google Drive indexer to utilize the new content extraction method. - Implemented document migration logic to update legacy Composio document types to their native Google types. - Introduced identifier hashing for stable document identification. - Improved file pre-filtering to handle unchanged and rename-only files efficiently. --- .../app/connectors/google_drive/__init__.py | 3 +- .../google_drive/content_extractor.py | 154 ++ .../app/indexing_pipeline/document_hashing.py | 11 +- .../indexing_pipeline_service.py | 59 +- .../google_calendar_indexer.py | 373 ++-- .../google_drive_indexer.py | 1539 +++++------------ .../google_gmail_indexer.py | 379 ++-- .../test_document_hashing.py | 21 + 8 files changed, 951 insertions(+), 1588 deletions(-) diff --git a/surfsense_backend/app/connectors/google_drive/__init__.py b/surfsense_backend/app/connectors/google_drive/__init__.py index 47cc8598e..a0e9c4484 100644 --- a/surfsense_backend/app/connectors/google_drive/__init__.py +++ b/surfsense_backend/app/connectors/google_drive/__init__.py @@ -2,13 +2,14 @@ from .change_tracker import categorize_change, fetch_all_changes, get_start_page_token from .client import GoogleDriveClient -from .content_extractor import download_and_process_file +from .content_extractor import download_and_extract_content, download_and_process_file from .credentials import get_valid_credentials, validate_credentials from .folder_manager import get_file_by_id, get_files_in_folder, list_folder_contents __all__ = [ "GoogleDriveClient", "categorize_change", + "download_and_extract_content", 
"download_and_process_file", "fetch_all_changes", "get_file_by_id", diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py index 1d08d38f7..6fa20bf8e 100644 --- a/surfsense_backend/app/connectors/google_drive/content_extractor.py +++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py @@ -17,6 +17,160 @@ from .file_types import get_export_mime_type, is_google_workspace_file, should_s logger = logging.getLogger(__name__) +async def download_and_extract_content( + client: GoogleDriveClient, + file: dict[str, Any], +) -> tuple[str | None, dict[str, Any], str | None]: + """Download a Google Drive file and extract its content as markdown. + + ETL only -- no DB writes, no indexing, no summarization. + + Returns: + (markdown_content, drive_metadata, error_message) + On success error_message is None. + """ + file_id = file.get("id") + file_name = file.get("name", "Unknown") + mime_type = file.get("mimeType", "") + + if should_skip_file(mime_type): + return None, {}, f"Skipping {mime_type}" + + logger.info(f"Downloading file for content extraction: {file_name} ({mime_type})") + + drive_metadata: dict[str, Any] = { + "google_drive_file_id": file_id, + "google_drive_file_name": file_name, + "google_drive_mime_type": mime_type, + "source_connector": "google_drive", + } + if "modifiedTime" in file: + drive_metadata["modified_time"] = file["modifiedTime"] + if "createdTime" in file: + drive_metadata["created_time"] = file["createdTime"] + if "size" in file: + drive_metadata["file_size"] = file["size"] + if "webViewLink" in file: + drive_metadata["web_view_link"] = file["webViewLink"] + if "md5Checksum" in file: + drive_metadata["md5_checksum"] = file["md5Checksum"] + if is_google_workspace_file(mime_type): + drive_metadata["exported_as"] = "pdf" + drive_metadata["original_workspace_type"] = mime_type.split(".")[-1] + + temp_file_path = None + try: + # Download / export 
+ if is_google_workspace_file(mime_type): + export_mime = get_export_mime_type(mime_type) + if not export_mime: + return None, drive_metadata, f"Cannot export Google Workspace type: {mime_type}" + content_bytes, error = await client.export_google_file(file_id, export_mime) + if error: + return None, drive_metadata, error + extension = ".pdf" if export_mime == "application/pdf" else ".txt" + else: + content_bytes, error = await client.download_file(file_id) + if error: + return None, drive_metadata, error + extension = Path(file_name).suffix or ".bin" + + with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp: + tmp.write(content_bytes) + temp_file_path = tmp.name + + # Parse to markdown + markdown = await _parse_file_to_markdown(temp_file_path, file_name) + return markdown, drive_metadata, None + + except Exception as e: + logger.warning(f"Failed to extract content from {file_name}: {e!s}") + return None, drive_metadata, str(e) + finally: + if temp_file_path and os.path.exists(temp_file_path): + try: + os.unlink(temp_file_path) + except Exception: + pass + + +async def _parse_file_to_markdown(file_path: str, filename: str) -> str: + """Parse a local file to markdown using the configured ETL service.""" + lower = filename.lower() + + if lower.endswith((".md", ".markdown", ".txt")): + with open(file_path, encoding="utf-8") as f: + return f.read() + + if lower.endswith((".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")): + from app.config import config as app_config + from litellm import atranscription + + stt_service_type = ( + "local" + if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") + else "external" + ) + if stt_service_type == "local": + from app.services.stt_service import stt_service + result = stt_service.transcribe_file(file_path) + text = result.get("text", "") + else: + with open(file_path, "rb") as audio_file: + kwargs: dict[str, Any] = { + "model": app_config.STT_SERVICE, + "file": audio_file, + 
"api_key": app_config.STT_SERVICE_API_KEY, + } + if app_config.STT_SERVICE_API_BASE: + kwargs["api_base"] = app_config.STT_SERVICE_API_BASE + resp = await atranscription(**kwargs) + text = resp.get("text", "") + + if not text: + raise ValueError("Transcription returned empty text") + return f"# Transcription of {filename}\n\n{text}" + + # Document files -- use configured ETL service + from app.config import config as app_config + + if app_config.ETL_SERVICE == "UNSTRUCTURED": + from langchain_unstructured import UnstructuredLoader + from app.utils.document_converters import convert_document_to_markdown + + loader = UnstructuredLoader( + file_path, + mode="elements", + post_processors=[], + languages=["eng"], + include_orig_elements=False, + include_metadata=False, + strategy="auto", + ) + docs = await loader.aload() + return await convert_document_to_markdown(docs) + + if app_config.ETL_SERVICE == "LLAMACLOUD": + from app.tasks.document_processors.file_processors import ( + parse_with_llamacloud_retry, + ) + + result = await parse_with_llamacloud_retry(file_path=file_path, estimated_pages=50) + markdown_documents = await result.aget_markdown_documents(split_by_page=False) + if not markdown_documents: + raise RuntimeError(f"LlamaCloud returned no documents for {filename}") + return markdown_documents[0].text + + if app_config.ETL_SERVICE == "DOCLING": + from docling.document_converter import DocumentConverter + + converter = DocumentConverter() + result = converter.convert(file_path) + return result.document.export_to_markdown() + + raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}") + + async def download_and_process_file( client: GoogleDriveClient, file: dict[str, Any], diff --git a/surfsense_backend/app/indexing_pipeline/document_hashing.py b/surfsense_backend/app/indexing_pipeline/document_hashing.py index 5dd7767a4..9edebd140 100644 --- a/surfsense_backend/app/indexing_pipeline/document_hashing.py +++ 
b/surfsense_backend/app/indexing_pipeline/document_hashing.py @@ -3,10 +3,17 @@ import hashlib from app.indexing_pipeline.connector_document import ConnectorDocument +def compute_identifier_hash( + document_type_value: str, unique_id: str, search_space_id: int +) -> str: + """Return a stable SHA-256 hash from raw identity components.""" + combined = f"{document_type_value}:{unique_id}:{search_space_id}" + return hashlib.sha256(combined.encode("utf-8")).hexdigest() + + def compute_unique_identifier_hash(doc: ConnectorDocument) -> str: """Return a stable SHA-256 hash identifying a document by its source identity.""" - combined = f"{doc.document_type.value}:{doc.unique_id}:{doc.search_space_id}" - return hashlib.sha256(combined.encode("utf-8")).hexdigest() + return compute_identifier_hash(doc.document_type.value, doc.unique_id, doc.search_space_id) def compute_content_hash(doc: ConnectorDocument) -> str: diff --git a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py index 490aac782..c6a29f204 100644 --- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py +++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py @@ -6,12 +6,13 @@ from sqlalchemy import delete, select from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import AsyncSession -from app.db import Chunk, Document, DocumentStatus +from app.db import NATIVE_TO_LEGACY_DOCTYPE, Chunk, Document, DocumentStatus from app.indexing_pipeline.connector_document import ConnectorDocument from app.indexing_pipeline.document_chunker import chunk_text from app.indexing_pipeline.document_embedder import embed_texts from app.indexing_pipeline.document_hashing import ( compute_content_hash, + compute_identifier_hash, compute_unique_identifier_hash, ) from app.indexing_pipeline.document_persistence import ( @@ -54,6 +55,62 @@ class IndexingPipelineService: def __init__(self, session: 
AsyncSession) -> None: self.session = session + async def migrate_legacy_docs( + self, connector_docs: list[ConnectorDocument] + ) -> None: + """Migrate legacy Composio documents to their native Google type. + + For each ConnectorDocument whose document_type has a Composio equivalent + in NATIVE_TO_LEGACY_DOCTYPE, look up the old document by legacy hash and + update its unique_identifier_hash and document_type so that + prepare_for_indexing() can find it under the native hash. + """ + for doc in connector_docs: + legacy_type = NATIVE_TO_LEGACY_DOCTYPE.get(doc.document_type.value) + if not legacy_type: + continue + + legacy_hash = compute_identifier_hash( + legacy_type, doc.unique_id, doc.search_space_id + ) + result = await self.session.execute( + select(Document).filter( + Document.unique_identifier_hash == legacy_hash + ) + ) + existing = result.scalars().first() + if existing is None: + continue + + native_hash = compute_identifier_hash( + doc.document_type.value, doc.unique_id, doc.search_space_id + ) + existing.unique_identifier_hash = native_hash + existing.document_type = doc.document_type + + await self.session.commit() + + async def index_batch( + self, connector_docs: list[ConnectorDocument], llm + ) -> list[Document]: + """Convenience method: prepare_for_indexing then index each document. + + Indexers that need heartbeat callbacks or custom per-document logic + should call prepare_for_indexing() + index() directly instead. 
+ """ + doc_map = { + compute_unique_identifier_hash(cd): cd for cd in connector_docs + } + documents = await self.prepare_for_indexing(connector_docs) + results: list[Document] = [] + for document in documents: + connector_doc = doc_map.get(document.unique_identifier_hash) + if connector_doc is None: + continue + result = await self.index(document, connector_doc, llm) + results.append(result) + return results + async def prepare_for_indexing( self, connector_docs: list[ConnectorDocument] ) -> list[Document]: diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py index 233bc66e4..a69b33bdc 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py @@ -1,9 +1,8 @@ """ Google Calendar connector indexer. -Implements 2-phase document status updates for real-time UI feedback: -- Phase 1: Create all documents with 'pending' status (visible in UI immediately) -- Phase 2: Process each document: pending → processing → ready/failed +Uses the shared IndexingPipelineService for document deduplication, +summarization, chunking, and embedding. 
""" import time @@ -15,29 +14,25 @@ from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from app.connectors.google_calendar_connector import GoogleCalendarConnector -from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType +from app.db import DocumentType, SearchSourceConnectorType +from app.indexing_pipeline.connector_document import ConnectorDocument +from app.indexing_pipeline.document_hashing import ( + compute_content_hash, + compute_unique_identifier_hash, +) +from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService -from app.utils.document_converters import ( - create_document_chunks, - embed_text, - generate_content_hash, - generate_document_summary, - generate_unique_identifier_hash, -) from app.utils.google_credentials import ( COMPOSIO_GOOGLE_CONNECTOR_TYPES, build_composio_credentials, ) from .base import ( - check_document_by_unique_identifier, check_duplicate_document_by_hash, get_connector_by_id, - get_current_timestamp, logger, parse_date_flexible, - safe_set_chunks, update_connector_last_indexed, ) @@ -46,13 +41,60 @@ ACCEPTED_CALENDAR_CONNECTOR_TYPES = { SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, } -# Type hint for heartbeat callback HeartbeatCallbackType = Callable[[int], Awaitable[None]] - -# Heartbeat interval in seconds HEARTBEAT_INTERVAL_SECONDS = 30 +def _build_connector_doc( + event: dict, + event_markdown: str, + *, + connector_id: int, + search_space_id: int, + user_id: str, + enable_summary: bool, +) -> ConnectorDocument: + """Map a raw Google Calendar API event dict to a ConnectorDocument.""" + event_id = event.get("id", "") + event_summary = event.get("summary", "No Title") + calendar_id = event.get("calendarId", "") + + start = event.get("start", {}) + end = event.get("end", {}) + start_time = 
start.get("dateTime") or start.get("date", "") + end_time = end.get("dateTime") or end.get("date", "") + location = event.get("location", "") + + metadata = { + "event_id": event_id, + "event_summary": event_summary, + "calendar_id": calendar_id, + "start_time": start_time, + "end_time": end_time, + "location": location, + "connector_id": connector_id, + "document_type": "Google Calendar Event", + "connector_type": "Google Calendar", + } + + fallback_summary = ( + f"Google Calendar Event: {event_summary}\n\n{event_markdown}" + ) + + return ConnectorDocument( + title=event_summary, + source_markdown=event_markdown, + unique_id=event_id, + document_type=DocumentType.GOOGLE_CALENDAR_CONNECTOR, + search_space_id=search_space_id, + connector_id=connector_id, + created_by_id=user_id, + should_summarize=enable_summary, + fallback_summary=fallback_summary, + metadata=metadata, + ) + + async def index_google_calendar_events( session: AsyncSession, connector_id: int, @@ -82,7 +124,6 @@ async def index_google_calendar_events( """ task_logger = TaskLoggingService(session, search_space_id) - # Log task start log_entry = await task_logger.log_task_start( task_name="google_calendar_events_indexing", source="connector_indexing_task", @@ -96,7 +137,7 @@ async def index_google_calendar_events( ) try: - # Accept both native and Composio Calendar connectors + # ── Connector lookup ────────────────────────────────────────── connector = None for ct in ACCEPTED_CALENDAR_CONNECTOR_TYPES: connector = await get_connector_by_id(session, connector_id, ct) @@ -112,7 +153,7 @@ async def index_google_calendar_events( ) return 0, 0, f"Connector with ID {connector_id} not found" - # Build credentials based on connector type + # ── Credential building ─────────────────────────────────────── if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES: connected_account_id = connector.config.get("composio_connected_account_id") if not connected_account_id: @@ -184,6 +225,7 @@ async def 
index_google_calendar_events( ) return 0, 0, "Google Calendar credentials not found in connector config" + # ── Calendar client init ────────────────────────────────────── await task_logger.log_task_progress( log_entry, f"Initializing Google Calendar client for connector {connector_id}", @@ -203,36 +245,26 @@ async def index_google_calendar_events( if end_date == "undefined" or end_date == "": end_date = None - # Calculate date range - # For calendar connectors, allow future dates to index upcoming events + # ── Date range calculation ──────────────────────────────────── if start_date is None or end_date is None: - # Fall back to calculating dates based on last_indexed_at - # Default to today (users can manually select future dates if needed) calculated_end_date = datetime.now() - # Use last_indexed_at as start date if available, otherwise use 30 days ago if connector.last_indexed_at: - # Convert dates to be comparable (both timezone-naive) last_indexed_naive = ( connector.last_indexed_at.replace(tzinfo=None) if connector.last_indexed_at.tzinfo else connector.last_indexed_at ) - - # Allow future dates - use last_indexed_at as start date calculated_start_date = last_indexed_naive logger.info( f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date" ) else: - calculated_start_date = datetime.now() - timedelta( - days=365 - ) # Use 365 days as default for calendar events (matches frontend) + calculated_start_date = datetime.now() - timedelta(days=365) logger.info( f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date" ) - # Use calculated dates if not provided start_date_str = ( start_date if start_date else calculated_start_date.strftime("%Y-%m-%d") ) @@ -240,19 +272,14 @@ async def index_google_calendar_events( end_date if end_date else calculated_end_date.strftime("%Y-%m-%d") ) else: - # Use provided dates (including future dates) start_date_str = start_date end_date_str = end_date - 
# FIX: Ensure end_date is at least 1 day after start_date to avoid - # "start_date must be strictly before end_date" errors when dates are the same - # (e.g., when last_indexed_at is today) if start_date_str == end_date_str: logger.info( f"Start date ({start_date_str}) equals end date ({end_date_str}), " "adjusting end date to next day to ensure valid date range" ) - # Parse end_date and add 1 day try: end_dt = parse_date_flexible(end_date_str) except ValueError: @@ -264,6 +291,7 @@ async def index_google_calendar_events( end_date_str = end_dt.strftime("%Y-%m-%d") logger.info(f"Adjusted end date to {end_date_str}") + # ── Fetch events ────────────────────────────────────────────── await task_logger.log_task_progress( log_entry, f"Fetching Google Calendar events from {start_date_str} to {end_date_str}", @@ -274,27 +302,19 @@ async def index_google_calendar_events( }, ) - # Get events within date range from primary calendar try: events, error = await calendar_client.get_all_primary_calendar_events( start_date=start_date_str, end_date=end_date_str ) if error: - # Don't treat "No events found" as an error that should stop indexing if "No events found" in error: logger.info(f"No Google Calendar events found: {error}") - logger.info( - "No events found is not a critical error, continuing with update" - ) if update_last_indexed: await update_connector_last_indexed( session, connector, update_last_indexed ) await session.commit() - logger.info( - f"Updated last_indexed_at to {connector.last_indexed_at} despite no events found" - ) await task_logger.log_task_success( log_entry, @@ -304,7 +324,6 @@ async def index_google_calendar_events( return 0, 0, None else: logger.error(f"Failed to get Google Calendar events: {error}") - # Check if this is an authentication error that requires re-authentication error_message = error error_type = "APIError" if ( @@ -329,28 +348,15 @@ async def index_google_calendar_events( logger.error(f"Error fetching Google Calendar events: {e!s}", 
exc_info=True) return 0, 0, f"Error fetching Google Calendar events: {e!s}" - documents_indexed = 0 + # ── Build ConnectorDocuments ────────────────────────────────── + connector_docs: list[ConnectorDocument] = [] documents_skipped = 0 - documents_failed = 0 # Track events that failed processing - duplicate_content_count = ( - 0 # Track events skipped due to duplicate content_hash - ) - - # Heartbeat tracking - update notification periodically to prevent appearing stuck - last_heartbeat_time = time.time() - - # ======================================================================= - # PHASE 1: Analyze all events, create pending documents - # This makes ALL documents visible in the UI immediately with pending status - # ======================================================================= - events_to_process = [] # List of dicts with document and event data - new_documents_created = False + duplicate_content_count = 0 for event in events: try: event_id = event.get("id") event_summary = event.get("summary", "No Title") - calendar_id = event.get("calendarId", "") if not event_id: logger.warning(f"Skipping event with missing ID: {event_summary}") @@ -363,223 +369,73 @@ async def index_google_calendar_events( documents_skipped += 1 continue - start = event.get("start", {}) - end = event.get("end", {}) - start_time = start.get("dateTime") or start.get("date", "") - end_time = end.get("dateTime") or end.get("date", "") - location = event.get("location", "") - description = event.get("description", "") - - # Generate unique identifier hash for this Google Calendar event - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.GOOGLE_CALENDAR_CONNECTOR, event_id, search_space_id + doc = _build_connector_doc( + event, + event_markdown, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + enable_summary=connector.enable_summary, ) - # Generate content hash - content_hash = generate_content_hash(event_markdown, 
search_space_id) - - # Check if document with this unique identifier already exists - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - # Fallback: legacy Composio hash - if not existing_document: - legacy_hash = generate_unique_identifier_hash( - DocumentType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, - event_id, - search_space_id, - ) - existing_document = await check_document_by_unique_identifier( - session, legacy_hash - ) - if existing_document: - existing_document.unique_identifier_hash = ( - unique_identifier_hash - ) - if ( - existing_document.document_type - == DocumentType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR - ): - existing_document.document_type = ( - DocumentType.GOOGLE_CALENDAR_CONNECTOR - ) - logger.info( - f"Migrated legacy Composio Calendar document: {event_id}" - ) - - if existing_document: - # Document exists - check if content has changed - if existing_document.content_hash == content_hash: - # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state( - existing_document.status, DocumentStatus.READY - ): - existing_document.status = DocumentStatus.ready() - documents_skipped += 1 - continue - - # Queue existing document for update (will be set to processing in Phase 2) - events_to_process.append( - { - "document": existing_document, - "is_new": False, - "event_markdown": event_markdown, - "content_hash": content_hash, - "event_id": event_id, - "event_summary": event_summary, - "calendar_id": calendar_id, - "start_time": start_time, - "end_time": end_time, - "location": location, - "description": description, - } - ) - continue - - # Document doesn't exist by unique_identifier_hash - # Check if a document with the same content_hash exists (from another connector) with session.no_autoflush: - duplicate_by_content = await check_duplicate_document_by_hash( - session, content_hash + duplicate = await check_duplicate_document_by_hash( + session, 
compute_content_hash(doc) ) - - if duplicate_by_content: - # A document with the same content already exists (likely from Composio connector) + if duplicate: logger.info( - f"Event {event_summary} already indexed by another connector " - f"(existing document ID: {duplicate_by_content.id}, " - f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content." + f"Event {doc.title} already indexed by another connector " + f"(existing document ID: {duplicate.id}, " + f"type: {duplicate.document_type}). Skipping." ) duplicate_content_count += 1 documents_skipped += 1 continue - # Create new document with PENDING status (visible in UI immediately) - document = Document( - search_space_id=search_space_id, - title=event_summary, - document_type=DocumentType.GOOGLE_CALENDAR_CONNECTOR, - document_metadata={ - "event_id": event_id, - "event_summary": event_summary, - "calendar_id": calendar_id, - "start_time": start_time, - "end_time": end_time, - "location": location, - "connector_id": connector_id, - }, - content="Pending...", # Placeholder until processed - content_hash=unique_identifier_hash, # Temporary unique value - updated when ready - unique_identifier_hash=unique_identifier_hash, - embedding=None, - chunks=[], # Empty at creation - safe for async - status=DocumentStatus.pending(), # Pending until processing starts - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector_id, - ) - session.add(document) - new_documents_created = True - - events_to_process.append( - { - "document": document, - "is_new": True, - "event_markdown": event_markdown, - "content_hash": content_hash, - "event_id": event_id, - "event_summary": event_summary, - "calendar_id": calendar_id, - "start_time": start_time, - "end_time": end_time, - "location": location, - "description": description, - } - ) + connector_docs.append(doc) except Exception as e: - logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True) - documents_failed += 1 + 
logger.error(f"Error building ConnectorDocument for event: {e!s}", exc_info=True) + documents_skipped += 1 continue - # Commit all pending documents - they all appear in UI now - if new_documents_created: - logger.info( - f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents" - ) - await session.commit() + # ── Pipeline: migrate legacy docs + prepare + index ─────────── + pipeline = IndexingPipelineService(session) - # ======================================================================= - # PHASE 2: Process each document one by one - # Each document transitions: pending → processing → ready/failed - # ======================================================================= - logger.info(f"Phase 2: Processing {len(events_to_process)} documents") + await pipeline.migrate_legacy_docs(connector_docs) - for item in events_to_process: - # Send heartbeat periodically + documents = await pipeline.prepare_for_indexing(connector_docs) + + doc_map = { + compute_unique_identifier_hash(cd): cd for cd in connector_docs + } + + documents_indexed = 0 + documents_failed = 0 + last_heartbeat_time = time.time() + + for document in documents: if on_heartbeat_callback: current_time = time.time() if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: await on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item["document"] - try: - # Set to PROCESSING and commit - shows "processing" in UI for THIS document only - document.status = DocumentStatus.processing() - await session.commit() + connector_doc = doc_map.get(document.unique_identifier_hash) + if connector_doc is None: + logger.warning( + f"No matching ConnectorDocument for document {document.id}, skipping" + ) + documents_failed += 1 + continue - # Heavy processing (LLM, embeddings, chunks) + try: user_llm = await get_user_long_context_llm( session, user_id, search_space_id ) - - if user_llm and connector.enable_summary: - 
document_metadata_for_summary = { - "event_id": item["event_id"], - "event_summary": item["event_summary"], - "calendar_id": item["calendar_id"], - "start_time": item["start_time"], - "end_time": item["end_time"], - "location": item["location"] or "No location", - "document_type": "Google Calendar Event", - "connector_type": "Google Calendar", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - item["event_markdown"], user_llm, document_metadata_for_summary - ) - else: - summary_content = f"Google Calendar Event: {item['event_summary']}\n\n{item['event_markdown']}" - summary_embedding = embed_text(summary_content) - - chunks = await create_document_chunks(item["event_markdown"]) - - # Update document to READY with actual content - document.title = item["event_summary"] - document.content = summary_content - document.content_hash = item["content_hash"] - document.embedding = summary_embedding - document.document_metadata = { - "event_id": item["event_id"], - "event_summary": item["event_summary"], - "calendar_id": item["calendar_id"], - "start_time": item["start_time"], - "end_time": item["end_time"], - "location": item["location"], - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - "connector_id": connector_id, - } - await safe_set_chunks(session, document, chunks) - document.updated_at = get_current_timestamp() - document.status = DocumentStatus.ready() - + await pipeline.index(document, connector_doc, user_llm) documents_indexed += 1 - # Batch commit every 10 documents (for ready status updates) if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} Google Calendar events processed so far" @@ -588,21 +444,12 @@ async def index_google_calendar_events( except Exception as e: logger.error(f"Error processing Calendar event: {e!s}", exc_info=True) - # Mark document as failed with reason (visible in UI) - try: - document.status = DocumentStatus.failed(str(e)) - document.updated_at = 
get_current_timestamp() - except Exception as status_error: - logger.error( - f"Failed to update document status to failed: {status_error}" - ) documents_failed += 1 continue - # CRITICAL: Always update timestamp (even if 0 documents indexed) so Zero syncs + # ── Finalize ────────────────────────────────────────────────── await update_connector_last_indexed(session, connector, update_last_indexed) - # Final commit for any remaining documents not yet committed in batches logger.info( f"Final commit: Total {documents_indexed} Google Calendar events processed" ) @@ -612,22 +459,18 @@ async def index_google_calendar_events( "Successfully committed all Google Calendar document changes to database" ) except Exception as e: - # Handle any remaining integrity errors gracefully (race conditions, etc.) if ( "duplicate key value violates unique constraint" in str(e).lower() or "uniqueviolationerror" in str(e).lower() ): logger.warning( f"Duplicate content_hash detected during final commit. " - f"This may occur if the same event was indexed by multiple connectors. " f"Rolling back and continuing. Error: {e!s}" ) await session.rollback() - # Don't fail the entire task - some documents may have been successfully indexed else: raise - # Build warning message if there were issues warning_parts = [] if duplicate_content_count > 0: warning_parts.append(f"{duplicate_content_count} duplicate") diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index 260db0ce6..92c074812 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -1,36 +1,41 @@ -"""Google Drive indexer using Surfsense file processors. +"""Google Drive indexer using the shared IndexingPipelineService. 
-Implements 2-phase document status updates for real-time UI feedback: -- Phase 1: Create all documents with 'pending' status (visible in UI immediately) -- Phase 2: Process each document: pending → processing → ready/failed +File-level pre-filter (_should_skip_file) handles md5/modifiedTime +checks and rename-only detection. download_and_extract_content() +returns markdown which is fed into ConnectorDocument -> pipeline. """ import logging import time from collections.abc import Awaitable, Callable +from sqlalchemy import String, cast, select from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm.attributes import flag_modified from app.config import config from app.connectors.google_drive import ( GoogleDriveClient, categorize_change, - download_and_process_file, + download_and_extract_content, fetch_all_changes, get_file_by_id, get_files_in_folder, get_start_page_token, ) +from app.connectors.google_drive.file_types import should_skip_file as skip_mime from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType +from app.indexing_pipeline.connector_document import ConnectorDocument +from app.indexing_pipeline.document_hashing import compute_identifier_hash +from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService +from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.tasks.connector_indexers.base import ( check_document_by_unique_identifier, get_connector_by_id, - get_current_timestamp, update_connector_last_indexed, ) -from app.utils.document_converters import generate_unique_identifier_hash from app.utils.google_credentials import ( COMPOSIO_GOOGLE_CONNECTOR_TYPES, build_composio_credentials, @@ -41,15 +46,423 @@ ACCEPTED_DRIVE_CONNECTOR_TYPES = { SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, } -# Type hint for heartbeat callback HeartbeatCallbackType = 
Callable[[int], Awaitable[None]] - -# Heartbeat interval in seconds HEARTBEAT_INTERVAL_SECONDS = 30 logger = logging.getLogger(__name__) +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +async def _should_skip_file( + session: AsyncSession, + file: dict, + search_space_id: int, +) -> tuple[bool, str | None]: + """Pre-filter: detect unchanged / rename-only files. + + Returns (should_skip, message). + Side-effects: migrates legacy Composio hashes, updates renames in-place. + """ + file_id = file.get("id") + file_name = file.get("name", "Unknown") + mime_type = file.get("mimeType", "") + + if skip_mime(mime_type): + return True, "folder/shortcut" + if not file_id: + return True, "missing file_id" + + # --- locate existing document --- + primary_hash = compute_identifier_hash( + DocumentType.GOOGLE_DRIVE_FILE.value, file_id, search_space_id + ) + existing = await check_document_by_unique_identifier(session, primary_hash) + + if not existing: + legacy_hash = compute_identifier_hash( + DocumentType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR.value, file_id, search_space_id + ) + existing = await check_document_by_unique_identifier(session, legacy_hash) + if existing: + existing.unique_identifier_hash = primary_hash + if existing.document_type == DocumentType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: + existing.document_type = DocumentType.GOOGLE_DRIVE_FILE + logger.info(f"Migrated legacy Composio Drive document: {file_id}") + + if not existing: + result = await session.execute( + select(Document).where( + Document.search_space_id == search_space_id, + Document.document_type.in_([ + DocumentType.GOOGLE_DRIVE_FILE, + DocumentType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, + ]), + cast(Document.document_metadata["google_drive_file_id"], String) == file_id, + ) + ) + existing = result.scalar_one_or_none() + if existing: + existing.unique_identifier_hash = primary_hash + if 
existing.document_type == DocumentType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: + existing.document_type = DocumentType.GOOGLE_DRIVE_FILE + logger.debug(f"Found legacy doc by metadata for file_id: {file_id}") + + if not existing: + return False, None + + # --- content-change check via md5 / modifiedTime --- + incoming_md5 = file.get("md5Checksum") + incoming_mtime = file.get("modifiedTime") + meta = existing.document_metadata or {} + stored_md5 = meta.get("md5_checksum") + stored_mtime = meta.get("modified_time") + + content_unchanged = False + if incoming_md5 and stored_md5: + content_unchanged = incoming_md5 == stored_md5 + elif incoming_md5 and not stored_md5: + return False, None + elif not incoming_md5 and incoming_mtime and stored_mtime: + content_unchanged = incoming_mtime == stored_mtime + elif not incoming_md5: + return False, None + + if not content_unchanged: + return False, None + + # --- rename-only detection --- + old_name = meta.get("FILE_NAME") or meta.get("google_drive_file_name") + if old_name and old_name != file_name: + existing.title = file_name + if not existing.document_metadata: + existing.document_metadata = {} + existing.document_metadata["FILE_NAME"] = file_name + existing.document_metadata["google_drive_file_name"] = file_name + if incoming_mtime: + existing.document_metadata["modified_time"] = incoming_mtime + flag_modified(existing, "document_metadata") + await session.commit() + logger.info(f"Rename-only update: '{old_name}' → '{file_name}'") + return True, f"File renamed: '{old_name}' → '{file_name}'" + + if not DocumentStatus.is_state(existing.status, DocumentStatus.READY): + existing.status = DocumentStatus.ready() + return True, "unchanged" + + +def _build_connector_doc( + file: dict, + markdown: str, + drive_metadata: dict, + *, + connector_id: int, + search_space_id: int, + user_id: str, + enable_summary: bool, +) -> ConnectorDocument: + """Build a ConnectorDocument from Drive file metadata + extracted markdown.""" + file_id = 
file.get("id", "") + file_name = file.get("name", "Unknown") + + metadata = { + **drive_metadata, + "connector_id": connector_id, + "document_type": "Google Drive File", + "connector_type": "Google Drive", + } + + fallback_summary = f"File: {file_name}\n\n{markdown[:4000]}" + + return ConnectorDocument( + title=file_name, + source_markdown=markdown, + unique_id=file_id, + document_type=DocumentType.GOOGLE_DRIVE_FILE, + search_space_id=search_space_id, + connector_id=connector_id, + created_by_id=user_id, + should_summarize=enable_summary, + fallback_summary=fallback_summary, + metadata=metadata, + ) + + +async def _process_single_file( + drive_client: GoogleDriveClient, + session: AsyncSession, + file: dict, + connector_id: int, + search_space_id: int, + user_id: str, + enable_summary: bool = True, +) -> tuple[int, int, int]: + """Download, extract, and index a single Drive file via the pipeline. + + Returns (indexed, skipped, failed). + """ + file_name = file.get("name", "Unknown") + + try: + skip, msg = await _should_skip_file(session, file, search_space_id) + if skip: + if msg and "renamed" in msg.lower(): + return 1, 0, 0 + return 0, 1, 0 + + markdown, drive_metadata, error = await download_and_extract_content( + drive_client, file + ) + if error or not markdown: + logger.warning(f"ETL failed for {file_name}: {error}") + return 0, 1, 0 + + doc = _build_connector_doc( + file, + markdown, + drive_metadata, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + enable_summary=enable_summary, + ) + + pipeline = IndexingPipelineService(session) + documents = await pipeline.prepare_for_indexing([doc]) + if not documents: + return 0, 1, 0 + + from app.indexing_pipeline.document_hashing import compute_unique_identifier_hash + + doc_map = {compute_unique_identifier_hash(doc): doc} + for document in documents: + connector_doc = doc_map.get(document.unique_identifier_hash) + if not connector_doc: + continue + user_llm = await 
get_user_long_context_llm(session, user_id, search_space_id) + await pipeline.index(document, connector_doc, user_llm) + + logger.info(f"Successfully indexed Google Drive file: {file_name}") + return 1, 0, 0 + + except Exception as e: + logger.error(f"Error processing file {file_name}: {e!s}", exc_info=True) + return 0, 0, 1 + + +async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int): + """Remove a document that was deleted in Drive.""" + primary_hash = compute_identifier_hash( + DocumentType.GOOGLE_DRIVE_FILE.value, file_id, search_space_id + ) + existing = await check_document_by_unique_identifier(session, primary_hash) + + if not existing: + legacy_hash = compute_identifier_hash( + DocumentType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR.value, file_id, search_space_id + ) + existing = await check_document_by_unique_identifier(session, legacy_hash) + + if not existing: + result = await session.execute( + select(Document).where( + Document.search_space_id == search_space_id, + Document.document_type.in_([ + DocumentType.GOOGLE_DRIVE_FILE, + DocumentType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, + ]), + cast(Document.document_metadata["google_drive_file_id"], String) == file_id, + ) + ) + existing = result.scalar_one_or_none() + + if existing: + await session.delete(existing) + logger.info(f"Removed deleted file document: {file_id}") + + +# --------------------------------------------------------------------------- +# Scan strategies +# --------------------------------------------------------------------------- + +async def _index_full_scan( + drive_client: GoogleDriveClient, + session: AsyncSession, + connector: object, + connector_id: int, + search_space_id: int, + user_id: str, + folder_id: str | None, + folder_name: str, + task_logger: TaskLoggingService, + log_entry: object, + max_files: int, + include_subfolders: bool = False, + on_heartbeat_callback: HeartbeatCallbackType | None = None, + enable_summary: bool = True, +) -> tuple[int, int]: + 
"""Full scan indexing of a folder.""" + await task_logger.log_task_progress( + log_entry, + f"Starting full scan of folder: {folder_name} (include_subfolders={include_subfolders})", + {"stage": "full_scan", "folder_id": folder_id, "include_subfolders": include_subfolders}, + ) + + indexed = 0 + skipped = 0 + failed = 0 + files_processed = 0 + last_heartbeat = time.time() + folders_to_process = [(folder_id, folder_name)] + first_error: str | None = None + + while folders_to_process and files_processed < max_files: + cur_id, cur_name = folders_to_process.pop(0) + page_token = None + + while files_processed < max_files: + files, next_token, error = await get_files_in_folder( + drive_client, cur_id, include_subfolders=True, page_token=page_token, + ) + if error: + logger.error(f"Error listing files in {cur_name}: {error}") + if first_error is None: + first_error = error + break + if not files: + break + + for file in files: + if files_processed >= max_files: + break + + mime = file.get("mimeType", "") + if mime == "application/vnd.google-apps.folder": + if include_subfolders: + folders_to_process.append((file["id"], file.get("name", "Unknown"))) + continue + + files_processed += 1 + + if on_heartbeat_callback: + now = time.time() + if now - last_heartbeat >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(indexed) + last_heartbeat = now + + i, s, f = await _process_single_file( + drive_client, session, file, + connector_id, search_space_id, user_id, enable_summary, + ) + indexed += i + skipped += s + failed += f + + if indexed > 0 and indexed % 10 == 0: + await session.commit() + + page_token = next_token + if not page_token: + break + + if not files_processed and first_error: + err_lower = first_error.lower() + if "401" in first_error or "invalid credentials" in err_lower or "authError" in first_error: + raise Exception( + f"Google Drive authentication failed. Please re-authenticate. 
(Error: {first_error})" + ) + raise Exception(f"Failed to list Google Drive files: {first_error}") + + logger.info(f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed") + return indexed, skipped + + +async def _index_with_delta_sync( + drive_client: GoogleDriveClient, + session: AsyncSession, + connector: object, + connector_id: int, + search_space_id: int, + user_id: str, + folder_id: str | None, + start_page_token: str, + task_logger: TaskLoggingService, + log_entry: object, + max_files: int, + include_subfolders: bool = False, + on_heartbeat_callback: HeartbeatCallbackType | None = None, + enable_summary: bool = True, +) -> tuple[int, int]: + """Delta sync using change tracking.""" + await task_logger.log_task_progress( + log_entry, + f"Starting delta sync from token: {start_page_token[:20]}...", + {"stage": "delta_sync", "start_token": start_page_token}, + ) + + changes, _final_token, error = await fetch_all_changes(drive_client, start_page_token, folder_id) + if error: + err_lower = error.lower() + if "401" in error or "invalid credentials" in err_lower or "authError" in error: + raise Exception( + f"Google Drive authentication failed. Please re-authenticate. 
(Error: {error})" + ) + raise Exception(f"Failed to fetch Google Drive changes: {error}") + + if not changes: + logger.info("No changes detected since last sync") + return 0, 0 + + logger.info(f"Processing {len(changes)} changes") + indexed = 0 + skipped = 0 + failed = 0 + files_processed = 0 + last_heartbeat = time.time() + + for change in changes: + if files_processed >= max_files: + break + files_processed += 1 + change_type = categorize_change(change) + + if change_type in ["removed", "trashed"]: + fid = change.get("fileId") + if fid: + await _remove_document(session, fid, search_space_id) + continue + + file = change.get("file") + if not file: + continue + + if on_heartbeat_callback: + now = time.time() + if now - last_heartbeat >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(indexed) + last_heartbeat = now + + i, s, f = await _process_single_file( + drive_client, session, file, + connector_id, search_space_id, user_id, enable_summary, + ) + indexed += i + skipped += s + failed += f + + if indexed > 0 and indexed % 10 == 0: + await session.commit() + + logger.info(f"Delta sync complete: {indexed} indexed, {skipped} skipped, {failed} failed") + return indexed, skipped + + +# --------------------------------------------------------------------------- +# Public entry points +# --------------------------------------------------------------------------- + async def index_google_drive_files( session: AsyncSession, connector_id: int, @@ -63,234 +476,125 @@ async def index_google_drive_files( include_subfolders: bool = False, on_heartbeat_callback: HeartbeatCallbackType | None = None, ) -> tuple[int, int, str | None]: - """ - Index Google Drive files for a specific connector. 
- - Args: - session: Database session - connector_id: ID of the Drive connector - search_space_id: ID of the search space - user_id: ID of the user - folder_id: Specific folder to index (from UI/request, takes precedence) - folder_name: Folder name for display (from UI/request) - use_delta_sync: Whether to use change tracking for incremental sync - update_last_indexed: Whether to update last_indexed_at timestamp - max_files: Maximum number of files to index - include_subfolders: Whether to recursively index files in subfolders - on_heartbeat_callback: Optional callback to update notification during long-running indexing. - - Returns: - Tuple of (number_of_indexed_files, number_of_skipped_files, error_message) - """ + """Index Google Drive files for a specific connector.""" task_logger = TaskLoggingService(session, search_space_id) - log_entry = await task_logger.log_task_start( task_name="google_drive_files_indexing", source="connector_indexing_task", message=f"Starting Google Drive indexing for connector {connector_id}", metadata={ - "connector_id": connector_id, - "user_id": str(user_id), - "folder_id": folder_id, - "use_delta_sync": use_delta_sync, - "max_files": max_files, + "connector_id": connector_id, "user_id": str(user_id), + "folder_id": folder_id, "use_delta_sync": use_delta_sync, "max_files": max_files, }, ) try: - # Accept both native and Composio Drive connectors connector = None for ct in ACCEPTED_DRIVE_CONNECTOR_TYPES: connector = await get_connector_by_id(session, connector_id, ct) if connector: break - if not connector: error_msg = f"Google Drive connector with ID {connector_id} not found" - await task_logger.log_task_failure( - log_entry, error_msg, None, {"error_type": "ConnectorNotFound"} - ) + await task_logger.log_task_failure(log_entry, error_msg, None, {"error_type": "ConnectorNotFound"}) return 0, 0, error_msg await task_logger.log_task_progress( - log_entry, - f"Initializing Google Drive client for connector {connector_id}", + log_entry, 
f"Initializing Google Drive client for connector {connector_id}", {"stage": "client_initialization"}, ) - # Build credentials based on connector type pre_built_credentials = None if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES: connected_account_id = connector.config.get("composio_connected_account_id") if not connected_account_id: error_msg = f"Composio connected_account_id not found for connector {connector_id}" - await task_logger.log_task_failure( - log_entry, - error_msg, - "Missing Composio account", - {"error_type": "MissingComposioAccount"}, - ) + await task_logger.log_task_failure(log_entry, error_msg, "Missing Composio account", {"error_type": "MissingComposioAccount"}) return 0, 0, error_msg pre_built_credentials = build_composio_credentials(connected_account_id) else: token_encrypted = connector.config.get("_token_encrypted", False) - if token_encrypted: - if not config.SECRET_KEY: - await task_logger.log_task_failure( - log_entry, - f"SECRET_KEY not configured but credentials are marked as encrypted for connector {connector_id}", - "Missing SECRET_KEY for token decryption", - {"error_type": "MissingSecretKey"}, - ) - return ( - 0, - 0, - "SECRET_KEY not configured but credentials are marked as encrypted", - ) - logger.info( - f"Google Drive credentials are encrypted for connector {connector_id}, will decrypt during client initialization" + if token_encrypted and not config.SECRET_KEY: + await task_logger.log_task_failure( + log_entry, "SECRET_KEY not configured but credentials are encrypted", + "Missing SECRET_KEY", {"error_type": "MissingSecretKey"}, ) + return 0, 0, "SECRET_KEY not configured but credentials are marked as encrypted" connector_enable_summary = getattr(connector, "enable_summary", True) - - drive_client = GoogleDriveClient( - session, connector_id, credentials=pre_built_credentials - ) + drive_client = GoogleDriveClient(session, connector_id, credentials=pre_built_credentials) if not folder_id: error_msg = "folder_id is 
required for Google Drive indexing" - await task_logger.log_task_failure( - log_entry, error_msg, {"error_type": "MissingParameter"} - ) + await task_logger.log_task_failure(log_entry, error_msg, {"error_type": "MissingParameter"}) return 0, 0, error_msg target_folder_id = folder_id target_folder_name = folder_name or "Selected Folder" - logger.info( - f"Indexing Google Drive folder: {target_folder_name} ({target_folder_id})" - ) - folder_tokens = connector.config.get("folder_tokens", {}) start_page_token = folder_tokens.get(target_folder_id) - can_use_delta_sync = ( - use_delta_sync and start_page_token and connector.last_indexed_at - ) + can_use_delta = use_delta_sync and start_page_token and connector.last_indexed_at - if can_use_delta_sync: + if can_use_delta: logger.info(f"Using delta sync for connector {connector_id}") - result = await _index_with_delta_sync( - drive_client=drive_client, - session=session, - connector=connector, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - folder_id=target_folder_id, - start_page_token=start_page_token, - task_logger=task_logger, - log_entry=log_entry, - max_files=max_files, - include_subfolders=include_subfolders, - on_heartbeat_callback=on_heartbeat_callback, - enable_summary=connector_enable_summary, + documents_indexed, documents_skipped = await _index_with_delta_sync( + drive_client, session, connector, connector_id, search_space_id, user_id, + target_folder_id, start_page_token, task_logger, log_entry, max_files, + include_subfolders, on_heartbeat_callback, connector_enable_summary, ) - documents_indexed, documents_skipped = result - - # Reconciliation: full scan re-indexes documents that were manually - # deleted from SurfSense but still exist in Google Drive. - # Already-indexed files are skipped via md5/modifiedTime checks, - # so the overhead is just one API listing call + fast DB lookups. 
logger.info("Running reconciliation scan after delta sync") - reconcile_result = await _index_full_scan( - drive_client=drive_client, - session=session, - connector=connector, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - folder_id=target_folder_id, - folder_name=target_folder_name, - task_logger=task_logger, - log_entry=log_entry, - max_files=max_files, - include_subfolders=include_subfolders, - on_heartbeat_callback=on_heartbeat_callback, - enable_summary=connector_enable_summary, + ri, rs = await _index_full_scan( + drive_client, session, connector, connector_id, search_space_id, user_id, + target_folder_id, target_folder_name, task_logger, log_entry, max_files, + include_subfolders, on_heartbeat_callback, connector_enable_summary, ) - documents_indexed += reconcile_result[0] - documents_skipped += reconcile_result[1] + documents_indexed += ri + documents_skipped += rs else: logger.info(f"Using full scan for connector {connector_id}") - result = await _index_full_scan( - drive_client=drive_client, - session=session, - connector=connector, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - folder_id=target_folder_id, - folder_name=target_folder_name, - task_logger=task_logger, - log_entry=log_entry, - max_files=max_files, - include_subfolders=include_subfolders, - on_heartbeat_callback=on_heartbeat_callback, - enable_summary=connector_enable_summary, + documents_indexed, documents_skipped = await _index_full_scan( + drive_client, session, connector, connector_id, search_space_id, user_id, + target_folder_id, target_folder_name, task_logger, log_entry, max_files, + include_subfolders, on_heartbeat_callback, connector_enable_summary, ) - documents_indexed, documents_skipped = result - if documents_indexed > 0 or can_use_delta_sync: + if documents_indexed > 0 or can_use_delta: new_token, token_error = await get_start_page_token(drive_client) if new_token and not token_error: - from 
sqlalchemy.orm.attributes import flag_modified - - # Refresh connector to reload attributes that may have been expired by earlier commits await session.refresh(connector) - if "folder_tokens" not in connector.config: connector.config["folder_tokens"] = {} connector.config["folder_tokens"][target_folder_id] = new_token flag_modified(connector, "config") - await update_connector_last_indexed(session, connector, update_last_indexed) await session.commit() - logger.info("Successfully committed Google Drive indexing changes to database") await task_logger.log_task_success( log_entry, f"Successfully completed Google Drive indexing for connector {connector_id}", { - "files_processed": documents_indexed, - "files_skipped": documents_skipped, - "sync_type": "delta" if can_use_delta_sync else "full", - "folder": target_folder_name, + "files_processed": documents_indexed, "files_skipped": documents_skipped, + "sync_type": "delta" if can_use_delta else "full", "folder": target_folder_name, }, ) - - logger.info( - f"Google Drive indexing completed: {documents_indexed} files indexed, {documents_skipped} skipped" - ) + logger.info(f"Google Drive indexing completed: {documents_indexed} indexed, {documents_skipped} skipped") return documents_indexed, documents_skipped, None except SQLAlchemyError as db_error: await session.rollback() await task_logger.log_task_failure( - log_entry, - f"Database error during Google Drive indexing for connector {connector_id}", - str(db_error), - {"error_type": "SQLAlchemyError"}, + log_entry, f"Database error during Google Drive indexing for connector {connector_id}", + str(db_error), {"error_type": "SQLAlchemyError"}, ) logger.error(f"Database error: {db_error!s}", exc_info=True) return 0, 0, f"Database error: {db_error!s}" except Exception as e: await session.rollback() await task_logger.log_task_failure( - log_entry, - f"Failed to index Google Drive files for connector {connector_id}", - str(e), - {"error_type": type(e).__name__}, + log_entry, 
f"Failed to index Google Drive files for connector {connector_id}", + str(e), {"error_type": type(e).__name__}, ) logger.error(f"Failed to index Google Drive files: {e!s}", exc_info=True) return 0, 0, f"Failed to index Google Drive files: {e!s}" @@ -304,964 +608,81 @@ async def index_google_drive_single_file( file_id: str, file_name: str | None = None, ) -> tuple[int, str | None]: - """ - Index a single Google Drive file by its ID. - - Args: - session: Database session - connector_id: ID of the Drive connector - search_space_id: ID of the search space - user_id: ID of the user - file_id: Specific file ID to index - file_name: File name for display (optional) - - Returns: - Tuple of (number_of_indexed_files, error_message) - """ + """Index a single Google Drive file by its ID.""" task_logger = TaskLoggingService(session, search_space_id) - log_entry = await task_logger.log_task_start( task_name="google_drive_single_file_indexing", source="connector_indexing_task", message=f"Starting Google Drive single file indexing for file {file_id}", - metadata={ - "connector_id": connector_id, - "user_id": str(user_id), - "file_id": file_id, - "file_name": file_name, - }, + metadata={"connector_id": connector_id, "user_id": str(user_id), "file_id": file_id, "file_name": file_name}, ) try: - # Accept both native and Composio Drive connectors connector = None for ct in ACCEPTED_DRIVE_CONNECTOR_TYPES: connector = await get_connector_by_id(session, connector_id, ct) if connector: break - if not connector: error_msg = f"Google Drive connector with ID {connector_id} not found" - await task_logger.log_task_failure( - log_entry, error_msg, None, {"error_type": "ConnectorNotFound"} - ) + await task_logger.log_task_failure(log_entry, error_msg, None, {"error_type": "ConnectorNotFound"}) return 0, error_msg - await task_logger.log_task_progress( - log_entry, - f"Initializing Google Drive client for connector {connector_id}", - {"stage": "client_initialization"}, - ) - pre_built_credentials 
= None if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES: connected_account_id = connector.config.get("composio_connected_account_id") if not connected_account_id: error_msg = f"Composio connected_account_id not found for connector {connector_id}" - await task_logger.log_task_failure( - log_entry, - error_msg, - "Missing Composio account", - {"error_type": "MissingComposioAccount"}, - ) + await task_logger.log_task_failure(log_entry, error_msg, "Missing Composio account", {"error_type": "MissingComposioAccount"}) return 0, error_msg pre_built_credentials = build_composio_credentials(connected_account_id) else: token_encrypted = connector.config.get("_token_encrypted", False) - if token_encrypted: - if not config.SECRET_KEY: - await task_logger.log_task_failure( - log_entry, - f"SECRET_KEY not configured but credentials are marked as encrypted for connector {connector_id}", - "Missing SECRET_KEY for token decryption", - {"error_type": "MissingSecretKey"}, - ) - return ( - 0, - "SECRET_KEY not configured but credentials are marked as encrypted", - ) - logger.info( - f"Google Drive credentials are encrypted for connector {connector_id}, will decrypt during client initialization" + if token_encrypted and not config.SECRET_KEY: + await task_logger.log_task_failure( + log_entry, "SECRET_KEY not configured but credentials are encrypted", + "Missing SECRET_KEY", {"error_type": "MissingSecretKey"}, ) + return 0, "SECRET_KEY not configured but credentials are marked as encrypted" connector_enable_summary = getattr(connector, "enable_summary", True) + drive_client = GoogleDriveClient(session, connector_id, credentials=pre_built_credentials) - drive_client = GoogleDriveClient( - session, connector_id, credentials=pre_built_credentials - ) - - # Fetch the file metadata file, error = await get_file_by_id(drive_client, file_id) - if error or not file: error_msg = f"Failed to fetch file {file_id}: {error or 'File not found'}" - await task_logger.log_task_failure( - 
log_entry, error_msg, {"error_type": "FileNotFound"} - ) + await task_logger.log_task_failure(log_entry, error_msg, {"error_type": "FileNotFound"}) return 0, error_msg display_name = file_name or file.get("name", "Unknown") - logger.info(f"Indexing Google Drive file: {display_name} ({file_id})") - # Create pending document for status visibility - pending_doc, should_skip = await _create_pending_document_for_file( - session=session, - file=file, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - ) - - if should_skip: - await task_logger.log_task_progress( - log_entry, - f"File {display_name} is unchanged or not indexable", - {"status": "skipped"}, - ) - return 0, None - - # Commit pending document so it appears in UI - if pending_doc and pending_doc.id is None: - await session.commit() - - # Process the file indexed, _skipped, failed = await _process_single_file( - drive_client=drive_client, - session=session, - file=file, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - task_logger=task_logger, - log_entry=log_entry, - pending_document=pending_doc, - enable_summary=connector_enable_summary, + drive_client, session, file, + connector_id, search_space_id, user_id, connector_enable_summary, ) - await session.commit() - logger.info( - "Successfully committed Google Drive file indexing changes to database" - ) if failed > 0: error_msg = f"Failed to index file {display_name}" - await task_logger.log_task_failure( - log_entry, - error_msg, - {"file_name": display_name, "file_id": file_id}, - ) + await task_logger.log_task_failure(log_entry, error_msg, {"file_name": display_name, "file_id": file_id}) return 0, error_msg if indexed > 0: await task_logger.log_task_success( - log_entry, - f"Successfully indexed file {display_name}", - { - "file_name": display_name, - "file_id": file_id, - }, + log_entry, f"Successfully indexed file {display_name}", + {"file_name": display_name, "file_id": file_id}, ) - 
logger.info(f"Google Drive file indexing completed: {display_name}") return 1, None - else: - await task_logger.log_task_progress( - log_entry, - f"File {display_name} was skipped", - {"status": "skipped"}, - ) - return 0, None + + return 0, None except SQLAlchemyError as db_error: await session.rollback() - await task_logger.log_task_failure( - log_entry, - "Database error during file indexing", - str(db_error), - {"error_type": "SQLAlchemyError"}, - ) + await task_logger.log_task_failure(log_entry, "Database error during file indexing", str(db_error), {"error_type": "SQLAlchemyError"}) logger.error(f"Database error: {db_error!s}", exc_info=True) return 0, f"Database error: {db_error!s}" except Exception as e: await session.rollback() - await task_logger.log_task_failure( - log_entry, - "Failed to index Google Drive file", - str(e), - {"error_type": type(e).__name__}, - ) + await task_logger.log_task_failure(log_entry, "Failed to index Google Drive file", str(e), {"error_type": type(e).__name__}) logger.error(f"Failed to index Google Drive file: {e!s}", exc_info=True) return 0, f"Failed to index Google Drive file: {e!s}" - - -async def _index_full_scan( - drive_client: GoogleDriveClient, - session: AsyncSession, - connector: any, - connector_id: int, - search_space_id: int, - user_id: str, - folder_id: str | None, - folder_name: str, - task_logger: TaskLoggingService, - log_entry: any, - max_files: int, - include_subfolders: bool = False, - on_heartbeat_callback: HeartbeatCallbackType | None = None, - enable_summary: bool = True, -) -> tuple[int, int]: - """Perform full scan indexing of a folder. 
- - Implements 2-phase document status updates for real-time UI feedback: - - Phase 1: Collect all files and create pending documents (visible in UI immediately) - - Phase 2: Process each file: pending → processing → ready/failed - """ - await task_logger.log_task_progress( - log_entry, - f"Starting full scan of folder: {folder_name} (include_subfolders={include_subfolders})", - { - "stage": "full_scan", - "folder_id": folder_id, - "include_subfolders": include_subfolders, - }, - ) - - documents_indexed = 0 - documents_skipped = 0 - documents_failed = 0 - files_processed = 0 - - # Heartbeat tracking - update notification periodically to prevent appearing stuck - last_heartbeat_time = time.time() - - # ======================================================================= - # PHASE 1: Collect all files and create pending documents - # This makes ALL documents visible in the UI immediately with pending status - # ======================================================================= - files_to_process = [] # List of (file, pending_document or None) - new_documents_created = False - - # Queue of folders to process: (folder_id, folder_name) - folders_to_process = [(folder_id, folder_name)] - first_listing_error: str | None = None - - logger.info("Phase 1: Collecting files and creating pending documents") - - while folders_to_process and files_processed < max_files: - current_folder_id, current_folder_name = folders_to_process.pop(0) - logger.info(f"Scanning folder: {current_folder_name} ({current_folder_id})") - page_token = None - - while files_processed < max_files: - # Get files and folders in current folder - files, next_token, error = await get_files_in_folder( - drive_client, - current_folder_id, - include_subfolders=True, - page_token=page_token, - ) - - if error: - logger.error(f"Error listing files in {current_folder_name}: {error}") - if first_listing_error is None: - first_listing_error = error - break - - if not files: - break - - for file in files: - if 
files_processed >= max_files: - break - - mime_type = file.get("mimeType", "") - - # If this is a folder and include_subfolders is enabled, queue it for processing - if mime_type == "application/vnd.google-apps.folder": - if include_subfolders: - folders_to_process.append( - (file["id"], file.get("name", "Unknown")) - ) - logger.debug(f"Queued subfolder: {file.get('name', 'Unknown')}") - continue - - files_processed += 1 - - # Create pending document for this file - pending_doc, should_skip = await _create_pending_document_for_file( - session=session, - file=file, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - ) - - if should_skip: - documents_skipped += 1 - continue - - if pending_doc and pending_doc.id is None: - # New document was created - new_documents_created = True - - files_to_process.append((file, pending_doc)) - - page_token = next_token - if not page_token: - break - - if not files_to_process and first_listing_error: - error_lower = first_listing_error.lower() - if ( - "401" in first_listing_error - or "invalid credentials" in error_lower - or "authError" in first_listing_error - ): - raise Exception( - f"Google Drive authentication failed. Please re-authenticate. 
" - f"(Error: {first_listing_error})" - ) - raise Exception(f"Failed to list Google Drive files: {first_listing_error}") - - # Commit all pending documents - they all appear in UI now - if new_documents_created: - logger.info( - f"Phase 1: Committing {len([f for f in files_to_process if f[1] and f[1].id is None])} pending documents" - ) - await session.commit() - - # ======================================================================= - # PHASE 2: Process each file one by one - # Each document transitions: pending → processing → ready/failed - # ======================================================================= - logger.info(f"Phase 2: Processing {len(files_to_process)} files") - - for file, pending_doc in files_to_process: - # Check if it's time for a heartbeat update - if on_heartbeat_callback: - current_time = time.time() - if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = current_time - - indexed, skipped, failed = await _process_single_file( - drive_client=drive_client, - session=session, - file=file, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - task_logger=task_logger, - log_entry=log_entry, - pending_document=pending_doc, - enable_summary=enable_summary, - ) - - documents_indexed += indexed - documents_skipped += skipped - documents_failed += failed - - if documents_indexed % 10 == 0 and documents_indexed > 0: - await session.commit() - logger.info(f"Committed batch: {documents_indexed} files indexed so far") - - logger.info( - f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped, {documents_failed} failed" - ) - return documents_indexed, documents_skipped - - -async def _index_with_delta_sync( - drive_client: GoogleDriveClient, - session: AsyncSession, - connector: any, - connector_id: int, - search_space_id: int, - user_id: str, - folder_id: str | None, - start_page_token: str, - task_logger: 
TaskLoggingService, - log_entry: any, - max_files: int, - include_subfolders: bool = False, - on_heartbeat_callback: HeartbeatCallbackType | None = None, - enable_summary: bool = True, -) -> tuple[int, int]: - """Perform delta sync indexing using change tracking. - - Note: include_subfolders is accepted for API consistency but delta sync - automatically tracks changes across all folders including subfolders. - - Implements 2-phase document status updates for real-time UI feedback: - - Phase 1: Collect all changes and create pending documents (visible in UI immediately) - - Phase 2: Process each file: pending → processing → ready/failed - """ - await task_logger.log_task_progress( - log_entry, - f"Starting delta sync from token: {start_page_token[:20]}...", - {"stage": "delta_sync", "start_token": start_page_token}, - ) - - changes, _final_token, error = await fetch_all_changes( - drive_client, start_page_token, folder_id - ) - - if error: - logger.error(f"Error fetching changes: {error}") - error_lower = error.lower() - if ( - "401" in error - or "invalid credentials" in error_lower - or "authError" in error - ): - raise Exception( - f"Google Drive authentication failed. Please re-authenticate. 
" - f"(Error: {error})" - ) - raise Exception(f"Failed to fetch Google Drive changes: {error}") - - if not changes: - logger.info("No changes detected since last sync") - return 0, 0 - - logger.info(f"Processing {len(changes)} changes") - - documents_indexed = 0 - documents_skipped = 0 - documents_failed = 0 - files_processed = 0 - - # Heartbeat tracking - update notification periodically to prevent appearing stuck - last_heartbeat_time = time.time() - - # ======================================================================= - # PHASE 1: Analyze changes and create pending documents for new/modified files - # ======================================================================= - changes_to_process = [] # List of (change, file, pending_document or None) - new_documents_created = False - - logger.info("Phase 1: Analyzing changes and creating pending documents") - - for change in changes: - if files_processed >= max_files: - break - - files_processed += 1 - change_type = categorize_change(change) - - if change_type in ["removed", "trashed"]: - file_id = change.get("fileId") - if file_id: - await _remove_document(session, file_id, search_space_id) - continue - - file = change.get("file") - if not file: - continue - - # Create pending document for this file - pending_doc, should_skip = await _create_pending_document_for_file( - session=session, - file=file, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - ) - - if should_skip: - documents_skipped += 1 - continue - - if pending_doc and pending_doc.id is None: - # New document was created - new_documents_created = True - - changes_to_process.append((change, file, pending_doc)) - - # Commit all pending documents - they all appear in UI now - if new_documents_created: - logger.info("Phase 1: Committing pending documents") - await session.commit() - - # ======================================================================= - # PHASE 2: Process each file one by one - # Each document 
transitions: pending → processing → ready/failed - # ======================================================================= - logger.info(f"Phase 2: Processing {len(changes_to_process)} changes") - - for _, file, pending_doc in changes_to_process: - # Check if it's time for a heartbeat update - if on_heartbeat_callback: - current_time = time.time() - if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = current_time - - indexed, skipped, failed = await _process_single_file( - drive_client=drive_client, - session=session, - file=file, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - task_logger=task_logger, - log_entry=log_entry, - pending_document=pending_doc, - enable_summary=enable_summary, - ) - - documents_indexed += indexed - documents_skipped += skipped - documents_failed += failed - - if documents_indexed % 10 == 0 and documents_indexed > 0: - await session.commit() - logger.info(f"Committed batch: {documents_indexed} changes processed") - - logger.info( - f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped, {documents_failed} failed" - ) - return documents_indexed, documents_skipped - - -async def _create_pending_document_for_file( - session: AsyncSession, - file: dict, - connector_id: int, - search_space_id: int, - user_id: str, -) -> tuple[Document | None, bool]: - """ - Create a pending document for a Google Drive file if it doesn't exist. - - This is Phase 1 of the 2-phase document status update pattern. - Creates documents with 'pending' status so they appear in UI immediately. 
- - Args: - session: Database session - file: File metadata from Google Drive API - connector_id: ID of the Drive connector - search_space_id: ID of the search space - user_id: ID of the user - - Returns: - Tuple of (document, should_skip): - - (existing_doc, False): Existing document that needs update - - (new_pending_doc, False): New pending document created - - (None, True): File should be skipped (unchanged, rename-only, or folder) - """ - from app.connectors.google_drive.file_types import should_skip_file - - file_id = file.get("id") - file_name = file.get("name", "Unknown") - mime_type = file.get("mimeType", "") - - # Skip folders and shortcuts - if should_skip_file(mime_type): - return None, True - - if not file_id: - return None, True - - # Generate unique identifier hash for this file - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id - ) - - # Check if document exists (primary hash first, then legacy Composio hash) - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - if not existing_document: - legacy_hash = generate_unique_identifier_hash( - DocumentType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, file_id, search_space_id - ) - existing_document = await check_document_by_unique_identifier( - session, legacy_hash - ) - if existing_document: - existing_document.unique_identifier_hash = unique_identifier_hash - if ( - existing_document.document_type - == DocumentType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR - ): - existing_document.document_type = DocumentType.GOOGLE_DRIVE_FILE - logger.info(f"Migrated legacy Composio document to native type: {file_id}") - - if existing_document: - # Check if this is a rename-only update (content unchanged) - incoming_md5 = file.get("md5Checksum") - incoming_modified_time = file.get("modifiedTime") - doc_metadata = existing_document.document_metadata or {} - stored_md5 = doc_metadata.get("md5_checksum") - stored_modified_time 
= doc_metadata.get("modified_time") - - # Determine if content changed - content_unchanged = False - if incoming_md5 and stored_md5: - content_unchanged = incoming_md5 == stored_md5 - elif not incoming_md5 and incoming_modified_time and stored_modified_time: - # Google Workspace file - use modifiedTime as fallback - content_unchanged = incoming_modified_time == stored_modified_time - - if content_unchanged: - # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state( - existing_document.status, DocumentStatus.READY - ): - existing_document.status = DocumentStatus.ready() - return None, True - - # Content changed - return existing document for update - return existing_document, False - - # Create new pending document - document = Document( - search_space_id=search_space_id, - title=file_name, - document_type=DocumentType.GOOGLE_DRIVE_FILE, - document_metadata={ - "google_drive_file_id": file_id, - "google_drive_file_name": file_name, - "google_drive_mime_type": mime_type, - "connector_id": connector_id, - }, - content="Pending...", # Placeholder until processed - content_hash=unique_identifier_hash, # Temporary unique value - updated when ready - unique_identifier_hash=unique_identifier_hash, - embedding=None, - chunks=[], # Empty at creation - status=DocumentStatus.pending(), # Pending until processing starts - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector_id, - ) - session.add(document) - - return document, False - - -async def _check_rename_only_update( - session: AsyncSession, - file: dict, - search_space_id: int, -) -> tuple[bool, str | None]: - """ - Check if a file only needs a rename update (no content change). - - Uses md5Checksum comparison (preferred) or modifiedTime (fallback for Google Workspace files) - to detect if content has changed. This optimization prevents unnecessary ETL API calls - (Docling/LlamaCloud) for rename-only operations. 
- - Args: - session: Database session - file: File metadata from Google Drive API - search_space_id: ID of the search space - - Returns: - Tuple of (is_rename_only, message) - - (True, message): Only filename changed, document was updated - - (False, None): Content changed or new file, needs full processing - """ - from sqlalchemy import String, cast, select - from sqlalchemy.orm.attributes import flag_modified - - from app.db import Document - - file_id = file.get("id") - file_name = file.get("name", "Unknown") - incoming_md5 = file.get("md5Checksum") # None for Google Workspace files - incoming_modified_time = file.get("modifiedTime") - - if not file_id: - return False, None - - # Try to find existing document by file_id-based hash (primary method) - primary_hash = generate_unique_identifier_hash( - DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id - ) - existing_document = await check_document_by_unique_identifier(session, primary_hash) - - # Fallback: legacy Composio hash - if not existing_document: - legacy_hash = generate_unique_identifier_hash( - DocumentType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, file_id, search_space_id - ) - existing_document = await check_document_by_unique_identifier( - session, legacy_hash - ) - - # Fallback: metadata search (covers old filename-based hashes) - if not existing_document: - result = await session.execute( - select(Document).where( - Document.search_space_id == search_space_id, - Document.document_type.in_( - [ - DocumentType.GOOGLE_DRIVE_FILE, - DocumentType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, - ] - ), - cast(Document.document_metadata["google_drive_file_id"], String) - == file_id, - ) - ) - existing_document = result.scalar_one_or_none() - if existing_document: - logger.debug(f"Found legacy document by metadata for file_id: {file_id}") - - # Migrate legacy Composio document to native type - if existing_document: - if existing_document.unique_identifier_hash != primary_hash: - existing_document.unique_identifier_hash = 
primary_hash - if ( - existing_document.document_type - == DocumentType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR - ): - existing_document.document_type = DocumentType.GOOGLE_DRIVE_FILE - logger.info(f"Migrated legacy Composio Drive document: {file_id}") - - if not existing_document: - # New file, needs full processing - return False, None - - # Get stored checksums/timestamps from document metadata - doc_metadata = existing_document.document_metadata or {} - stored_md5 = doc_metadata.get("md5_checksum") - stored_modified_time = doc_metadata.get("modified_time") - - # Determine if content changed using md5Checksum (preferred) or modifiedTime (fallback) - content_unchanged = False - - if incoming_md5 and stored_md5: - # Best case: Compare md5 checksums (only changes when content changes, not on rename) - content_unchanged = incoming_md5 == stored_md5 - logger.debug(f"MD5 comparison for {file_name}: unchanged={content_unchanged}") - elif incoming_md5 and not stored_md5: - # Have incoming md5 but no stored md5 (legacy doc) - need to reprocess to store it - logger.debug( - f"No stored md5 for {file_name}, will reprocess to store md5_checksum" - ) - return False, None - elif not incoming_md5: - # Google Workspace file (no md5Checksum available) - fall back to modifiedTime - # Note: modifiedTime is less reliable as it changes on rename too, but it's the best we have - if incoming_modified_time and stored_modified_time: - content_unchanged = incoming_modified_time == stored_modified_time - logger.debug( - f"ModifiedTime fallback for Google Workspace file {file_name}: unchanged={content_unchanged}" - ) - else: - # No stored modifiedTime (legacy) - reprocess to store it - return False, None - - if content_unchanged: - # Content hasn't changed - check if filename changed - old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get( - "google_drive_file_name" - ) - - if old_name and old_name != file_name: - # Rename-only update - update the document without re-processing - 
existing_document.title = file_name - if not existing_document.document_metadata: - existing_document.document_metadata = {} - existing_document.document_metadata["FILE_NAME"] = file_name - existing_document.document_metadata["google_drive_file_name"] = file_name - # Also update modified_time for Google Workspace files (since it changed on rename) - if incoming_modified_time: - existing_document.document_metadata["modified_time"] = ( - incoming_modified_time - ) - flag_modified(existing_document, "document_metadata") - await session.commit() - - logger.info( - f"Rename-only update: '{old_name}' → '{file_name}' (skipped ETL)" - ) - return ( - True, - f"File renamed: '{old_name}' → '{file_name}' (no content change)", - ) - else: - # Neither content nor name changed - logger.debug(f"File unchanged: {file_name}") - return True, "File unchanged (same content and name)" - - # Content changed - needs full processing - return False, None - - -async def _process_single_file( - drive_client: GoogleDriveClient, - session: AsyncSession, - file: dict, - connector_id: int, - search_space_id: int, - user_id: str, - task_logger: TaskLoggingService, - log_entry: any, - pending_document: Document | None = None, - enable_summary: bool = True, -) -> tuple[int, int, int]: - """ - Process a single file by downloading and using Surfsense's file processor. - - Implements Phase 2 of the 2-phase document status update pattern. 
- Updates document status: pending → processing → ready/failed - - Args: - drive_client: Google Drive client - session: Database session - file: File metadata from Google Drive API - connector_id: ID of the connector - search_space_id: ID of the search space - user_id: ID of the user - task_logger: Task logging service - log_entry: Log entry for tracking - pending_document: Optional pending document created in Phase 1 - - Returns: - Tuple of (indexed_count, skipped_count, failed_count) - """ - file_name = file.get("name", "Unknown") - mime_type = file.get("mimeType", "") - file_id = file.get("id") - - try: - logger.info(f"Processing file: {file_name} ({mime_type})") - - # Early check: Is this a rename-only update? - # This optimization prevents downloading and ETL processing for files - # where only the name changed but content is the same. - is_rename_only, rename_message = await _check_rename_only_update( - session=session, - file=file, - search_space_id=search_space_id, - ) - - if is_rename_only: - await task_logger.log_task_progress( - log_entry, - f"Skipped ETL for {file_name}: {rename_message}", - {"status": "rename_only", "reason": rename_message}, - ) - # Return 1 for renamed files (they are "indexed" in the sense that they're updated) - # Return 0 for unchanged files - if "renamed" in (rename_message or "").lower(): - return 1, 0, 0 - return 0, 1, 0 - - # Set document to PROCESSING status if we have a pending document - if pending_document: - pending_document.status = DocumentStatus.processing() - await session.commit() - - _, error, _metadata = await download_and_process_file( - client=drive_client, - file=file, - search_space_id=search_space_id, - user_id=user_id, - session=session, - task_logger=task_logger, - log_entry=log_entry, - connector_id=connector_id, - enable_summary=enable_summary, - ) - - if error: - await task_logger.log_task_progress( - log_entry, - f"Skipped {file_name}: {error}", - {"status": "skipped", "reason": error}, - ) - # Mark 
pending document as failed if it exists - if pending_document: - pending_document.status = DocumentStatus.failed(error) - pending_document.updated_at = get_current_timestamp() - await session.commit() - return 0, 1, 0 - - # The document was created/updated by download_and_process_file - # Find the document and ensure it has READY status - if file_id: - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id - ) - processed_doc = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - # Ensure status is READY - if processed_doc and not DocumentStatus.is_state( - processed_doc.status, DocumentStatus.READY - ): - processed_doc.status = DocumentStatus.ready() - processed_doc.updated_at = get_current_timestamp() - await session.commit() - - logger.info(f"Successfully indexed Google Drive file: {file_name}") - return 1, 0, 0 - - except Exception as e: - logger.error(f"Error processing file {file_name}: {e!s}", exc_info=True) - # Mark pending document as failed if it exists - if pending_document: - try: - pending_document.status = DocumentStatus.failed(str(e)) - pending_document.updated_at = get_current_timestamp() - await session.commit() - except Exception as status_error: - logger.error( - f"Failed to update document status to failed: {status_error}" - ) - return 0, 0, 1 - - -async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int): - """Remove a document that was deleted in Drive. - - Handles both new (file_id-based) and legacy (filename-based) hash schemes. 
- """ - from sqlalchemy import String, cast, select - - from app.db import Document - - # First try with file_id-based hash (new method) - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id - ) - - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - # Fallback: legacy Composio hash - if not existing_document: - legacy_hash = generate_unique_identifier_hash( - DocumentType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, file_id, search_space_id - ) - existing_document = await check_document_by_unique_identifier( - session, legacy_hash - ) - - # Fallback: metadata search (covers old filename-based hashes, both native and Composio) - if not existing_document: - result = await session.execute( - select(Document).where( - Document.search_space_id == search_space_id, - Document.document_type.in_( - [ - DocumentType.GOOGLE_DRIVE_FILE, - DocumentType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, - ] - ), - cast(Document.document_metadata["google_drive_file_id"], String) - == file_id, - ) - ) - existing_document = result.scalar_one_or_none() - if existing_document: - logger.info(f"Found legacy document by metadata for file_id: {file_id}") - - if existing_document: - await session.delete(existing_document) - logger.info(f"Removed deleted file document: {file_id}") diff --git a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py index 384ad85e2..96cc1cbb4 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py @@ -1,11 +1,11 @@ """ Google Gmail connector indexer. 
-Implements 2-phase document status updates for real-time UI feedback: -- Phase 1: Create all documents with 'pending' status (visible in UI immediately) -- Phase 2: Process each document: pending → processing → ready/failed +Uses the shared IndexingPipelineService for document deduplication, +summarization, chunking, and embedding. """ +import logging import time from collections.abc import Awaitable, Callable from datetime import datetime @@ -15,21 +15,15 @@ from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from app.connectors.google_gmail_connector import GoogleGmailConnector -from app.db import ( - Document, - DocumentStatus, - DocumentType, - SearchSourceConnectorType, +from app.db import DocumentType, SearchSourceConnectorType +from app.indexing_pipeline.connector_document import ConnectorDocument +from app.indexing_pipeline.document_hashing import ( + compute_content_hash, + compute_unique_identifier_hash, ) +from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService -from app.utils.document_converters import ( - create_document_chunks, - embed_text, - generate_content_hash, - generate_document_summary, - generate_unique_identifier_hash, -) from app.utils.google_credentials import ( COMPOSIO_GOOGLE_CONNECTOR_TYPES, build_composio_credentials, @@ -37,12 +31,9 @@ from app.utils.google_credentials import ( from .base import ( calculate_date_range, - check_document_by_unique_identifier, check_duplicate_document_by_hash, get_connector_by_id, - get_current_timestamp, logger, - safe_set_chunks, update_connector_last_indexed, ) @@ -51,13 +42,70 @@ ACCEPTED_GMAIL_CONNECTOR_TYPES = { SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR, } -# Type hint for heartbeat callback HeartbeatCallbackType = Callable[[int], Awaitable[None]] - -# Heartbeat interval in seconds 
HEARTBEAT_INTERVAL_SECONDS = 30 +def _build_connector_doc( + message: dict, + markdown_content: str, + *, + connector_id: int, + search_space_id: int, + user_id: str, + enable_summary: bool, +) -> ConnectorDocument: + """Map a raw Gmail API message dict to a ConnectorDocument.""" + message_id = message.get("id", "") + thread_id = message.get("threadId", "") + payload = message.get("payload", {}) + headers = payload.get("headers", []) + + subject = "No Subject" + sender = "Unknown Sender" + date_str = "Unknown Date" + + for header in headers: + name = header.get("name", "").lower() + value = header.get("value", "") + if name == "subject": + subject = value + elif name == "from": + sender = value + elif name == "date": + date_str = value + + metadata = { + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "date": date_str, + "connector_id": connector_id, + "document_type": "Gmail Message", + "connector_type": "Google Gmail", + } + + fallback_summary = ( + f"Google Gmail Message: {subject}\n\n" + f"From: {sender}\nDate: {date_str}\n\n" + f"{markdown_content}" + ) + + return ConnectorDocument( + title=subject, + source_markdown=markdown_content, + unique_id=message_id, + document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR, + search_space_id=search_space_id, + connector_id=connector_id, + created_by_id=user_id, + should_summarize=enable_summary, + fallback_summary=fallback_summary, + metadata=metadata, + ) + + async def index_google_gmail_messages( session: AsyncSession, connector_id: int, @@ -80,7 +128,7 @@ async def index_google_gmail_messages( start_date: Start date for filtering messages (YYYY-MM-DD format) end_date: End date for filtering messages (YYYY-MM-DD format) update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) - max_messages: Maximum number of messages to fetch (default: 100) + max_messages: Maximum number of messages to fetch (default: 1000) on_heartbeat_callback: Optional 
callback to update notification during long-running indexing. Returns: @@ -88,7 +136,6 @@ async def index_google_gmail_messages( """ task_logger = TaskLoggingService(session, search_space_id) - # Log task start log_entry = await task_logger.log_task_start( task_name="google_gmail_messages_indexing", source="connector_indexing_task", @@ -103,7 +150,7 @@ async def index_google_gmail_messages( ) try: - # Accept both native and Composio Gmail connectors + # ── Connector lookup ────────────────────────────────────────── connector = None for ct in ACCEPTED_GMAIL_CONNECTOR_TYPES: connector = await get_connector_by_id(session, connector_id, ct) @@ -117,7 +164,7 @@ async def index_google_gmail_messages( ) return 0, 0, error_msg - # Build credentials based on connector type + # ── Credential building ─────────────────────────────────────── if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES: connected_account_id = connector.config.get("composio_connected_account_id") if not connected_account_id: @@ -189,6 +236,7 @@ async def index_google_gmail_messages( ) return 0, 0, "Google gmail credentials not found in connector config" + # ── Gmail client init ───────────────────────────────────────── await task_logger.log_task_progress( log_entry, f"Initializing Google gmail client for connector {connector_id}", @@ -199,14 +247,11 @@ async def index_google_gmail_messages( credentials, session, user_id, connector_id ) - # Calculate date range using last_indexed_at if dates not provided - # This ensures Gmail uses the same date logic as other connectors - # (uses last_indexed_at → now, or 365 days back for first-time indexing) calculated_start_date, calculated_end_date = calculate_date_range( connector, start_date, end_date, default_days_back=365 ) - # Fetch recent Google gmail messages + # ── Fetch messages ──────────────────────────────────────────── logger.info( f"Fetching emails for connector {connector_id} " f"from {calculated_start_date} to {calculated_end_date}" @@ 
-218,7 +263,6 @@ async def index_google_gmail_messages( ) if error: - # Check if this is an authentication error that requires re-authentication error_message = error error_type = "APIError" if ( @@ -243,263 +287,92 @@ async def index_google_gmail_messages( logger.info(f"Found {len(messages)} Google gmail messages to index") - documents_indexed = 0 + # ── Build ConnectorDocuments ────────────────────────────────── + connector_docs: list[ConnectorDocument] = [] documents_skipped = 0 - documents_failed = 0 # Track messages that failed processing - duplicate_content_count = ( - 0 # Track messages skipped due to duplicate content_hash - ) - - # Heartbeat tracking - update notification periodically to prevent appearing stuck - last_heartbeat_time = time.time() - - # ======================================================================= - # PHASE 1: Analyze all messages, create pending documents - # This makes ALL documents visible in the UI immediately with pending status - # ======================================================================= - messages_to_process = [] # List of dicts with document and message data - new_documents_created = False + duplicate_content_count = 0 for message in messages: try: - # Extract message information message_id = message.get("id", "") - thread_id = message.get("threadId", "") - - # Extract headers for subject and sender - payload = message.get("payload", {}) - headers = payload.get("headers", []) - - subject = "No Subject" - sender = "Unknown Sender" - date_str = "Unknown Date" - - for header in headers: - name = header.get("name", "").lower() - value = header.get("value", "") - if name == "subject": - subject = value - elif name == "from": - sender = value - elif name == "date": - date_str = value - if not message_id: - logger.warning(f"Skipping message with missing ID: {subject}") + logger.warning("Skipping message with missing ID") documents_skipped += 1 continue - # Format message to markdown markdown_content = 
gmail_connector.format_message_to_markdown(message) - if not markdown_content.strip(): - logger.warning(f"Skipping message with no content: {subject}") + logger.warning(f"Skipping message with no content: {message_id}") documents_skipped += 1 continue - # Generate unique identifier hash for this Gmail message - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.GOOGLE_GMAIL_CONNECTOR, message_id, search_space_id + doc = _build_connector_doc( + message, + markdown_content, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + enable_summary=connector.enable_summary, ) - # Generate content hash - content_hash = generate_content_hash(markdown_content, search_space_id) - - # Check if document with this unique identifier already exists - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - # Fallback: legacy Composio hash - if not existing_document: - legacy_hash = generate_unique_identifier_hash( - DocumentType.COMPOSIO_GMAIL_CONNECTOR, - message_id, - search_space_id, - ) - existing_document = await check_document_by_unique_identifier( - session, legacy_hash - ) - if existing_document: - existing_document.unique_identifier_hash = ( - unique_identifier_hash - ) - if ( - existing_document.document_type - == DocumentType.COMPOSIO_GMAIL_CONNECTOR - ): - existing_document.document_type = ( - DocumentType.GOOGLE_GMAIL_CONNECTOR - ) - logger.info( - f"Migrated legacy Composio Gmail document: {message_id}" - ) - - if existing_document: - # Document exists - check if content has changed - if existing_document.content_hash == content_hash: - # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state( - existing_document.status, DocumentStatus.READY - ): - existing_document.status = DocumentStatus.ready() - documents_skipped += 1 - continue - - # Queue existing document for update (will be set to processing in Phase 2) - 
messages_to_process.append( - { - "document": existing_document, - "is_new": False, - "markdown_content": markdown_content, - "content_hash": content_hash, - "message_id": message_id, - "thread_id": thread_id, - "subject": subject, - "sender": sender, - "date_str": date_str, - } - ) - continue - - # Document doesn't exist by unique_identifier_hash - # Check if a document with the same content_hash exists (from another connector) with session.no_autoflush: - duplicate_by_content = await check_duplicate_document_by_hash( - session, content_hash + duplicate = await check_duplicate_document_by_hash( + session, compute_content_hash(doc) ) - - if duplicate_by_content: + if duplicate: logger.info( - f"Gmail message {subject} already indexed by another connector " - f"(existing document ID: {duplicate_by_content.id}, " - f"type: {duplicate_by_content.document_type}). Skipping." + f"Gmail message {doc.title} already indexed by another connector " + f"(existing document ID: {duplicate.id}, " + f"type: {duplicate.document_type}). Skipping." 
) duplicate_content_count += 1 documents_skipped += 1 continue - # Create new document with PENDING status (visible in UI immediately) - document = Document( - search_space_id=search_space_id, - title=subject, - document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR, - document_metadata={ - "message_id": message_id, - "thread_id": thread_id, - "subject": subject, - "sender": sender, - "date": date_str, - "connector_id": connector_id, - }, - content="Pending...", # Placeholder until processed - content_hash=unique_identifier_hash, # Temporary unique value - updated when ready - unique_identifier_hash=unique_identifier_hash, - embedding=None, - chunks=[], # Empty at creation - safe for async - status=DocumentStatus.pending(), # Pending until processing starts - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector_id, - ) - session.add(document) - new_documents_created = True - - messages_to_process.append( - { - "document": document, - "is_new": True, - "markdown_content": markdown_content, - "content_hash": content_hash, - "message_id": message_id, - "thread_id": thread_id, - "subject": subject, - "sender": sender, - "date_str": date_str, - } - ) + connector_docs.append(doc) except Exception as e: - logger.error(f"Error in Phase 1 for message: {e!s}", exc_info=True) - documents_failed += 1 + logger.error(f"Error building ConnectorDocument for message: {e!s}", exc_info=True) + documents_skipped += 1 continue - # Commit all pending documents - they all appear in UI now - if new_documents_created: - logger.info( - f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents" - ) - await session.commit() + # ── Pipeline: migrate legacy docs + prepare + index ─────────── + pipeline = IndexingPipelineService(session) - # ======================================================================= - # PHASE 2: Process each document one by one - # Each document transitions: pending → processing → ready/failed - # 
======================================================================= - logger.info(f"Phase 2: Processing {len(messages_to_process)} documents") + await pipeline.migrate_legacy_docs(connector_docs) - for item in messages_to_process: - # Send heartbeat periodically + documents = await pipeline.prepare_for_indexing(connector_docs) + + doc_map = { + compute_unique_identifier_hash(cd): cd for cd in connector_docs + } + + documents_indexed = 0 + documents_failed = 0 + last_heartbeat_time = time.time() + + for document in documents: if on_heartbeat_callback: current_time = time.time() if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: await on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item["document"] - try: - # Set to PROCESSING and commit - shows "processing" in UI for THIS document only - document.status = DocumentStatus.processing() - await session.commit() + connector_doc = doc_map.get(document.unique_identifier_hash) + if connector_doc is None: + logger.warning( + f"No matching ConnectorDocument for document {document.id}, skipping" + ) + documents_failed += 1 + continue - # Heavy processing (LLM, embeddings, chunks) + try: user_llm = await get_user_long_context_llm( session, user_id, search_space_id ) - - if user_llm and connector.enable_summary: - document_metadata_for_summary = { - "message_id": item["message_id"], - "thread_id": item["thread_id"], - "subject": item["subject"], - "sender": item["sender"], - "date": item["date_str"], - "document_type": "Gmail Message", - "connector_type": "Google Gmail", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - item["markdown_content"], - user_llm, - document_metadata_for_summary, - ) - else: - summary_content = f"Google Gmail Message: {item['subject']}\n\nFrom: {item['sender']}\nDate: {item['date_str']}\n\n{item['markdown_content']}" - summary_embedding = embed_text(summary_content) - - chunks = await 
create_document_chunks(item["markdown_content"]) - - # Update document to READY with actual content - document.title = item["subject"] - document.content = summary_content - document.content_hash = item["content_hash"] - document.embedding = summary_embedding - document.document_metadata = { - "message_id": item["message_id"], - "thread_id": item["thread_id"], - "subject": item["subject"], - "sender": item["sender"], - "date": item["date_str"], - "connector_id": connector_id, - } - await safe_set_chunks(session, document, chunks) - document.updated_at = get_current_timestamp() - document.status = DocumentStatus.ready() - + await pipeline.index(document, connector_doc, user_llm) documents_indexed += 1 - # Batch commit every 10 documents (for ready status updates) if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} Gmail messages processed so far" @@ -508,21 +381,12 @@ async def index_google_gmail_messages( except Exception as e: logger.error(f"Error processing Gmail message: {e!s}", exc_info=True) - # Mark document as failed with reason (visible in UI) - try: - document.status = DocumentStatus.failed(str(e)) - document.updated_at = get_current_timestamp() - except Exception as status_error: - logger.error( - f"Failed to update document status to failed: {status_error}" - ) documents_failed += 1 continue - # CRITICAL: Always update timestamp (even if 0 documents indexed) so Zero syncs + # ── Finalize ────────────────────────────────────────────────── await update_connector_last_indexed(session, connector, update_last_indexed) - # Final commit for any remaining documents not yet committed in batches logger.info(f"Final commit: Total {documents_indexed} Gmail messages processed") try: await session.commit() @@ -530,22 +394,18 @@ async def index_google_gmail_messages( "Successfully committed all Google Gmail document changes to database" ) except Exception as e: - # Handle any remaining integrity errors gracefully (race conditions, etc.) 
if ( "duplicate key value violates unique constraint" in str(e).lower() or "uniqueviolationerror" in str(e).lower() ): logger.warning( f"Duplicate content_hash detected during final commit. " - f"This may occur if the same message was indexed by multiple connectors. " f"Rolling back and continuing. Error: {e!s}" ) await session.rollback() - # Don't fail the entire task - some documents may have been successfully indexed else: raise - # Build warning message if there were issues warning_parts = [] if duplicate_content_count > 0: warning_parts.append(f"{duplicate_content_count} duplicate") @@ -555,7 +415,6 @@ async def index_google_gmail_messages( total_processed = documents_indexed - # Log success await task_logger.log_task_success( log_entry, f"Successfully completed Google Gmail indexing for connector {connector_id}", diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_document_hashing.py b/surfsense_backend/tests/unit/indexing_pipeline/test_document_hashing.py index fe536b066..d04d8b048 100644 --- a/surfsense_backend/tests/unit/indexing_pipeline/test_document_hashing.py +++ b/surfsense_backend/tests/unit/indexing_pipeline/test_document_hashing.py @@ -3,6 +3,7 @@ import pytest from app.db import DocumentType from app.indexing_pipeline.document_hashing import ( compute_content_hash, + compute_identifier_hash, compute_unique_identifier_hash, ) @@ -61,3 +62,23 @@ def test_different_content_produces_different_content_hash(make_connector_docume doc_a = make_connector_document(source_markdown="Original content") doc_b = make_connector_document(source_markdown="Updated content") assert compute_content_hash(doc_a) != compute_content_hash(doc_b) + + +def test_compute_identifier_hash_matches_connector_doc_hash(make_connector_document): + """Raw-args hash equals ConnectorDocument hash for equivalent inputs.""" + doc = make_connector_document( + document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR, + unique_id="msg-123", + search_space_id=5, + ) + raw_hash = 
compute_identifier_hash("GOOGLE_GMAIL_CONNECTOR", "msg-123", 5) + assert raw_hash == compute_unique_identifier_hash(doc) + + +def test_compute_identifier_hash_differs_for_different_inputs(): + """Different arguments produce different hashes.""" + h1 = compute_identifier_hash("GOOGLE_DRIVE_FILE", "file-1", 1) + h2 = compute_identifier_hash("GOOGLE_DRIVE_FILE", "file-2", 1) + h3 = compute_identifier_hash("GOOGLE_DRIVE_FILE", "file-1", 2) + h4 = compute_identifier_hash("COMPOSIO_GOOGLE_DRIVE_CONNECTOR", "file-1", 1) + assert len({h1, h2, h3, h4}) == 4 From 8c41fd91bafc01347e4ab35da3cd79c9c4e7b104 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Wed, 25 Mar 2026 18:34:02 +0530 Subject: [PATCH 02/71] feat: add integration tests for indexing pipeline components - Introduced integration tests for Calendar, Drive, and Gmail indexers to ensure proper document creation and migration. - Added tests for batch indexing functionality to validate the processing of multiple documents. - Implemented tests for legacy document migration to verify updates to document types and hashes. - Enhanced test coverage for the IndexingPipelineService to ensure robust functionality across various document types. 
--- .../test_calendar_pipeline.py | 111 +++++++++++++++ .../indexing_pipeline/test_drive_pipeline.py | 110 +++++++++++++++ .../indexing_pipeline/test_gmail_pipeline.py | 116 ++++++++++++++++ .../indexing_pipeline/test_index_batch.py | 55 ++++++++ .../test_migrate_legacy_docs.py | 92 +++++++++++++ .../indexing_pipeline/test_index_batch.py | 82 +++++++++++ .../test_migrate_legacy_docs.py | 127 ++++++++++++++++++ 7 files changed, 693 insertions(+) create mode 100644 surfsense_backend/tests/integration/indexing_pipeline/test_calendar_pipeline.py create mode 100644 surfsense_backend/tests/integration/indexing_pipeline/test_drive_pipeline.py create mode 100644 surfsense_backend/tests/integration/indexing_pipeline/test_gmail_pipeline.py create mode 100644 surfsense_backend/tests/integration/indexing_pipeline/test_index_batch.py create mode 100644 surfsense_backend/tests/integration/indexing_pipeline/test_migrate_legacy_docs.py create mode 100644 surfsense_backend/tests/unit/indexing_pipeline/test_index_batch.py create mode 100644 surfsense_backend/tests/unit/indexing_pipeline/test_migrate_legacy_docs.py diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_calendar_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_calendar_pipeline.py new file mode 100644 index 000000000..6a60c5cc1 --- /dev/null +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_calendar_pipeline.py @@ -0,0 +1,111 @@ +"""Integration tests: Calendar indexer builds ConnectorDocuments that flow through the pipeline.""" + +import pytest +from sqlalchemy import select + +from app.config import config as app_config +from app.db import Document, DocumentStatus, DocumentType +from app.indexing_pipeline.connector_document import ConnectorDocument +from app.indexing_pipeline.document_hashing import compute_identifier_hash +from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService + +_EMBEDDING_DIM = 
app_config.embedding_model_instance.dimension + +pytestmark = pytest.mark.integration + + +def _cal_doc(*, unique_id: str, search_space_id: int, connector_id: int, user_id: str) -> ConnectorDocument: + return ConnectorDocument( + title=f"Event {unique_id}", + source_markdown=f"## Calendar Event\n\nDetails for {unique_id}", + unique_id=unique_id, + document_type=DocumentType.GOOGLE_CALENDAR_CONNECTOR, + search_space_id=search_space_id, + connector_id=connector_id, + created_by_id=user_id, + should_summarize=True, + fallback_summary=f"Calendar: Event {unique_id}", + metadata={ + "event_id": unique_id, + "start_time": "2025-01-15T10:00:00", + "end_time": "2025-01-15T11:00:00", + "document_type": "Google Calendar Event", + }, + ) + + +@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts", "patched_chunk_text") +async def test_calendar_pipeline_creates_ready_document( + db_session, db_search_space, db_connector, db_user, mocker +): + """A Calendar ConnectorDocument flows through prepare + index to a READY document.""" + space_id = db_search_space.id + doc = _cal_doc( + unique_id="evt-1", + search_space_id=space_id, + connector_id=db_connector.id, + user_id=str(db_user.id), + ) + + service = IndexingPipelineService(session=db_session) + prepared = await service.prepare_for_indexing([doc]) + assert len(prepared) == 1 + + await service.index(prepared[0], doc, llm=mocker.Mock()) + + result = await db_session.execute( + select(Document).filter(Document.search_space_id == space_id) + ) + row = result.scalars().first() + + assert row is not None + assert row.document_type == DocumentType.GOOGLE_CALENDAR_CONNECTOR + assert DocumentStatus.is_state(row.status, DocumentStatus.READY) + + +@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts", "patched_chunk_text") +async def test_calendar_legacy_doc_migrated( + db_session, db_search_space, db_connector, db_user, mocker +): + """A legacy Composio Calendar doc is migrated and reused.""" + space_id = 
db_search_space.id + user_id = str(db_user.id) + evt_id = "evt-legacy-cal" + + legacy_hash = compute_identifier_hash( + DocumentType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR.value, evt_id, space_id + ) + legacy_doc = Document( + title="Old Calendar Event", + document_type=DocumentType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, + content="old summary", + content_hash=f"ch-{legacy_hash[:12]}", + unique_identifier_hash=legacy_hash, + source_markdown="## Old event", + search_space_id=space_id, + created_by_id=user_id, + embedding=[0.1] * _EMBEDDING_DIM, + status={"state": "ready"}, + ) + db_session.add(legacy_doc) + await db_session.flush() + original_id = legacy_doc.id + + connector_doc = _cal_doc( + unique_id=evt_id, + search_space_id=space_id, + connector_id=db_connector.id, + user_id=user_id, + ) + + service = IndexingPipelineService(session=db_session) + await service.migrate_legacy_docs([connector_doc]) + + result = await db_session.execute(select(Document).filter(Document.id == original_id)) + row = result.scalars().first() + + assert row.document_type == DocumentType.GOOGLE_CALENDAR_CONNECTOR + native_hash = compute_identifier_hash( + DocumentType.GOOGLE_CALENDAR_CONNECTOR.value, evt_id, space_id + ) + assert row.unique_identifier_hash == native_hash diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_drive_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_drive_pipeline.py new file mode 100644 index 000000000..32af0b8c1 --- /dev/null +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_drive_pipeline.py @@ -0,0 +1,110 @@ +"""Integration tests: Drive indexer builds ConnectorDocuments that flow through the pipeline.""" + +import pytest +from sqlalchemy import select + +from app.config import config as app_config +from app.db import Document, DocumentStatus, DocumentType +from app.indexing_pipeline.connector_document import ConnectorDocument +from app.indexing_pipeline.document_hashing import compute_identifier_hash +from 
app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService + +_EMBEDDING_DIM = app_config.embedding_model_instance.dimension + +pytestmark = pytest.mark.integration + + +def _drive_doc(*, unique_id: str, search_space_id: int, connector_id: int, user_id: str) -> ConnectorDocument: + return ConnectorDocument( + title=f"File {unique_id}.pdf", + source_markdown=f"## Document Content\n\nText from file {unique_id}", + unique_id=unique_id, + document_type=DocumentType.GOOGLE_DRIVE_FILE, + search_space_id=search_space_id, + connector_id=connector_id, + created_by_id=user_id, + should_summarize=True, + fallback_summary=f"File: {unique_id}.pdf", + metadata={ + "google_drive_file_id": unique_id, + "google_drive_file_name": f"{unique_id}.pdf", + "document_type": "Google Drive File", + }, + ) + + +@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts", "patched_chunk_text") +async def test_drive_pipeline_creates_ready_document( + db_session, db_search_space, db_connector, db_user, mocker +): + """A Drive ConnectorDocument flows through prepare + index to a READY document.""" + space_id = db_search_space.id + doc = _drive_doc( + unique_id="file-abc", + search_space_id=space_id, + connector_id=db_connector.id, + user_id=str(db_user.id), + ) + + service = IndexingPipelineService(session=db_session) + prepared = await service.prepare_for_indexing([doc]) + assert len(prepared) == 1 + + await service.index(prepared[0], doc, llm=mocker.Mock()) + + result = await db_session.execute( + select(Document).filter(Document.search_space_id == space_id) + ) + row = result.scalars().first() + + assert row is not None + assert row.document_type == DocumentType.GOOGLE_DRIVE_FILE + assert DocumentStatus.is_state(row.status, DocumentStatus.READY) + + +@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts", "patched_chunk_text") +async def test_drive_legacy_doc_migrated( + db_session, db_search_space, db_connector, db_user, mocker +): + """A legacy 
Composio Drive doc is migrated and reused.""" + space_id = db_search_space.id + user_id = str(db_user.id) + file_id = "file-legacy-drive" + + legacy_hash = compute_identifier_hash( + DocumentType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR.value, file_id, space_id + ) + legacy_doc = Document( + title="Old Drive File", + document_type=DocumentType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR, + content="old file summary", + content_hash=f"ch-{legacy_hash[:12]}", + unique_identifier_hash=legacy_hash, + source_markdown="## Old file content", + search_space_id=space_id, + created_by_id=user_id, + embedding=[0.1] * _EMBEDDING_DIM, + status={"state": "ready"}, + ) + db_session.add(legacy_doc) + await db_session.flush() + original_id = legacy_doc.id + + connector_doc = _drive_doc( + unique_id=file_id, + search_space_id=space_id, + connector_id=db_connector.id, + user_id=user_id, + ) + + service = IndexingPipelineService(session=db_session) + await service.migrate_legacy_docs([connector_doc]) + + result = await db_session.execute(select(Document).filter(Document.id == original_id)) + row = result.scalars().first() + + assert row.document_type == DocumentType.GOOGLE_DRIVE_FILE + native_hash = compute_identifier_hash( + DocumentType.GOOGLE_DRIVE_FILE.value, file_id, space_id + ) + assert row.unique_identifier_hash == native_hash diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_gmail_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_gmail_pipeline.py new file mode 100644 index 000000000..d67420cb7 --- /dev/null +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_gmail_pipeline.py @@ -0,0 +1,116 @@ +"""Integration tests: Gmail indexer builds ConnectorDocuments that flow through the pipeline.""" + +import pytest +from sqlalchemy import select + +from app.config import config as app_config +from app.db import Document, DocumentStatus, DocumentType +from app.indexing_pipeline.connector_document import ConnectorDocument +from 
app.indexing_pipeline.document_hashing import ( + compute_identifier_hash, + compute_unique_identifier_hash, +) +from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService + +_EMBEDDING_DIM = app_config.embedding_model_instance.dimension + +pytestmark = pytest.mark.integration + + +def _gmail_doc(*, unique_id: str, search_space_id: int, connector_id: int, user_id: str) -> ConnectorDocument: + """Build a Gmail-style ConnectorDocument like the real indexer does.""" + return ConnectorDocument( + title=f"Subject for {unique_id}", + source_markdown=f"## Email\n\nBody of {unique_id}", + unique_id=unique_id, + document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR, + search_space_id=search_space_id, + connector_id=connector_id, + created_by_id=user_id, + should_summarize=True, + fallback_summary=f"Gmail: Subject for {unique_id}", + metadata={ + "message_id": unique_id, + "from": "sender@example.com", + "document_type": "Gmail Message", + }, + ) + + +@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts", "patched_chunk_text") +async def test_gmail_pipeline_creates_ready_document( + db_session, db_search_space, db_connector, db_user, mocker +): + """A Gmail ConnectorDocument flows through prepare + index to a READY document.""" + space_id = db_search_space.id + doc = _gmail_doc( + unique_id="msg-pipeline-1", + search_space_id=space_id, + connector_id=db_connector.id, + user_id=str(db_user.id), + ) + + service = IndexingPipelineService(session=db_session) + prepared = await service.prepare_for_indexing([doc]) + assert len(prepared) == 1 + + await service.index(prepared[0], doc, llm=mocker.Mock()) + + result = await db_session.execute( + select(Document).filter(Document.search_space_id == space_id) + ) + row = result.scalars().first() + + assert row is not None + assert row.document_type == DocumentType.GOOGLE_GMAIL_CONNECTOR + assert DocumentStatus.is_state(row.status, DocumentStatus.READY) + assert row.source_markdown == doc.source_markdown 
+ + +@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts", "patched_chunk_text") +async def test_gmail_legacy_doc_migrated_then_reused( + db_session, db_search_space, db_connector, db_user, mocker +): + """A legacy Composio Gmail doc is migrated then reused by the pipeline.""" + space_id = db_search_space.id + user_id = str(db_user.id) + msg_id = "msg-legacy-gmail" + + legacy_hash = compute_identifier_hash( + DocumentType.COMPOSIO_GMAIL_CONNECTOR.value, msg_id, space_id + ) + legacy_doc = Document( + title="Old Gmail", + document_type=DocumentType.COMPOSIO_GMAIL_CONNECTOR, + content="old summary", + content_hash=f"ch-{legacy_hash[:12]}", + unique_identifier_hash=legacy_hash, + source_markdown="## Old content", + search_space_id=space_id, + created_by_id=user_id, + embedding=[0.1] * _EMBEDDING_DIM, + status={"state": "ready"}, + ) + db_session.add(legacy_doc) + await db_session.flush() + original_id = legacy_doc.id + + connector_doc = _gmail_doc( + unique_id=msg_id, + search_space_id=space_id, + connector_id=db_connector.id, + user_id=user_id, + ) + + service = IndexingPipelineService(session=db_session) + await service.migrate_legacy_docs([connector_doc]) + + prepared = await service.prepare_for_indexing([connector_doc]) + assert len(prepared) == 1 + assert prepared[0].id == original_id + assert prepared[0].document_type == DocumentType.GOOGLE_GMAIL_CONNECTOR + + native_hash = compute_identifier_hash( + DocumentType.GOOGLE_GMAIL_CONNECTOR.value, msg_id, space_id + ) + assert prepared[0].unique_identifier_hash == native_hash diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_index_batch.py b/surfsense_backend/tests/integration/indexing_pipeline/test_index_batch.py new file mode 100644 index 000000000..a40498769 --- /dev/null +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_index_batch.py @@ -0,0 +1,55 @@ +"""Integration tests for IndexingPipelineService.index_batch().""" + +import pytest +from sqlalchemy import 
select + +from app.db import Document, DocumentStatus, DocumentType +from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService + +pytestmark = pytest.mark.integration + + +@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts", "patched_chunk_text") +async def test_index_batch_creates_ready_documents( + db_session, db_search_space, make_connector_document, mocker +): + """index_batch prepares and indexes a batch, resulting in READY documents.""" + space_id = db_search_space.id + docs = [ + make_connector_document( + document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR, + unique_id="msg-batch-1", + search_space_id=space_id, + source_markdown="## Email 1\n\nBody", + ), + make_connector_document( + document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR, + unique_id="msg-batch-2", + search_space_id=space_id, + source_markdown="## Email 2\n\nDifferent body", + ), + ] + + service = IndexingPipelineService(session=db_session) + results = await service.index_batch(docs, llm=mocker.Mock()) + + assert len(results) == 2 + + result = await db_session.execute( + select(Document).filter(Document.search_space_id == space_id) + ) + rows = result.scalars().all() + assert len(rows) == 2 + + for row in rows: + assert DocumentStatus.is_state(row.status, DocumentStatus.READY) + assert row.content is not None + assert row.embedding is not None + + +@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts", "patched_chunk_text") +async def test_index_batch_empty_returns_empty(db_session, mocker): + """index_batch with empty input returns an empty list.""" + service = IndexingPipelineService(session=db_session) + results = await service.index_batch([], llm=mocker.Mock()) + assert results == [] diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_migrate_legacy_docs.py b/surfsense_backend/tests/integration/indexing_pipeline/test_migrate_legacy_docs.py new file mode 100644 index 000000000..8fc0e7586 --- /dev/null +++ 
b/surfsense_backend/tests/integration/indexing_pipeline/test_migrate_legacy_docs.py @@ -0,0 +1,92 @@ +"""Integration tests for IndexingPipelineService.migrate_legacy_docs().""" + +import pytest +from sqlalchemy import select + +from app.config import config as app_config +from app.db import Document, DocumentType +from app.indexing_pipeline.document_hashing import compute_identifier_hash +from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService + +_EMBEDDING_DIM = app_config.embedding_model_instance.dimension + +pytestmark = pytest.mark.integration + + +async def test_legacy_composio_gmail_doc_migrated_in_db( + db_session, db_search_space, db_user, make_connector_document +): + """A Composio Gmail doc in the DB gets its hash and type updated to native.""" + space_id = db_search_space.id + user_id = str(db_user.id) + unique_id = "msg-legacy-123" + + legacy_hash = compute_identifier_hash( + DocumentType.COMPOSIO_GMAIL_CONNECTOR.value, unique_id, space_id + ) + native_hash = compute_identifier_hash( + DocumentType.GOOGLE_GMAIL_CONNECTOR.value, unique_id, space_id + ) + + legacy_doc = Document( + title="Old Gmail", + document_type=DocumentType.COMPOSIO_GMAIL_CONNECTOR, + content="legacy content", + content_hash=f"ch-{legacy_hash[:12]}", + unique_identifier_hash=legacy_hash, + search_space_id=space_id, + created_by_id=user_id, + embedding=[0.1] * _EMBEDDING_DIM, + status={"state": "ready"}, + ) + db_session.add(legacy_doc) + await db_session.flush() + doc_id = legacy_doc.id + + connector_doc = make_connector_document( + document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR, + unique_id=unique_id, + search_space_id=space_id, + ) + + service = IndexingPipelineService(session=db_session) + await service.migrate_legacy_docs([connector_doc]) + + result = await db_session.execute(select(Document).filter(Document.id == doc_id)) + reloaded = result.scalars().first() + + assert reloaded.unique_identifier_hash == native_hash + assert reloaded.document_type 
== DocumentType.GOOGLE_GMAIL_CONNECTOR + + +async def test_no_legacy_doc_is_noop( + db_session, db_search_space, make_connector_document +): + """When no legacy document exists, migrate_legacy_docs does nothing.""" + connector_doc = make_connector_document( + document_type=DocumentType.GOOGLE_CALENDAR_CONNECTOR, + unique_id="evt-no-legacy", + search_space_id=db_search_space.id, + ) + + service = IndexingPipelineService(session=db_session) + await service.migrate_legacy_docs([connector_doc]) + + result = await db_session.execute( + select(Document).filter(Document.search_space_id == db_search_space.id) + ) + assert result.scalars().all() == [] + + +async def test_non_google_type_is_skipped( + db_session, db_search_space, make_connector_document +): + """migrate_legacy_docs skips ConnectorDocuments that are not Google types.""" + connector_doc = make_connector_document( + document_type=DocumentType.CLICKUP_CONNECTOR, + unique_id="task-1", + search_space_id=db_search_space.id, + ) + + service = IndexingPipelineService(session=db_session) + await service.migrate_legacy_docs([connector_doc]) diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch.py b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch.py new file mode 100644 index 000000000..dcd097d20 --- /dev/null +++ b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch.py @@ -0,0 +1,82 @@ +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from app.db import Document, DocumentType +from app.indexing_pipeline.document_hashing import compute_unique_identifier_hash +from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService + +pytestmark = pytest.mark.unit + + +@pytest.fixture +def mock_session(): + return AsyncMock() + + +@pytest.fixture +def pipeline(mock_session): + return IndexingPipelineService(mock_session) + + +async def test_calls_prepare_then_index_per_document( + pipeline, make_connector_document +): + """index_batch calls 
prepare_for_indexing, then index() for each returned doc.""" + doc1 = make_connector_document( + document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR, + unique_id="msg-1", + search_space_id=1, + ) + doc2 = make_connector_document( + document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR, + unique_id="msg-2", + search_space_id=1, + ) + + orm1 = MagicMock(spec=Document) + orm1.unique_identifier_hash = compute_unique_identifier_hash(doc1) + orm2 = MagicMock(spec=Document) + orm2.unique_identifier_hash = compute_unique_identifier_hash(doc2) + + mock_llm = MagicMock() + + pipeline.prepare_for_indexing = AsyncMock(return_value=[orm1, orm2]) + pipeline.index = AsyncMock(side_effect=lambda doc, cdoc, llm: doc) + + results = await pipeline.index_batch([doc1, doc2], mock_llm) + + pipeline.prepare_for_indexing.assert_awaited_once_with([doc1, doc2]) + assert pipeline.index.await_count == 2 + assert results == [orm1, orm2] + + +async def test_empty_input_returns_empty(pipeline): + """Empty connector_docs list returns empty result.""" + pipeline.prepare_for_indexing = AsyncMock(return_value=[]) + + results = await pipeline.index_batch([], MagicMock()) + + assert results == [] + + +async def test_skips_document_without_matching_connector_doc( + pipeline, make_connector_document +): + """If prepare returns a doc whose hash has no matching ConnectorDocument, it's skipped.""" + doc1 = make_connector_document( + document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR, + unique_id="msg-1", + search_space_id=1, + ) + + orphan_orm = MagicMock(spec=Document) + orphan_orm.unique_identifier_hash = "nonexistent-hash" + + pipeline.prepare_for_indexing = AsyncMock(return_value=[orphan_orm]) + pipeline.index = AsyncMock() + + results = await pipeline.index_batch([doc1], MagicMock()) + + pipeline.index.assert_not_awaited() + assert results == [] diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_migrate_legacy_docs.py b/surfsense_backend/tests/unit/indexing_pipeline/test_migrate_legacy_docs.py 
new file mode 100644 index 000000000..9334fe678 --- /dev/null +++ b/surfsense_backend/tests/unit/indexing_pipeline/test_migrate_legacy_docs.py @@ -0,0 +1,127 @@ +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from app.db import Document, DocumentType +from app.indexing_pipeline.document_hashing import compute_identifier_hash +from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService + +pytestmark = pytest.mark.unit + + +@pytest.fixture +def mock_session(): + session = AsyncMock() + return session + + +@pytest.fixture +def pipeline(mock_session): + return IndexingPipelineService(mock_session) + + +def _make_execute_side_effect(doc_by_hash: dict): + """Return a side_effect for session.execute that resolves documents by hash.""" + + async def _side_effect(stmt): + result = MagicMock() + for h, doc in doc_by_hash.items(): + if h in str(stmt.compile(compile_kwargs={"literal_binds": True})): + result.scalars.return_value.first.return_value = doc + return result + result.scalars.return_value.first.return_value = None + return result + + return _side_effect + + +async def test_updates_hash_and_type_for_legacy_document( + pipeline, mock_session, make_connector_document +): + """Legacy Composio document gets unique_identifier_hash and document_type updated.""" + doc = make_connector_document( + document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR, + unique_id="msg-abc", + search_space_id=1, + ) + + legacy_hash = compute_identifier_hash("COMPOSIO_GMAIL_CONNECTOR", "msg-abc", 1) + native_hash = compute_identifier_hash("GOOGLE_GMAIL_CONNECTOR", "msg-abc", 1) + + existing = MagicMock(spec=Document) + existing.unique_identifier_hash = legacy_hash + existing.document_type = DocumentType.COMPOSIO_GMAIL_CONNECTOR + + result_mock = MagicMock() + result_mock.scalars.return_value.first.return_value = existing + mock_session.execute = AsyncMock(return_value=result_mock) + + await pipeline.migrate_legacy_docs([doc]) + + assert 
existing.unique_identifier_hash == native_hash + assert existing.document_type == DocumentType.GOOGLE_GMAIL_CONNECTOR + mock_session.commit.assert_awaited_once() + + +async def test_noop_when_no_legacy_document_exists( + pipeline, mock_session, make_connector_document +): + """No updates when no legacy Composio document is found in DB.""" + doc = make_connector_document( + document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR, + unique_id="msg-xyz", + search_space_id=1, + ) + + result_mock = MagicMock() + result_mock.scalars.return_value.first.return_value = None + mock_session.execute = AsyncMock(return_value=result_mock) + + await pipeline.migrate_legacy_docs([doc]) + + mock_session.commit.assert_awaited_once() + + +async def test_skips_non_google_doc_types( + pipeline, mock_session, make_connector_document +): + """Non-Google doc types have no legacy mapping and trigger no DB query.""" + doc = make_connector_document( + document_type=DocumentType.SLACK_CONNECTOR, + unique_id="slack-123", + search_space_id=1, + ) + + await pipeline.migrate_legacy_docs([doc]) + + mock_session.execute.assert_not_awaited() + mock_session.commit.assert_awaited_once() + + +async def test_handles_all_three_google_types( + pipeline, mock_session, make_connector_document +): + """Each native Google type correctly maps to its Composio legacy type.""" + mappings = [ + (DocumentType.GOOGLE_GMAIL_CONNECTOR, "COMPOSIO_GMAIL_CONNECTOR"), + (DocumentType.GOOGLE_CALENDAR_CONNECTOR, "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR"), + (DocumentType.GOOGLE_DRIVE_FILE, "COMPOSIO_GOOGLE_DRIVE_CONNECTOR"), + ] + for native_type, expected_legacy in mappings: + doc = make_connector_document( + document_type=native_type, + unique_id="id-1", + search_space_id=1, + ) + + existing = MagicMock(spec=Document) + existing.document_type = DocumentType(expected_legacy) + + result_mock = MagicMock() + result_mock.scalars.return_value.first.return_value = existing + mock_session.execute = AsyncMock(return_value=result_mock) + 
mock_session.commit = AsyncMock() + + await pipeline.migrate_legacy_docs([doc]) + + assert existing.document_type == native_type From c3d5c865fdb18b6ea5c039a2e160a5e4cbdbd64f Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Wed, 25 Mar 2026 18:51:40 +0530 Subject: [PATCH 03/71] fix: update file skipping logic in Google Drive indexer - Modified the `_should_skip_file` function to prevent skipping of documents with a FAILED status, ensuring they are reprocessed even if their content remains unchanged. - Added a new integration test to verify that FAILED documents are not skipped during the indexing process. --- .../google_drive_indexer.py | 2 +- .../indexing_pipeline/test_drive_pipeline.py | 59 +++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index 92c074812..af9528bb7 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -149,7 +149,7 @@ async def _should_skip_file( return True, f"File renamed: '{old_name}' → '{file_name}'" if not DocumentStatus.is_state(existing.status, DocumentStatus.READY): - existing.status = DocumentStatus.ready() + return False, None return True, "unchanged" diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_drive_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_drive_pipeline.py index 32af0b8c1..77128ebd9 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_drive_pipeline.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_drive_pipeline.py @@ -108,3 +108,62 @@ async def test_drive_legacy_doc_migrated( DocumentType.GOOGLE_DRIVE_FILE.value, file_id, space_id ) assert row.unique_identifier_hash == native_hash + + +async def 
test_should_skip_file_does_not_skip_failed_document( + db_session, db_search_space, db_user, +): + """A FAILED document with unchanged md5 must NOT be skipped — it needs reprocessing.""" + import importlib + import sys + import types + + pkg = "app.tasks.connector_indexers" + stub = pkg not in sys.modules + if stub: + mod = types.ModuleType(pkg) + mod.__path__ = ["app/tasks/connector_indexers"] + mod.__package__ = pkg + sys.modules[pkg] = mod + + try: + gdm = importlib.import_module( + "app.tasks.connector_indexers.google_drive_indexer" + ) + _should_skip_file = gdm._should_skip_file + finally: + if stub: + sys.modules.pop(pkg, None) + + space_id = db_search_space.id + file_id = "file-failed-drive" + md5 = "abc123deadbeef" + + doc_hash = compute_identifier_hash( + DocumentType.GOOGLE_DRIVE_FILE.value, file_id, space_id + ) + failed_doc = Document( + title="Failed File.pdf", + document_type=DocumentType.GOOGLE_DRIVE_FILE, + content="LLM rate limit exceeded", + content_hash=f"ch-{doc_hash[:12]}", + unique_identifier_hash=doc_hash, + source_markdown="## Real content", + search_space_id=space_id, + created_by_id=str(db_user.id), + embedding=[0.1] * _EMBEDDING_DIM, + status=DocumentStatus.failed("LLM rate limit exceeded"), + document_metadata={ + "google_drive_file_id": file_id, + "google_drive_file_name": "Failed File.pdf", + "md5_checksum": md5, + }, + ) + db_session.add(failed_doc) + await db_session.flush() + + incoming_file = {"id": file_id, "name": "Failed File.pdf", "mimeType": "application/pdf", "md5Checksum": md5} + + should_skip, _msg = await _should_skip_file(db_session, incoming_file, space_id) + + assert not should_skip, "FAILED documents must not be skipped even when content is unchanged" From 227fb014d4695908768687e8cec573dea6b05c89 Mon Sep 17 00:00:00 2001 From: likiosliu Date: Wed, 25 Mar 2026 12:32:24 +0800 Subject: [PATCH 04/71] fix: add noopener to window.open call in AnnouncementToastProvider Closes #939 --- 
.../components/announcements/AnnouncementToastProvider.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/surfsense_web/components/announcements/AnnouncementToastProvider.tsx b/surfsense_web/components/announcements/AnnouncementToastProvider.tsx index 3ae6bf233..6cb1b17e5 100644 --- a/surfsense_web/components/announcements/AnnouncementToastProvider.tsx +++ b/surfsense_web/components/announcements/AnnouncementToastProvider.tsx @@ -34,7 +34,7 @@ function showAnnouncementToast(announcement: Announcement) { label: announcement.link.label, onClick: () => { if (announcement.link?.url.startsWith("http")) { - window.open(announcement.link.url, "_blank"); + window.open(announcement.link.url, "_blank", "noopener,noreferrer"); } else if (announcement.link?.url) { window.location.href = announcement.link.url; } From 2a7b50408f5219003c4e6469c9e38bc174f369d9 Mon Sep 17 00:00:00 2001 From: likiosliu Date: Wed, 25 Mar 2026 12:32:56 +0800 Subject: [PATCH 05/71] fix: add missing type dependency in DocumentTypeChip truncation check Closes #946 --- .../documents/(manage)/components/DocumentTypeIcon.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx index c07f34935..25eeb4cab 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx @@ -63,7 +63,7 @@ export function DocumentTypeChip({ type, className }: { type: string; className? 
checkTruncation(); window.addEventListener("resize", checkTruncation); return () => window.removeEventListener("resize", checkTruncation); - }, []); + }, [type]); const chip = ( Date: Wed, 25 Mar 2026 16:58:46 +0800 Subject: [PATCH 06/71] fix: avoid stale event reference in register page retry action Extract submission logic into submitForm() so the retry toast action does not capture the original SyntheticEvent, which may be recycled by React by the time the user clicks retry. Closes #945 --- surfsense_web/app/(home)/register/page.tsx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/surfsense_web/app/(home)/register/page.tsx b/surfsense_web/app/(home)/register/page.tsx index 35fa2b668..96fab2c6a 100644 --- a/surfsense_web/app/(home)/register/page.tsx +++ b/surfsense_web/app/(home)/register/page.tsx @@ -43,9 +43,12 @@ export default function RegisterPage() { } }, [router]); - const handleSubmit = async (e: React.FormEvent) => { + const handleSubmit = (e: React.FormEvent) => { e.preventDefault(); + submitForm(); + }; + const submitForm = async () => { // Form validation if (password !== confirmPassword) { setError({ title: t("password_mismatch"), message: t("passwords_no_match_desc") }); @@ -140,7 +143,7 @@ export default function RegisterPage() { if (shouldRetry(errorCode)) { toastOptions.action = { label: tCommon("retry"), - onClick: () => handleSubmit(e), + onClick: () => submitForm(), }; } From 97e7e73baf76340c79a47522c2b11f3983aae78a Mon Sep 17 00:00:00 2001 From: likiosliu Date: Wed, 25 Mar 2026 16:55:26 +0800 Subject: [PATCH 07/71] fix: remove unnecessary useEffect + useState for AUTH_TYPE constant AUTH_TYPE is a static module-level import that never changes. No need for useState + useEffect; use the constant directly. 
Closes #941 --- surfsense_web/app/(home)/login/LocalLoginForm.tsx | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/surfsense_web/app/(home)/login/LocalLoginForm.tsx b/surfsense_web/app/(home)/login/LocalLoginForm.tsx index 9481976a9..7c85eedbd 100644 --- a/surfsense_web/app/(home)/login/LocalLoginForm.tsx +++ b/surfsense_web/app/(home)/login/LocalLoginForm.tsx @@ -5,7 +5,7 @@ import { AnimatePresence, motion } from "motion/react"; import Link from "next/link"; import { useRouter } from "next/navigation"; import { useTranslations } from "next-intl"; -import { useEffect, useState } from "react"; +import { useState } from "react"; import { loginMutationAtom } from "@/atoms/auth/auth-mutation.atoms"; import { Spinner } from "@/components/ui/spinner"; import { getAuthErrorDetails, isNetworkError } from "@/lib/auth-errors"; @@ -25,15 +25,10 @@ export function LocalLoginForm() { title: null, message: null, }); - const [authType, setAuthType] = useState(null); + const authType = AUTH_TYPE; const router = useRouter(); const [{ mutateAsync: login, isPending: isLoggingIn }] = useAtom(loginMutationAtom); - useEffect(() => { - // Get the auth type from centralized config - setAuthType(AUTH_TYPE); - }, []); - const handleSubmit = async (e: React.FormEvent) => { e.preventDefault(); setError({ title: null, message: null }); // Clear any previous errors From e5cabf95e46f75854d56a5ca6eb2315ccce9752b Mon Sep 17 00:00:00 2001 From: likiosliu Date: Wed, 25 Mar 2026 12:34:30 +0800 Subject: [PATCH 08/71] fix: clean up recursive setTimeout calls in onboarding tour - Add cancelled flag to prevent state updates after unmount in checkAndStartTour retry loop - Store retry timer ID in a ref and clear it on cleanup in updateTarget effect Closes #950 --- surfsense_web/components/onboarding-tour.tsx | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/surfsense_web/components/onboarding-tour.tsx 
b/surfsense_web/components/onboarding-tour.tsx index 03fad87b6..114a46141 100644 --- a/surfsense_web/components/onboarding-tour.tsx +++ b/surfsense_web/components/onboarding-tour.tsx @@ -436,6 +436,7 @@ export function OnboardingTour() { const { resolvedTheme } = useTheme(); const pathname = usePathname(); const retryCountRef = useRef(0); + const retryTimerRef = useRef | null>(null); const maxRetries = 10; // Track previous user ID to detect user changes const previousUserIdRef = useRef(null); @@ -477,7 +478,7 @@ export function OnboardingTour() { retryCountRef.current = 0; } else if (retryCountRef.current < maxRetries) { retryCountRef.current++; - setTimeout(() => { + retryTimerRef.current = setTimeout(() => { const retryEl = document.querySelector(currentStep.target); if (retryEl) { setTargetEl(retryEl); @@ -487,6 +488,10 @@ export function OnboardingTour() { } }, 200); } + + return () => { + if (retryTimerRef.current) clearTimeout(retryTimerRef.current); + }; }, [currentStep]); // Check if tour should run: localStorage + data validation with user ID tracking @@ -556,7 +561,11 @@ export function OnboardingTour() { } // User is new and hasn't seen tour - wait for DOM elements and start tour + let cancelled = false; + const checkAndStartTour = () => { + if (cancelled) return; + // Check if all required elements exist const connectorEl = document.querySelector(TOUR_STEPS[0].target); const documentsEl = document.querySelector(TOUR_STEPS[1].target); @@ -578,7 +587,10 @@ export function OnboardingTour() { // Start checking after initial delay const timer = setTimeout(checkAndStartTour, 500); - return () => clearTimeout(timer); + return () => { + cancelled = true; + clearTimeout(timer); + }; }, [mounted, user?.id, searchSpaceId, pathname, threadsData, documentTypeCounts, connectors]); // Update position on resize/scroll From b17ce0e64f10f9a6dddb06512ec2bb2da6a5df43 Mon Sep 17 00:00:00 2001 From: Tyson Cung Date: Wed, 25 Mar 2026 14:43:11 +0000 Subject: [PATCH 09/71] 
fix(ui): show skeleton instead of fake star count while loading (#918) Replace the misleading 10000 placeholder with a Skeleton component during the loading state of the GitHub stars badge. This prevents users from thinking 10000 is the actual star count before real data loads. Closes #918 --- .../components/homepage/github-stars-badge.tsx | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/surfsense_web/components/homepage/github-stars-badge.tsx b/surfsense_web/components/homepage/github-stars-badge.tsx index e11d6ff2d..feee8ee33 100644 --- a/surfsense_web/components/homepage/github-stars-badge.tsx +++ b/surfsense_web/components/homepage/github-stars-badge.tsx @@ -4,6 +4,7 @@ import { IconBrandGithub } from "@tabler/icons-react"; import { motion, useMotionValue, useSpring } from "motion/react"; import * as React from "react"; import { cn } from "@/lib/utils"; +import { Skeleton } from "@/components/ui/skeleton"; // --------------------------------------------------------------------------- // Per-digit scrolling wheel @@ -277,12 +278,16 @@ function NavbarGitHubStars({ )} > - + {isLoading ? ( + + ) : ( + + )} ); } From bbd5ee8a1979c67a4ab43b1cadca904445a4008f Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Wed, 25 Mar 2026 20:35:23 +0530 Subject: [PATCH 10/71] feat: enhance Google Calendar event update functionality - Introduced helper functions `_is_date_only` and `_build_time_body` to streamline the construction of event start and end times for all-day and timed events. - Refactored the `create_update_calendar_event_tool` to utilize the new helper functions, improving code readability and maintainability. - Updated the Google Calendar sync service to ensure proper handling of calendar IDs with a default fallback to "primary". - Modified the ApprovalCard component to simplify the construction of event update arguments, enhancing clarity and reducing redundancy. 
--- .../tools/google_calendar/update_event.py | 34 +++++++------- .../google_calendar/kb_sync_service.py | 4 +- .../hitl-edit-panel/hitl-edit-panel.tsx | 2 +- .../tool-ui/google-calendar/update-event.tsx | 46 ++++++++++++++----- 4 files changed, 55 insertions(+), 31 deletions(-) diff --git a/surfsense_backend/app/agents/new_chat/tools/google_calendar/update_event.py b/surfsense_backend/app/agents/new_chat/tools/google_calendar/update_event.py index 4b57cf2e3..a114c84f4 100644 --- a/surfsense_backend/app/agents/new_chat/tools/google_calendar/update_event.py +++ b/surfsense_backend/app/agents/new_chat/tools/google_calendar/update_event.py @@ -14,6 +14,20 @@ from app.services.google_calendar import GoogleCalendarToolMetadataService logger = logging.getLogger(__name__) +def _is_date_only(value: str) -> bool: + """Return True when *value* looks like a bare date (YYYY-MM-DD) with no time component.""" + return len(value) <= 10 and "T" not in value + + +def _build_time_body(value: str, context: dict[str, Any] | Any) -> dict[str, str]: + """Build a Google Calendar start/end body using ``date`` for all-day + events and ``dateTime`` for timed events.""" + if _is_date_only(value): + return {"date": value} + tz = context.get("timezone", "UTC") if isinstance(context, dict) else "UTC" + return {"dateTime": value, "timeZone": tz} + + def create_update_calendar_event_tool( db_session: AsyncSession | None = None, search_space_id: int | None = None, @@ -255,25 +269,13 @@ def create_update_calendar_event_tool( if final_new_summary is not None: update_body["summary"] = final_new_summary if final_new_start_datetime is not None: - tz = ( - context.get("timezone", "UTC") - if isinstance(context, dict) - else "UTC" + update_body["start"] = _build_time_body( + final_new_start_datetime, context ) - update_body["start"] = { - "dateTime": final_new_start_datetime, - "timeZone": tz, - } if final_new_end_datetime is not None: - tz = ( - context.get("timezone", "UTC") - if isinstance(context, 
dict) - else "UTC" + update_body["end"] = _build_time_body( + final_new_end_datetime, context ) - update_body["end"] = { - "dateTime": final_new_end_datetime, - "timeZone": tz, - } if final_new_description is not None: update_body["description"] = final_new_description if final_new_location is not None: diff --git a/surfsense_backend/app/services/google_calendar/kb_sync_service.py b/surfsense_backend/app/services/google_calendar/kb_sync_service.py index 59afa116e..3cda02b9b 100644 --- a/surfsense_backend/app/services/google_calendar/kb_sync_service.py +++ b/surfsense_backend/app/services/google_calendar/kb_sync_service.py @@ -209,8 +209,8 @@ class GoogleCalendarKBSyncService: ) calendar_id = (document.document_metadata or {}).get( - "calendar_id", "primary" - ) + "calendar_id" + ) or "primary" live_event = await loop.run_in_executor( None, lambda: ( diff --git a/surfsense_web/components/hitl-edit-panel/hitl-edit-panel.tsx b/surfsense_web/components/hitl-edit-panel/hitl-edit-panel.tsx index 25e896842..e8bc1a6cd 100644 --- a/surfsense_web/components/hitl-edit-panel/hitl-edit-panel.tsx +++ b/surfsense_web/components/hitl-edit-panel/hitl-edit-panel.tsx @@ -185,7 +185,7 @@ function DateTimePickerField({ type="time" value={time} onChange={handleTimeChange} - className="w-[120px] text-sm shrink-0 pl-1.5 [&::-webkit-calendar-picker-indicator]:order-first [&::-webkit-calendar-picker-indicator]:mr-1" + className="w-[120px] text-sm shrink-0 appearance-none [&::-webkit-calendar-picker-indicator]:hidden [&::-webkit-calendar-picker-indicator]:appearance-none" /> ); diff --git a/surfsense_web/components/tool-ui/google-calendar/update-event.tsx b/surfsense_web/components/tool-ui/google-calendar/update-event.tsx index cc941bab8..661032628 100644 --- a/surfsense_web/components/tool-ui/google-calendar/update-event.tsx +++ b/surfsense_web/components/tool-ui/google-calendar/update-event.tsx @@ -253,6 +253,12 @@ function ApprovalCard({ String(effectiveNewDescription ?? 
"") !== (event?.description ?? ""); const buildFinalArgs = useCallback(() => { + const base = { + event_id: event?.event_id, + document_id: event?.document_id, + connector_id: account?.id, + }; + if (pendingEdits) { const attendeesArr = pendingEdits.attendees ? pendingEdits.attendees @@ -260,22 +266,38 @@ function ApprovalCard({ .map((e) => e.trim()) .filter(Boolean) : null; + const origAttendees = event?.attendees?.map((a) => a.email) ?? []; + return { - event_id: event?.event_id, - document_id: event?.document_id, - connector_id: account?.id, - new_summary: pendingEdits.summary || null, - new_description: pendingEdits.description || null, - new_start_datetime: pendingEdits.start_datetime || null, - new_end_datetime: pendingEdits.end_datetime || null, - new_location: pendingEdits.location || null, - new_attendees: attendeesArr, + ...base, + new_summary: + pendingEdits.summary && pendingEdits.summary !== (event?.summary ?? "") + ? pendingEdits.summary + : null, + new_description: + pendingEdits.description !== (event?.description ?? "") + ? pendingEdits.description || null + : null, + new_start_datetime: + pendingEdits.start_datetime && pendingEdits.start_datetime !== (event?.start ?? "") + ? pendingEdits.start_datetime + : null, + new_end_datetime: + pendingEdits.end_datetime && pendingEdits.end_datetime !== (event?.end ?? "") + ? pendingEdits.end_datetime + : null, + new_location: + pendingEdits.location !== (event?.location ?? "") + ? pendingEdits.location || null + : null, + new_attendees: + attendeesArr && attendeesArr.join(",") !== origAttendees.join(",") + ? attendeesArr + : null, }; } return { - event_id: event?.event_id, - document_id: event?.document_id, - connector_id: account?.id, + ...base, new_summary: actionArgs.new_summary ?? null, new_description: actionArgs.new_description ?? null, new_start_datetime: actionArgs.new_start_datetime ?? 
null, From f7640671f3dfe96e0432ba8c2df88a38bb9fd6ba Mon Sep 17 00:00:00 2001 From: likiosliu Date: Thu, 26 Mar 2026 11:49:45 +0800 Subject: [PATCH 11/71] fix: replace router.push with Link for static navigation in UserDropdown Enables route prefetching and follows Next.js best practices. Removes unused useRouter import. --- surfsense_web/components/UserDropdown.tsx | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/surfsense_web/components/UserDropdown.tsx b/surfsense_web/components/UserDropdown.tsx index b79ab6e79..197db6287 100644 --- a/surfsense_web/components/UserDropdown.tsx +++ b/surfsense_web/components/UserDropdown.tsx @@ -1,7 +1,7 @@ "use client"; import { BadgeCheck, LogOut } from "lucide-react"; -import { useRouter } from "next/navigation"; +import Link from "next/link"; import { useState } from "react"; import { Avatar, AvatarFallback, AvatarImage } from "@/components/ui/avatar"; import { Button } from "@/components/ui/button"; @@ -27,7 +27,6 @@ export function UserDropdown({ avatar: string; }; }) { - const router = useRouter(); const [isLoggingOut, setIsLoggingOut] = useState(false); const handleLogout = async () => { @@ -75,12 +74,11 @@ export function UserDropdown({ - router.push(`/dashboard/api-key`)} - className="text-xs md:text-sm" - > - - API Key + + + + API Key + From 3d762ccf6216bac059079b513c63472cfd19c861 Mon Sep 17 00:00:00 2001 From: likiosliu Date: Thu, 26 Mar 2026 11:50:39 +0800 Subject: [PATCH 12/71] fix: remove unnecessary "use client" from pure presentational components These components only render JSX with props and don't use hooks, event handlers, or browser APIs. 
--- surfsense_web/app/docs/sidebar-separator.tsx | 2 -- surfsense_web/components/Logo.tsx | 2 -- .../components/announcements/AnnouncementsEmptyState.tsx | 2 -- .../public-chat-snapshots/public-chat-snapshots-empty-state.tsx | 2 -- 4 files changed, 8 deletions(-) diff --git a/surfsense_web/app/docs/sidebar-separator.tsx b/surfsense_web/app/docs/sidebar-separator.tsx index 36fff09a4..ceb56b160 100644 --- a/surfsense_web/app/docs/sidebar-separator.tsx +++ b/surfsense_web/app/docs/sidebar-separator.tsx @@ -1,5 +1,3 @@ -"use client"; - import type { Separator } from "fumadocs-core/page-tree"; export function SidebarSeparator({ item }: { item: Separator }) { diff --git a/surfsense_web/components/Logo.tsx b/surfsense_web/components/Logo.tsx index 76446ca59..121185757 100644 --- a/surfsense_web/components/Logo.tsx +++ b/surfsense_web/components/Logo.tsx @@ -1,5 +1,3 @@ -"use client"; - import Image from "next/image"; import Link from "next/link"; import { cn } from "@/lib/utils"; diff --git a/surfsense_web/components/announcements/AnnouncementsEmptyState.tsx b/surfsense_web/components/announcements/AnnouncementsEmptyState.tsx index b4551f56a..9ed1ea45d 100644 --- a/surfsense_web/components/announcements/AnnouncementsEmptyState.tsx +++ b/surfsense_web/components/announcements/AnnouncementsEmptyState.tsx @@ -1,5 +1,3 @@ -"use client"; - import { BellOff } from "lucide-react"; export function AnnouncementsEmptyState() { diff --git a/surfsense_web/components/public-chat-snapshots/public-chat-snapshots-empty-state.tsx b/surfsense_web/components/public-chat-snapshots/public-chat-snapshots-empty-state.tsx index 4bb295217..4a4a57770 100644 --- a/surfsense_web/components/public-chat-snapshots/public-chat-snapshots-empty-state.tsx +++ b/surfsense_web/components/public-chat-snapshots/public-chat-snapshots-empty-state.tsx @@ -1,5 +1,3 @@ -"use client"; - import { Link2Off } from "lucide-react"; interface PublicChatSnapshotsEmptyStateProps { From 
2cf6866c10e7e7219ffcf205b33744972dbed866 Mon Sep 17 00:00:00 2001 From: JoeMakuta Date: Thu, 26 Mar 2026 11:59:04 +0200 Subject: [PATCH 13/71] Add loader on new chat route --- .../new-chat/[[...chat_id]]/page.tsx | 38 ++-------------- .../[search_space_id]/new-chat/loading.tsx | 45 +++++++++++++++++++ 2 files changed, 48 insertions(+), 35 deletions(-) create mode 100644 surfsense_web/app/dashboard/[search_space_id]/new-chat/loading.tsx diff --git a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx index 8578d2dcb..1cbfca2df 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx @@ -74,6 +74,7 @@ import { trackChatMessageSent, trackChatResponseReceived, } from "@/lib/posthog/events"; +import Loading from "../loading"; /** * After a tool produces output, mark any previously-decided interrupt tool @@ -1527,40 +1528,7 @@ export default function NewChatPage() { // Show loading state only when loading an existing thread if (isInitializing) { return ( -
-
- {/* User message */} -
- -
- - {/* Assistant message */} -
- - - -
- - {/* User message */} -
- -
- - {/* Assistant message */} -
- - - -
-
- - {/* Input bar */} -
-
- -
-
-
+ ); } @@ -1597,4 +1565,4 @@ export default function NewChatPage() { ); -} +} \ No newline at end of file diff --git a/surfsense_web/app/dashboard/[search_space_id]/new-chat/loading.tsx b/surfsense_web/app/dashboard/[search_space_id]/new-chat/loading.tsx new file mode 100644 index 000000000..1f47fb95a --- /dev/null +++ b/surfsense_web/app/dashboard/[search_space_id]/new-chat/loading.tsx @@ -0,0 +1,45 @@ +import { Skeleton } from "@/components/ui/skeleton"; + +export default function Loading() { + return ( +
+
+ {/* User message */} +
+ +
+ + {/* Assistant message */} +
+ + + +
+ + {/* User message */} +
+ +
+ + {/* Assistant message */} +
+ + + +
+ + {/* User message */} +
+ +
+
+ + {/* Input bar */} +
+
+ +
+
+
+ ); +} From 80ede9849ab5feed4c0cb3be0935422315811d1f Mon Sep 17 00:00:00 2001 From: JoeMakuta Date: Thu, 26 Mar 2026 12:19:18 +0200 Subject: [PATCH 14/71] Add loading on logs route --- .../[search_space_id]/logs/loading.tsx | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 surfsense_web/app/dashboard/[search_space_id]/logs/loading.tsx diff --git a/surfsense_web/app/dashboard/[search_space_id]/logs/loading.tsx b/surfsense_web/app/dashboard/[search_space_id]/logs/loading.tsx new file mode 100644 index 000000000..318c2836b --- /dev/null +++ b/surfsense_web/app/dashboard/[search_space_id]/logs/loading.tsx @@ -0,0 +1,136 @@ +"use client"; + +import { motion } from "motion/react"; +import { Skeleton } from "@/components/ui/skeleton"; + +export default function Loading() { + return ( + 
+
+ + +
+
+ + +
+
+ ))} +
+ + {/* Header Section Skeleton */} + +
+ + +
+ +
+ + {/* Filters Skeleton */} + +
+ + + + +
+
+ + {/* Table Skeleton */} + + {/* Table Header */} +
+ + + + + + + +
+ + {/* Table Rows */} + {[...Array(6)].map((_, i) => ( +
+ + + +
+ + +
+
+ + +
+
+ + +
+ +
+ ))} +
+ + {/* Pagination Skeleton */} +
+ + + + + + + + + +
+ + + + +
+
+
+ ); +} From d535851ad51ad574fd99664ec553c13786b0e5b5 Mon Sep 17 00:00:00 2001 From: JoeMakuta Date: Thu, 26 Mar 2026 12:44:46 +0200 Subject: [PATCH 15/71] Add loader to more-pages route --- .../dashboard/[search_space_id]/more-pages/loading.tsx | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 surfsense_web/app/dashboard/[search_space_id]/more-pages/loading.tsx diff --git a/surfsense_web/app/dashboard/[search_space_id]/more-pages/loading.tsx b/surfsense_web/app/dashboard/[search_space_id]/more-pages/loading.tsx new file mode 100644 index 000000000..9a0c45f3f --- /dev/null +++ b/surfsense_web/app/dashboard/[search_space_id]/more-pages/loading.tsx @@ -0,0 +1,10 @@ +import { Skeleton } from "@/components/ui/skeleton"; + +export default function Loading() { + return ( +
+ + +
+ ); +} From e4d5c119ef6879aa9a58ef59140b36e00695b8f1 Mon Sep 17 00:00:00 2001 From: JoeMakuta Date: Thu, 26 Mar 2026 13:33:29 +0200 Subject: [PATCH 16/71] fix: convert public chat page to server component --- surfsense_web/app/public/[token]/page.tsx | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/surfsense_web/app/public/[token]/page.tsx b/surfsense_web/app/public/[token]/page.tsx index 530664ac6..10cd19732 100644 --- a/surfsense_web/app/public/[token]/page.tsx +++ b/surfsense_web/app/public/[token]/page.tsx @@ -1,11 +1,11 @@ -"use client"; - -import { useParams } from "next/navigation"; import { PublicChatView } from "@/components/public-chat/public-chat-view"; -export default function PublicChatPage() { - const params = useParams(); - const token = params.token as string; +export default async function PublicChatPage({ + params, +}: { + params: Promise<{ token: string }>; +}) { + const { token } = await params; - return ; + return ; } From f00f7826ed09c94d32ee85fc75cd101946dec133 Mon Sep 17 00:00:00 2001 From: JoeMakuta Date: Thu, 26 Mar 2026 15:11:39 +0200 Subject: [PATCH 17/71] fix: improve semantics and structure of settings forms in GeneralSettingsManager and PromptConfigManager --- .../settings/general-settings-manager.tsx | 300 ++++++++------- .../settings/prompt-config-manager.tsx | 344 ++++++++++-------- 2 files changed, 350 insertions(+), 294 deletions(-) diff --git a/surfsense_web/components/settings/general-settings-manager.tsx b/surfsense_web/components/settings/general-settings-manager.tsx index a9482001d..8a847b629 100644 --- a/surfsense_web/components/settings/general-settings-manager.tsx +++ b/surfsense_web/components/settings/general-settings-manager.tsx @@ -9,160 +9,190 @@ import { toast } from "sonner"; import { updateSearchSpaceMutationAtom } from "@/atoms/search-spaces/search-space-mutation.atoms"; import { Alert, AlertDescription } from "@/components/ui/alert"; import { Button } from 
"@/components/ui/button"; -import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"; +import { + Card, + CardContent, + CardDescription, + CardHeader, + CardTitle, +} from "@/components/ui/card"; import { Input } from "@/components/ui/input"; import { Label } from "@/components/ui/label"; import { Skeleton } from "@/components/ui/skeleton"; import { searchSpacesApiService } from "@/lib/apis/search-spaces-api.service"; import { cacheKeys } from "@/lib/query-client/cache-keys"; +import { Spinner } from "../ui/spinner"; interface GeneralSettingsManagerProps { - searchSpaceId: number; + searchSpaceId: number; } -export function GeneralSettingsManager({ searchSpaceId }: GeneralSettingsManagerProps) { - const t = useTranslations("searchSpaceSettings"); - const tCommon = useTranslations("common"); - const { - data: searchSpace, - isLoading: loading, - refetch: fetchSearchSpace, - } = useQuery({ - queryKey: cacheKeys.searchSpaces.detail(searchSpaceId.toString()), - queryFn: () => searchSpacesApiService.getSearchSpace({ id: searchSpaceId }), - enabled: !!searchSpaceId, - }); +export function GeneralSettingsManager({ + searchSpaceId, +}: GeneralSettingsManagerProps) { + const t = useTranslations("searchSpaceSettings"); + const tCommon = useTranslations("common"); + const { + data: searchSpace, + isLoading: loading, + refetch: fetchSearchSpace, + } = useQuery({ + queryKey: cacheKeys.searchSpaces.detail(searchSpaceId.toString()), + queryFn: () => searchSpacesApiService.getSearchSpace({ id: searchSpaceId }), + enabled: !!searchSpaceId, + }); - const { mutateAsync: updateSearchSpace } = useAtomValue(updateSearchSpaceMutationAtom); + const { mutateAsync: updateSearchSpace } = useAtomValue( + updateSearchSpaceMutationAtom, + ); - const [name, setName] = useState(""); - const [description, setDescription] = useState(""); - const [saving, setSaving] = useState(false); - const [hasChanges, setHasChanges] = useState(false); + const [name, 
setName] = useState(""); + const [description, setDescription] = useState(""); + const [saving, setSaving] = useState(false); + const [hasChanges, setHasChanges] = useState(false); - // Initialize state from fetched search space - useEffect(() => { - if (searchSpace) { - setName(searchSpace.name || ""); - setDescription(searchSpace.description || ""); - setHasChanges(false); - } - }, [searchSpace]); + // Initialize state from fetched search space + useEffect(() => { + if (searchSpace) { + setName(searchSpace.name || ""); + setDescription(searchSpace.description || ""); + setHasChanges(false); + } + }, [searchSpace]); - // Track changes - useEffect(() => { - if (searchSpace) { - const currentName = searchSpace.name || ""; - const currentDescription = searchSpace.description || ""; - const changed = currentName !== name || currentDescription !== description; - setHasChanges(changed); - } - }, [searchSpace, name, description]); + // Track changes + useEffect(() => { + if (searchSpace) { + const currentName = searchSpace.name || ""; + const currentDescription = searchSpace.description || ""; + const changed = + currentName !== name || currentDescription !== description; + setHasChanges(changed); + } + }, [searchSpace, name, description]); - const handleSave = async () => { - try { - setSaving(true); + const handleSave = async () => { + try { + setSaving(true); - await updateSearchSpace({ - id: searchSpaceId, - data: { - name: name.trim(), - description: description.trim() || undefined, - }, - }); + await updateSearchSpace({ + id: searchSpaceId, + data: { + name: name.trim(), + description: description.trim() || undefined, + }, + }); - setHasChanges(false); - await fetchSearchSpace(); - } catch (error: any) { - console.error("Error saving search space details:", error); - toast.error(error.message || "Failed to save search space details"); - } finally { - setSaving(false); - } - }; + setHasChanges(false); + await fetchSearchSpace(); + } catch (error: any) { + 
console.error("Error saving search space details:", error); + toast.error(error.message || "Failed to save search space details"); + } finally { + setSaving(false); + } + }; - if (loading) { - return ( -
- - - - - - - - - - -
- ); - } + const onSubmit = (e: React.FormEvent) => { + e.preventDefault(); + handleSave(); + }; - return ( -
- - - - Update your search space name and description. These details help identify and organize - your workspace. - - + if (loading) { + return ( +
+ + + + + + + + + + +
+ ); + } - {/* Search Space Details Card */} - - - Search Space Details - - Manage the basic information for this search space. - - - -
- - setName(e.target.value)} - className="text-sm md:text-base h-9 md:h-10" - /> -

- {t("general_name_description")} -

-
+ return ( +
+ + + + Update your search space name and description. These details help + identify and organize your workspace. + + -
- - setDescription(e.target.value)} - className="text-sm md:text-base h-9 md:h-10" - /> -

- {t("general_description_description")} -

-
- - + {/* Search Space Details Card */} +
+ + + + Search Space Details + + + Manage the basic information for this search space. + + + +
+ + setName(e.target.value)} + className="text-sm md:text-base h-9 md:h-10" + /> +

+ {t("general_name_description")} +

+
- {/* Action Buttons */} -
- -
-
- ); +
+ + setDescription(e.target.value)} + className="text-sm md:text-base h-9 md:h-10" + /> +

+ {t("general_description_description")} +

+
+
+
+ + {/* Action Buttons */} +
+ +
+ +
+ ); } diff --git a/surfsense_web/components/settings/prompt-config-manager.tsx b/surfsense_web/components/settings/prompt-config-manager.tsx index b9c9c2fc8..dc3a15a7d 100644 --- a/surfsense_web/components/settings/prompt-config-manager.tsx +++ b/surfsense_web/components/settings/prompt-config-manager.tsx @@ -6,187 +6,213 @@ import { useEffect, useState } from "react"; import { toast } from "sonner"; import { Alert, AlertDescription } from "@/components/ui/alert"; import { Button } from "@/components/ui/button"; -import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"; +import { + Card, + CardContent, + CardDescription, + CardHeader, + CardTitle, +} from "@/components/ui/card"; import { Label } from "@/components/ui/label"; import { Skeleton } from "@/components/ui/skeleton"; import { Textarea } from "@/components/ui/textarea"; import { searchSpacesApiService } from "@/lib/apis/search-spaces-api.service"; import { authenticatedFetch } from "@/lib/auth-utils"; import { cacheKeys } from "@/lib/query-client/cache-keys"; +import { Spinner } from "../ui/spinner"; interface PromptConfigManagerProps { - searchSpaceId: number; + searchSpaceId: number; } -export function PromptConfigManager({ searchSpaceId }: PromptConfigManagerProps) { - const { - data: searchSpace, - isLoading: loading, - refetch: fetchSearchSpace, - } = useQuery({ - queryKey: cacheKeys.searchSpaces.detail(searchSpaceId.toString()), - queryFn: () => searchSpacesApiService.getSearchSpace({ id: searchSpaceId }), - enabled: !!searchSpaceId, - }); +export function PromptConfigManager({ + searchSpaceId, +}: PromptConfigManagerProps) { + const { + data: searchSpace, + isLoading: loading, + refetch: fetchSearchSpace, + } = useQuery({ + queryKey: cacheKeys.searchSpaces.detail(searchSpaceId.toString()), + queryFn: () => searchSpacesApiService.getSearchSpace({ id: searchSpaceId }), + enabled: !!searchSpaceId, + }); - const [customInstructions, setCustomInstructions] = 
useState(""); - const [saving, setSaving] = useState(false); - const [hasChanges, setHasChanges] = useState(false); + const [customInstructions, setCustomInstructions] = useState(""); + const [saving, setSaving] = useState(false); + const [hasChanges, setHasChanges] = useState(false); - // Initialize state from fetched search space - useEffect(() => { - if (searchSpace) { - setCustomInstructions(searchSpace.qna_custom_instructions || ""); - setHasChanges(false); - } - }, [searchSpace]); + // Initialize state from fetched search space + useEffect(() => { + if (searchSpace) { + setCustomInstructions(searchSpace.qna_custom_instructions || ""); + setHasChanges(false); + } + }, [searchSpace]); - // Track changes - useEffect(() => { - if (searchSpace) { - const currentCustom = searchSpace.qna_custom_instructions || ""; - const changed = currentCustom !== customInstructions; - setHasChanges(changed); - } - }, [searchSpace, customInstructions]); + // Track changes + useEffect(() => { + if (searchSpace) { + const currentCustom = searchSpace.qna_custom_instructions || ""; + const changed = currentCustom !== customInstructions; + setHasChanges(changed); + } + }, [searchSpace, customInstructions]); - const handleSave = async () => { - try { - setSaving(true); + const handleSave = async () => { + try { + setSaving(true); - const payload = { - qna_custom_instructions: customInstructions.trim() || "", - }; + const payload = { + qna_custom_instructions: customInstructions.trim() || "", + }; - const response = await authenticatedFetch( - `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/searchspaces/${searchSpaceId}`, - { - method: "PUT", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify(payload), - } - ); + const response = await authenticatedFetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/searchspaces/${searchSpaceId}`, + { + method: "PUT", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(payload), + }, + ); - 
if (!response.ok) { - const errorData = await response.json().catch(() => ({})); - throw new Error(errorData.detail || "Failed to save system instructions"); - } + if (!response.ok) { + const errorData = await response.json().catch(() => ({})); + throw new Error( + errorData.detail || "Failed to save system instructions", + ); + } - toast.success("System instructions saved successfully"); - setHasChanges(false); - await fetchSearchSpace(); - } catch (error: any) { - console.error("Error saving system instructions:", error); - toast.error(error.message || "Failed to save system instructions"); - } finally { - setSaving(false); - } - }; + toast.success("System instructions saved successfully"); + setHasChanges(false); + await fetchSearchSpace(); + } catch (error: any) { + console.error("Error saving system instructions:", error); + toast.error(error.message || "Failed to save system instructions"); + } finally { + setSaving(false); + } + }; - if (loading) { - return ( -
- - - - - - - - - - -
- ); - } + const onSubmit = (e: React.FormEvent) => { + e.preventDefault(); + handleSave(); + }; - return ( -
- {/* Work in Progress Notice */} - - - - Work in Progress: This functionality is currently - under development and not yet connected to the backend. Your instructions will be saved - but won't affect AI behavior until the feature is fully implemented. - - + if (loading) { + return ( +
+ + + + + + + + + + +
+ ); + } - - - - System instructions apply to all AI interactions in this search space. They guide how the - AI responds, its tone, focus areas, and behavior patterns. - - + return ( +
+ {/* Work in Progress Notice */} + + + + Work in Progress: This + functionality is currently under development and not yet connected to + the backend. Your instructions will be saved but won't affect AI + behavior until the feature is fully implemented. + + - {/* System Instructions Card */} - - - Custom System Instructions - - Provide specific guidelines for how you want the AI to respond. These instructions will - be applied to all answers in this search space. - - - -
- -