diff --git a/surfsense_backend/app/connectors/google_drive/change_tracker.py b/surfsense_backend/app/connectors/google_drive/change_tracker.py
index dee828219..3b7f804ac 100644
--- a/surfsense_backend/app/connectors/google_drive/change_tracker.py
+++ b/surfsense_backend/app/connectors/google_drive/change_tracker.py
@@ -58,7 +58,7 @@ async def get_changes(
     params = {
         "pageToken": page_token,
         "pageSize": 100,
-        "fields": "nextPageToken, newStartPageToken, changes(fileId, removed, file(id, name, mimeType, modifiedTime, size, webViewLink, parents, trashed))",
+        "fields": "nextPageToken, newStartPageToken, changes(fileId, removed, file(id, name, mimeType, modifiedTime, md5Checksum, size, webViewLink, parents, trashed))",
         "supportsAllDrives": True,
         "includeItemsFromAllDrives": True,
     }
diff --git a/surfsense_backend/app/connectors/google_drive/client.py b/surfsense_backend/app/connectors/google_drive/client.py
index aec5704b8..a001e42be 100644
--- a/surfsense_backend/app/connectors/google_drive/client.py
+++ b/surfsense_backend/app/connectors/google_drive/client.py
@@ -47,7 +47,7 @@ class GoogleDriveClient:
     async def list_files(
         self,
         query: str = "",
-        fields: str = "nextPageToken, files(id, name, mimeType, modifiedTime, size, webViewLink, parents, owners, createdTime, description)",
+        fields: str = "nextPageToken, files(id, name, mimeType, modifiedTime, md5Checksum, size, webViewLink, parents, owners, createdTime, description)",
         page_size: int = 100,
         page_token: str | None = None,
     ) -> tuple[list[dict[str, Any]], str | None, str | None]:
diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py
index 28c14a757..61b427970 100644
--- a/surfsense_backend/app/connectors/google_drive/content_extractor.py
+++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py
@@ -102,6 +102,8 @@ async def download_and_process_file(
         connector_info["metadata"]["file_size"] = file["size"]
     if "webViewLink" in file:
         connector_info["metadata"]["web_view_link"] = file["webViewLink"]
+    if "md5Checksum" in file:
+        connector_info["metadata"]["md5_checksum"] = file["md5Checksum"]

     if is_google_workspace_file(mime_type):
         connector_info["metadata"]["exported_as"] = "pdf"
diff --git a/surfsense_backend/app/connectors/google_drive/folder_manager.py b/surfsense_backend/app/connectors/google_drive/folder_manager.py
index e28505f11..c0535ee58 100644
--- a/surfsense_backend/app/connectors/google_drive/folder_manager.py
+++ b/surfsense_backend/app/connectors/google_drive/folder_manager.py
@@ -157,7 +157,7 @@ async def get_file_by_id(
     try:
         file, error = await client.get_file_metadata(
             file_id,
-            fields="id, name, mimeType, parents, createdTime, modifiedTime, size, webViewLink, iconLink",
+            fields="id, name, mimeType, parents, createdTime, modifiedTime, md5Checksum, size, webViewLink, iconLink",
         )

         if error:
@@ -228,7 +228,7 @@ async def list_folder_contents(
     while True:
         items, next_token, error = await client.list_files(
             query=query,
-            fields="files(id, name, mimeType, parents, createdTime, modifiedTime, size, webViewLink, iconLink)",
+            fields="files(id, name, mimeType, parents, createdTime, modifiedTime, md5Checksum, size, webViewLink, iconLink)",
            page_size=1000,  # Max allowed by Google Drive API
            page_token=page_token,
         )
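[Reviewer note] The four "fields" changes above all exist for the same reason: the Drive v3 API returns only the properties you explicitly project, so md5Checksum has to be requested at every point where file metadata is fetched. A minimal synchronous sketch with the official google-api-python-client, for illustration only (the repo's async GoogleDriveClient wraps the REST API itself; creds and file_id are assumed to exist):

    from googleapiclient.discovery import build

    # creds: any valid google.oauth2 credentials object (assumed)
    service = build("drive", "v3", credentials=creds)
    meta = (
        service.files()
        .get(fileId=file_id, fields="id, name, mimeType, modifiedTime, md5Checksum")
        .execute()
    )

    # Binary uploads (PDF, DOCX, images, ...) include md5Checksum;
    # native Google Workspace files (Docs, Sheets, Slides) omit it.
    checksum = meta.get("md5Checksum")  # None for Workspace files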
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
index 142adf649..f655c290d 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
@@ -547,6 +547,117 @@ async def _index_with_delta_sync(
     return documents_indexed, documents_skipped


+async def _check_rename_only_update(
+    session: AsyncSession,
+    file: dict,
+    search_space_id: int,
+) -> tuple[bool, str | None]:
+    """
+    Check whether a file needs only a rename update (no content change).
+
+    Uses md5Checksum comparison (preferred) or modifiedTime (fallback for
+    Google Workspace files) to detect whether the content has changed. This
+    optimization avoids unnecessary ETL API calls (Docling/LlamaCloud) for
+    rename-only operations.
+
+    Args:
+        session: Database session
+        file: File metadata from the Google Drive API
+        search_space_id: ID of the search space
+
+    Returns:
+        Tuple of (is_rename_only, message):
+        - (True, message): Only the filename changed; the document was updated
+        - (False, None): Content changed or new file; needs full processing
+    """
+    from sqlalchemy import select
+    from sqlalchemy.orm.attributes import flag_modified
+
+    from app.db import Document
+
+    file_id = file.get("id")
+    file_name = file.get("name", "Unknown")
+    incoming_md5 = file.get("md5Checksum")  # None for Google Workspace files
+    incoming_modified_time = file.get("modifiedTime")
+
+    if not file_id:
+        return False, None
+
+    # Try to find the existing document by its file_id-based hash (primary method)
+    primary_hash = generate_unique_identifier_hash(
+        DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
+    )
+    existing_document = await check_document_by_unique_identifier(session, primary_hash)
+
+    # If not found by the primary hash, search by metadata (legacy documents)
+    if not existing_document:
+        result = await session.execute(
+            select(Document).where(
+                Document.search_space_id == search_space_id,
+                Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
+                Document.document_metadata["google_drive_file_id"].astext == file_id,
+            )
+        )
+        existing_document = result.scalar_one_or_none()
+        if existing_document:
+            logger.debug(f"Found legacy document by metadata for file_id: {file_id}")
+
+    if not existing_document:
+        # New file; needs full processing
+        return False, None
+
+    # Get the stored checksum/timestamp from the document metadata
+    doc_metadata = existing_document.document_metadata or {}
+    stored_md5 = doc_metadata.get("md5_checksum")
+    stored_modified_time = doc_metadata.get("modified_time")
+
+    # Decide whether content changed, using md5Checksum (preferred) or
+    # modifiedTime (fallback)
+    content_unchanged = False
+
+    if incoming_md5 and stored_md5:
+        # Best case: compare md5 checksums (they change only when content
+        # changes, not on rename)
+        content_unchanged = incoming_md5 == stored_md5
+        logger.debug(f"MD5 comparison for {file_name}: unchanged={content_unchanged}")
+    elif incoming_md5 and not stored_md5:
+        # Incoming md5 but none stored (legacy doc): reprocess to store it
+        logger.debug(f"No stored md5 for {file_name}, will reprocess to store md5_checksum")
+        return False, None
+    elif not incoming_md5:
+        # Google Workspace file (no md5Checksum available): fall back to
+        # modifiedTime. Note: modifiedTime is less reliable because it also
+        # changes on rename, but it is the best signal available.
+        if incoming_modified_time and stored_modified_time:
+            content_unchanged = incoming_modified_time == stored_modified_time
+            logger.debug(
+                f"ModifiedTime fallback for Google Workspace file {file_name}: "
+                f"unchanged={content_unchanged}"
+            )
+        else:
+            # No stored modifiedTime (legacy doc): reprocess to store it
+            return False, None
+
+    if content_unchanged:
+        # Content hasn't changed; check whether the filename changed
+        old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get("google_drive_file_name")
+
+        if old_name and old_name != file_name:
+            # Rename-only update: update the document without reprocessing
+            existing_document.title = file_name
+            if not existing_document.document_metadata:
+                existing_document.document_metadata = {}
+            existing_document.document_metadata["FILE_NAME"] = file_name
+            existing_document.document_metadata["google_drive_file_name"] = file_name
+            # Also refresh modified_time for Google Workspace files (the
+            # rename bumped it)
+            if incoming_modified_time:
+                existing_document.document_metadata["modified_time"] = incoming_modified_time
+            flag_modified(existing_document, "document_metadata")
+            await session.commit()
+
+            logger.info(f"Rename-only update: '{old_name}' → '{file_name}' (skipped ETL)")
+            return True, f"File renamed: '{old_name}' → '{file_name}' (no content change)"
+        else:
+            # Neither content nor name changed
+            logger.debug(f"File unchanged: {file_name}")
+            return True, "File unchanged (same content and name)"
+
+    # Content changed; needs full processing
+    return False, None
+
+
 async def _process_single_file(
     drive_client: GoogleDriveClient,
     session: AsyncSession,
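[Reviewer note] Stripped of I/O, the content-change decision in _check_rename_only_update reduces to a small pure function. This is an illustrative restatement of the patch's logic, not code from the patch; the two "legacy, reprocess to store" branches collapse to False:

    def is_content_unchanged(
        incoming_md5: str | None,
        stored_md5: str | None,
        incoming_mtime: str | None,
        stored_mtime: str | None,
    ) -> bool:
        # Mirrors the branch order in _check_rename_only_update.
        if incoming_md5 and stored_md5:
            return incoming_md5 == stored_md5  # binary file: checksum is authoritative
        if incoming_md5:
            return False  # legacy doc without a stored md5: reprocess to store it
        if incoming_mtime and stored_mtime:
            return incoming_mtime == stored_mtime  # Workspace file: timestamp fallback
        return False  # nothing stored to compare against: reprocess

    # Renaming a binary file leaves md5 intact even though modifiedTime moves:
    assert is_content_unchanged("abc123", "abc123", "2024-06-02T10:00:00Z", "2024-06-01T09:00:00Z")
    # Renaming a Workspace file bumps modifiedTime, so it is treated as changed
    # (the known limitation the fallback comment describes):
    assert not is_content_unchanged(None, None, "2024-06-02T10:00:00Z", "2024-06-01T09:00:00Z")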
@@ -569,6 +680,27 @@ async def _process_single_file(
     try:
         logger.info(f"Processing file: {file_name} ({mime_type})")

+        # Early check: is this a rename-only update? This avoids downloading
+        # and ETL processing for files where only the name changed but the
+        # content is the same.
+        is_rename_only, rename_message = await _check_rename_only_update(
+            session=session,
+            file=file,
+            search_space_id=search_space_id,
+        )
+
+        if is_rename_only:
+            await task_logger.log_task_progress(
+                log_entry,
+                f"Skipped ETL for {file_name}: {rename_message}",
+                {"status": "rename_only", "reason": rename_message},
+            )
+            # Renamed files count as indexed (the stored document really was
+            # updated); unchanged files count as skipped.
+            if "renamed" in (rename_message or "").lower():
+                return 1, 0
+            return 0, 1
+
         _, error, _ = await download_and_process_file(
             client=drive_client,
             file=file,
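[Reviewer note] The early-return convention matters for run totals: a rename-only file reports (1, 0) because its stored document really was updated, while an unchanged file reports (0, 1). Assuming the caller sums per-file tuples into the documents_indexed/documents_skipped totals seen in _index_with_delta_sync (the aggregation loop itself is outside this diff), the accounting looks like:

    def tally(results: list[tuple[int, int]]) -> tuple[int, int]:
        # Sum per-file (indexed, skipped) tuples into run totals.
        indexed = sum(i for i, _ in results)
        skipped = sum(s for _, s in results)
        return indexed, skipped

    # One renamed file, one unchanged file, one fully reprocessed file:
    assert tally([(1, 0), (0, 1), (1, 0)]) == (2, 1)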