feat: enhance Google Drive indexing to track skipped files

- Updated the indexing function to return the count of skipped files alongside indexed files, improving tracking of indexing performance.
- Added logic to accumulate skipped file counts during the indexing process, providing better insights into potential issues.
- Enhanced notification updates to include skipped file counts, ensuring comprehensive progress reporting for users.
2026-06-12 20:45:20 +02:00 · 2026-03-19 20:27:36 +05:30 · 2026-03-19 20:27:36 +05:30 · eac4cb6075
commit eac4cb6075
parent 83d9c49a50
2 changed files with 27 additions and 2 deletions
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@@ -2433,6 +2433,7 @@ async def run_google_drive_indexing(
        else:
            # Update notification to storing stage
            if notification:
+                await session.refresh(notification)
                await NotificationService.connector_indexing.notify_indexing_progress(
                    session=session,
                    notification=notification,
--- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
@@ -194,6 +194,31 @@ async def index_google_drive_files(
                on_heartbeat_callback=on_heartbeat_callback,
                enable_summary=connector_enable_summary,
            )
+            documents_indexed, documents_skipped = result
+
+            # Reconciliation: full scan re-indexes documents that were manually
+            # deleted from SurfSense but still exist in Google Drive.
+            # Already-indexed files are skipped via md5/modifiedTime checks,
+            # so the overhead is just one API listing call + fast DB lookups.
+            logger.info("Running reconciliation scan after delta sync")
+            reconcile_result = await _index_full_scan(
+                drive_client=drive_client,
+                session=session,
+                connector=connector,
+                connector_id=connector_id,
+                search_space_id=search_space_id,
+                user_id=user_id,
+                folder_id=target_folder_id,
+                folder_name=target_folder_name,
+                task_logger=task_logger,
+                log_entry=log_entry,
+                max_files=max_files,
+                include_subfolders=include_subfolders,
+                on_heartbeat_callback=on_heartbeat_callback,
+                enable_summary=connector_enable_summary,
+            )
+            documents_indexed += reconcile_result[0]
+            documents_skipped += reconcile_result[1]
        else:
            logger.info(f"Using full scan for connector {connector_id}")
            result = await _index_full_scan(
@@ -212,8 +237,7 @@ async def index_google_drive_files(
                on_heartbeat_callback=on_heartbeat_callback,
                enable_summary=connector_enable_summary,
            )
-
-        documents_indexed, documents_skipped = result
+            documents_indexed, documents_skipped = result

        if documents_indexed > 0 or can_use_delta_sync:
            new_token, token_error = await get_start_page_token(drive_client)