From 4e0749f9070972a37067407a90954dca50ab0249 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 27 Mar 2026 20:01:08 +0530 Subject: [PATCH] fix: update file skipping logic for failed documents in Google Drive indexer - Modified the `_should_skip_file` function to skip previously failed documents during automatic sync instead of reprocessing them; a failed document can still be retried manually via Quick Index. - Updated the corresponding test to reflect the new behavior, ensuring that failed documents are correctly identified and skipped during automatic sync. --- .../app/tasks/connector_indexers/google_drive_indexer.py | 2 +- .../integration/indexing_pipeline/test_drive_pipeline.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index d67665d99..74101ed74 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -150,7 +150,7 @@ async def _should_skip_file( return True, f"File renamed: '{old_name}' → '{file_name}'" if not DocumentStatus.is_state(existing.status, DocumentStatus.READY): - return False, None + return True, "skipped (previously failed)" return True, "unchanged" diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_drive_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_drive_pipeline.py index 77128ebd9..2fffa9053 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_drive_pipeline.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_drive_pipeline.py @@ -110,10 +110,10 @@ async def test_drive_legacy_doc_migrated( assert row.unique_identifier_hash == native_hash -async def test_should_skip_file_does_not_skip_failed_document( +async def test_should_skip_file_skips_failed_document( db_session, db_search_space, db_user, ): - 
"""A FAILED document with unchanged md5 must NOT be skipped — it needs reprocessing.""" + """A FAILED document with unchanged md5 must be skipped — user can manually retry via Quick Index.""" import importlib import sys import types @@ -164,6 +164,7 @@ async def test_should_skip_file_does_not_skip_failed_document( incoming_file = {"id": file_id, "name": "Failed File.pdf", "mimeType": "application/pdf", "md5Checksum": md5} - should_skip, _msg = await _should_skip_file(db_session, incoming_file, space_id) + should_skip, msg = await _should_skip_file(db_session, incoming_file, space_id) - assert not should_skip, "FAILED documents must not be skipped even when content is unchanged" + assert should_skip, "FAILED documents must be skipped during automatic sync" + assert "failed" in msg.lower()