refactor: enhance file skipping logic across Dropbox, Google Drive, and OneDrive connectors to return unsupported extensions, improving error reporting and maintainability

This commit is contained in:
Anish Sarkar 2026-04-07 03:16:34 +05:30
parent e7beeb2a36
commit 3a1d700817
14 changed files with 344 additions and 160 deletions

View file

@ -265,7 +265,10 @@ def full_scan_mocks(mock_dropbox_client, monkeypatch):
async def _fake_skip(session, file, search_space_id):
from app.connectors.dropbox.file_types import should_skip_file as _skip
if _skip(file):
item_skip, unsup_ext = _skip(file)
if item_skip:
if unsup_ext:
return True, f"unsupported:{unsup_ext}"
return True, "folder/non-downloadable"
return skip_results.get(file.get("id", ""), (False, None))
@ -541,7 +544,7 @@ async def test_delta_sync_deletions_call_remove_document(monkeypatch):
mock_task_logger = MagicMock()
mock_task_logger.log_task_progress = AsyncMock()
indexed, skipped, cursor = await _index_with_delta_sync(
indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
mock_client,
AsyncMock(),
_CONNECTOR_ID,
@ -578,7 +581,7 @@ async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch):
mock_task_logger = MagicMock()
mock_task_logger.log_task_progress = AsyncMock()
indexed, skipped, cursor = await _index_with_delta_sync(
indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
mock_client,
AsyncMock(),
_CONNECTOR_ID,
@ -628,7 +631,7 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch):
mock_task_logger = MagicMock()
mock_task_logger.log_task_progress = AsyncMock()
indexed, skipped, cursor = await _index_with_delta_sync(
indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
mock_client,
AsyncMock(),
_CONNECTOR_ID,
@ -662,7 +665,7 @@ async def test_delta_sync_returns_new_cursor(monkeypatch):
mock_task_logger = MagicMock()
mock_task_logger.log_task_progress = AsyncMock()
indexed, skipped, cursor = await _index_with_delta_sync(
indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
mock_client,
AsyncMock(),
_CONNECTOR_ID,

View file

@ -497,7 +497,7 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch):
mock_task_logger = MagicMock()
mock_task_logger.log_task_progress = AsyncMock()
indexed, skipped = await _index_with_delta_sync(
indexed, skipped, unsupported = await _index_with_delta_sync(
MagicMock(),
mock_session,
MagicMock(),

View file

@ -384,7 +384,7 @@ async def test_gdrive_full_scan_skips_over_quota(gdrive_full_scan_mocks, monkeyp
m["download_mock"].return_value = ([], 0)
m["batch_mock"].return_value = ([], 2, 0)
_indexed, skipped = await _run_gdrive_full_scan(m)
_indexed, skipped, _unsup = await _run_gdrive_full_scan(m)
call_files = m["download_mock"].call_args[0][1]
assert len(call_files) == 2
@ -459,7 +459,7 @@ async def test_gdrive_delta_sync_skips_over_quota(monkeypatch):
mock_task_logger = MagicMock()
mock_task_logger.log_task_progress = AsyncMock()
_indexed, skipped = await _mod._index_with_delta_sync(
_indexed, skipped, _unsupported = await _mod._index_with_delta_sync(
MagicMock(),
session,
MagicMock(),

View file

@ -14,17 +14,23 @@ pytestmark = pytest.mark.unit
def test_folder_item_is_skipped():
item = {".tag": "folder", "name": "My Folder"}
assert should_skip_file(item) is True
skip, ext = should_skip_file(item)
assert skip is True
assert ext is None
def test_paper_file_is_not_skipped():
item = {".tag": "file", "name": "notes.paper", "is_downloadable": False}
assert should_skip_file(item) is False
skip, ext = should_skip_file(item)
assert skip is False
assert ext is None
def test_non_downloadable_item_is_skipped():
item = {".tag": "file", "name": "locked.gdoc", "is_downloadable": False}
assert should_skip_file(item) is True
skip, ext = should_skip_file(item)
assert skip is True
assert ext is None
# ---------------------------------------------------------------------------
@ -49,7 +55,9 @@ def test_non_downloadable_item_is_skipped():
def test_non_parseable_extensions_are_skipped(filename, mocker):
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
item = {".tag": "file", "name": filename}
assert should_skip_file(item) is True, f"{filename} should be skipped"
skip, ext = should_skip_file(item)
assert skip is True, f"{filename} should be skipped"
assert ext is not None
@pytest.mark.parametrize(
@ -65,9 +73,9 @@ def test_parseable_documents_are_not_skipped(filename, mocker):
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {".tag": "file", "name": filename}
assert should_skip_file(item) is False, (
f"{filename} should NOT be skipped with {service}"
)
skip, ext = should_skip_file(item)
assert skip is False, f"{filename} should NOT be skipped with {service}"
assert ext is None
@pytest.mark.parametrize(
@ -79,9 +87,9 @@ def test_universal_images_are_not_skipped(filename, mocker):
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {".tag": "file", "name": filename}
assert should_skip_file(item) is False, (
f"{filename} should NOT be skipped with {service}"
)
skip, ext = should_skip_file(item)
assert skip is False, f"{filename} should NOT be skipped with {service}"
assert ext is None
@pytest.mark.parametrize("filename,service,expected_skip", [
@ -111,6 +119,20 @@ def test_universal_images_are_not_skipped(filename, mocker):
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {".tag": "file", "name": filename}
assert should_skip_file(item) is expected_skip, (
skip, ext = should_skip_file(item)
assert skip is expected_skip, (
f"{filename} with {service}: expected skip={expected_skip}"
)
if expected_skip:
assert ext is not None
else:
assert ext is None
def test_returns_unsupported_extension(mocker):
"""When a file is skipped due to unsupported extension, the ext string is returned."""
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
item = {".tag": "file", "name": "old.doc"}
skip, ext = should_skip_file(item)
assert skip is True
assert ext == ".doc"

View file

@ -14,7 +14,8 @@ def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mock
"""Truly unsupported files are skipped no matter which ETL service is configured."""
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
assert should_skip_by_extension(filename) is True
skip, ext = should_skip_by_extension(filename)
assert skip is True
@pytest.mark.parametrize("filename", [
@ -25,9 +26,9 @@ def test_universal_extensions_are_not_skipped(filename, mocker):
"""Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped."""
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
assert should_skip_by_extension(filename) is False, (
f"{filename} should NOT be skipped with {service}"
)
skip, ext = should_skip_by_extension(filename)
assert skip is False, f"{filename} should NOT be skipped with {service}"
assert ext is None
@pytest.mark.parametrize("filename,service,expected_skip", [
@ -42,6 +43,19 @@ def test_universal_extensions_are_not_skipped(filename, mocker):
])
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
mocker.patch("app.config.config.ETL_SERVICE", service)
assert should_skip_by_extension(filename) is expected_skip, (
skip, ext = should_skip_by_extension(filename)
assert skip is expected_skip, (
f"{filename} with {service}: expected skip={expected_skip}"
)
if expected_skip:
assert ext is not None, "unsupported extension should be returned"
else:
assert ext is None
def test_returns_unsupported_extension(mocker):
"""When a file is skipped, the unsupported extension string is returned."""
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
skip, ext = should_skip_by_extension("macro.docm")
assert skip is True
assert ext == ".docm"

View file

@ -14,22 +14,30 @@ pytestmark = pytest.mark.unit
def test_folder_is_skipped():
item = {"folder": {}, "name": "My Folder"}
assert should_skip_file(item) is True
skip, ext = should_skip_file(item)
assert skip is True
assert ext is None
def test_remote_item_is_skipped():
item = {"remoteItem": {}, "name": "shared.docx"}
assert should_skip_file(item) is True
skip, ext = should_skip_file(item)
assert skip is True
assert ext is None
def test_package_is_skipped():
item = {"package": {}, "name": "notebook"}
assert should_skip_file(item) is True
skip, ext = should_skip_file(item)
assert skip is True
assert ext is None
def test_onenote_is_skipped():
item = {"name": "notes", "file": {"mimeType": "application/msonenote"}}
assert should_skip_file(item) is True
skip, ext = should_skip_file(item)
assert skip is True
assert ext is None
# ---------------------------------------------------------------------------
@ -43,7 +51,9 @@ def test_onenote_is_skipped():
def test_unsupported_extensions_are_skipped(filename, mocker):
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
assert should_skip_file(item) is True, f"{filename} should be skipped"
skip, ext = should_skip_file(item)
assert skip is True, f"{filename} should be skipped"
assert ext is not None
@pytest.mark.parametrize("filename", [
@ -54,9 +64,9 @@ def test_universal_files_are_not_skipped(filename, mocker):
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
assert should_skip_file(item) is False, (
f"{filename} should NOT be skipped with {service}"
)
skip, ext = should_skip_file(item)
assert skip is False, f"{filename} should NOT be skipped with {service}"
assert ext is None
@pytest.mark.parametrize("filename,service,expected_skip", [
@ -70,6 +80,20 @@ def test_universal_files_are_not_skipped(filename, mocker):
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
assert should_skip_file(item) is expected_skip, (
skip, ext = should_skip_file(item)
assert skip is expected_skip, (
f"{filename} with {service}: expected skip={expected_skip}"
)
if expected_skip:
assert ext is not None
else:
assert ext is None
def test_returns_unsupported_extension(mocker):
"""When a file is skipped due to unsupported extension, the ext string is returned."""
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
item = {"name": "mail.eml", "file": {"mimeType": "application/octet-stream"}}
skip, ext = should_skip_file(item)
assert skip is True
assert ext == ".eml"