mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 01:06:23 +02:00
refactor: enhance file skipping logic across Dropbox, Google Drive, and OneDrive connectors to return unsupported extensions, improving error reporting and maintainability
This commit is contained in:
parent
e7beeb2a36
commit
3a1d700817
14 changed files with 344 additions and 160 deletions
|
|
@ -265,7 +265,10 @@ def full_scan_mocks(mock_dropbox_client, monkeypatch):
|
|||
|
||||
async def _fake_skip(session, file, search_space_id):
|
||||
from app.connectors.dropbox.file_types import should_skip_file as _skip
|
||||
if _skip(file):
|
||||
item_skip, unsup_ext = _skip(file)
|
||||
if item_skip:
|
||||
if unsup_ext:
|
||||
return True, f"unsupported:{unsup_ext}"
|
||||
return True, "folder/non-downloadable"
|
||||
return skip_results.get(file.get("id", ""), (False, None))
|
||||
|
||||
|
|
@ -541,7 +544,7 @@ async def test_delta_sync_deletions_call_remove_document(monkeypatch):
|
|||
mock_task_logger = MagicMock()
|
||||
mock_task_logger.log_task_progress = AsyncMock()
|
||||
|
||||
indexed, skipped, cursor = await _index_with_delta_sync(
|
||||
indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
|
||||
mock_client,
|
||||
AsyncMock(),
|
||||
_CONNECTOR_ID,
|
||||
|
|
@ -578,7 +581,7 @@ async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch):
|
|||
mock_task_logger = MagicMock()
|
||||
mock_task_logger.log_task_progress = AsyncMock()
|
||||
|
||||
indexed, skipped, cursor = await _index_with_delta_sync(
|
||||
indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
|
||||
mock_client,
|
||||
AsyncMock(),
|
||||
_CONNECTOR_ID,
|
||||
|
|
@ -628,7 +631,7 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch):
|
|||
mock_task_logger = MagicMock()
|
||||
mock_task_logger.log_task_progress = AsyncMock()
|
||||
|
||||
indexed, skipped, cursor = await _index_with_delta_sync(
|
||||
indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
|
||||
mock_client,
|
||||
AsyncMock(),
|
||||
_CONNECTOR_ID,
|
||||
|
|
@ -662,7 +665,7 @@ async def test_delta_sync_returns_new_cursor(monkeypatch):
|
|||
mock_task_logger = MagicMock()
|
||||
mock_task_logger.log_task_progress = AsyncMock()
|
||||
|
||||
indexed, skipped, cursor = await _index_with_delta_sync(
|
||||
indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
|
||||
mock_client,
|
||||
AsyncMock(),
|
||||
_CONNECTOR_ID,
|
||||
|
|
|
|||
|
|
@ -497,7 +497,7 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch):
|
|||
mock_task_logger = MagicMock()
|
||||
mock_task_logger.log_task_progress = AsyncMock()
|
||||
|
||||
indexed, skipped = await _index_with_delta_sync(
|
||||
indexed, skipped, unsupported = await _index_with_delta_sync(
|
||||
MagicMock(),
|
||||
mock_session,
|
||||
MagicMock(),
|
||||
|
|
|
|||
|
|
@ -384,7 +384,7 @@ async def test_gdrive_full_scan_skips_over_quota(gdrive_full_scan_mocks, monkeyp
|
|||
m["download_mock"].return_value = ([], 0)
|
||||
m["batch_mock"].return_value = ([], 2, 0)
|
||||
|
||||
_indexed, skipped = await _run_gdrive_full_scan(m)
|
||||
_indexed, skipped, _unsup = await _run_gdrive_full_scan(m)
|
||||
|
||||
call_files = m["download_mock"].call_args[0][1]
|
||||
assert len(call_files) == 2
|
||||
|
|
@ -459,7 +459,7 @@ async def test_gdrive_delta_sync_skips_over_quota(monkeypatch):
|
|||
mock_task_logger = MagicMock()
|
||||
mock_task_logger.log_task_progress = AsyncMock()
|
||||
|
||||
_indexed, skipped = await _mod._index_with_delta_sync(
|
||||
_indexed, skipped, _unsupported = await _mod._index_with_delta_sync(
|
||||
MagicMock(),
|
||||
session,
|
||||
MagicMock(),
|
||||
|
|
|
|||
|
|
@ -14,17 +14,23 @@ pytestmark = pytest.mark.unit
|
|||
|
||||
def test_folder_item_is_skipped():
|
||||
item = {".tag": "folder", "name": "My Folder"}
|
||||
assert should_skip_file(item) is True
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True
|
||||
assert ext is None
|
||||
|
||||
|
||||
def test_paper_file_is_not_skipped():
|
||||
item = {".tag": "file", "name": "notes.paper", "is_downloadable": False}
|
||||
assert should_skip_file(item) is False
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is False
|
||||
assert ext is None
|
||||
|
||||
|
||||
def test_non_downloadable_item_is_skipped():
|
||||
item = {".tag": "file", "name": "locked.gdoc", "is_downloadable": False}
|
||||
assert should_skip_file(item) is True
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True
|
||||
assert ext is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -49,7 +55,9 @@ def test_non_downloadable_item_is_skipped():
|
|||
def test_non_parseable_extensions_are_skipped(filename, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is True, f"{filename} should be skipped"
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True, f"{filename} should be skipped"
|
||||
assert ext is not None
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
|
@ -65,9 +73,9 @@ def test_parseable_documents_are_not_skipped(filename, mocker):
|
|||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is False, f"{filename} should NOT be skipped with {service}"
|
||||
assert ext is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
|
@ -79,9 +87,9 @@ def test_universal_images_are_not_skipped(filename, mocker):
|
|||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is False, f"{filename} should NOT be skipped with {service}"
|
||||
assert ext is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename,service,expected_skip", [
|
||||
|
|
@ -111,6 +119,20 @@ def test_universal_images_are_not_skipped(filename, mocker):
|
|||
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is expected_skip, (
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is expected_skip, (
|
||||
f"{filename} with {service}: expected skip={expected_skip}"
|
||||
)
|
||||
if expected_skip:
|
||||
assert ext is not None
|
||||
else:
|
||||
assert ext is None
|
||||
|
||||
|
||||
def test_returns_unsupported_extension(mocker):
|
||||
"""When a file is skipped due to unsupported extension, the ext string is returned."""
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
item = {".tag": "file", "name": "old.doc"}
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True
|
||||
assert ext == ".doc"
|
||||
|
|
|
|||
|
|
@ -14,7 +14,8 @@ def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mock
|
|||
"""Truly unsupported files are skipped no matter which ETL service is configured."""
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
assert should_skip_by_extension(filename) is True
|
||||
skip, ext = should_skip_by_extension(filename)
|
||||
assert skip is True
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [
|
||||
|
|
@ -25,9 +26,9 @@ def test_universal_extensions_are_not_skipped(filename, mocker):
|
|||
"""Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped."""
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
assert should_skip_by_extension(filename) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
skip, ext = should_skip_by_extension(filename)
|
||||
assert skip is False, f"{filename} should NOT be skipped with {service}"
|
||||
assert ext is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename,service,expected_skip", [
|
||||
|
|
@ -42,6 +43,19 @@ def test_universal_extensions_are_not_skipped(filename, mocker):
|
|||
])
|
||||
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
assert should_skip_by_extension(filename) is expected_skip, (
|
||||
skip, ext = should_skip_by_extension(filename)
|
||||
assert skip is expected_skip, (
|
||||
f"{filename} with {service}: expected skip={expected_skip}"
|
||||
)
|
||||
if expected_skip:
|
||||
assert ext is not None, "unsupported extension should be returned"
|
||||
else:
|
||||
assert ext is None
|
||||
|
||||
|
||||
def test_returns_unsupported_extension(mocker):
|
||||
"""When a file is skipped, the unsupported extension string is returned."""
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
skip, ext = should_skip_by_extension("macro.docm")
|
||||
assert skip is True
|
||||
assert ext == ".docm"
|
||||
|
|
|
|||
|
|
@ -14,22 +14,30 @@ pytestmark = pytest.mark.unit
|
|||
|
||||
def test_folder_is_skipped():
|
||||
item = {"folder": {}, "name": "My Folder"}
|
||||
assert should_skip_file(item) is True
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True
|
||||
assert ext is None
|
||||
|
||||
|
||||
def test_remote_item_is_skipped():
|
||||
item = {"remoteItem": {}, "name": "shared.docx"}
|
||||
assert should_skip_file(item) is True
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True
|
||||
assert ext is None
|
||||
|
||||
|
||||
def test_package_is_skipped():
|
||||
item = {"package": {}, "name": "notebook"}
|
||||
assert should_skip_file(item) is True
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True
|
||||
assert ext is None
|
||||
|
||||
|
||||
def test_onenote_is_skipped():
|
||||
item = {"name": "notes", "file": {"mimeType": "application/msonenote"}}
|
||||
assert should_skip_file(item) is True
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True
|
||||
assert ext is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -43,7 +51,9 @@ def test_onenote_is_skipped():
|
|||
def test_unsupported_extensions_are_skipped(filename, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
|
||||
assert should_skip_file(item) is True, f"{filename} should be skipped"
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True, f"{filename} should be skipped"
|
||||
assert ext is not None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [
|
||||
|
|
@ -54,9 +64,9 @@ def test_universal_files_are_not_skipped(filename, mocker):
|
|||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
|
||||
assert should_skip_file(item) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is False, f"{filename} should NOT be skipped with {service}"
|
||||
assert ext is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename,service,expected_skip", [
|
||||
|
|
@ -70,6 +80,20 @@ def test_universal_files_are_not_skipped(filename, mocker):
|
|||
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
|
||||
assert should_skip_file(item) is expected_skip, (
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is expected_skip, (
|
||||
f"{filename} with {service}: expected skip={expected_skip}"
|
||||
)
|
||||
if expected_skip:
|
||||
assert ext is not None
|
||||
else:
|
||||
assert ext is None
|
||||
|
||||
|
||||
def test_returns_unsupported_extension(mocker):
|
||||
"""When a file is skipped due to unsupported extension, the ext string is returned."""
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
item = {"name": "mail.eml", "file": {"mimeType": "application/octet-stream"}}
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True
|
||||
assert ext == ".eml"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue