mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-30 03:16:25 +02:00
refactor: enhance file skipping logic across Dropbox, Google Drive, and OneDrive connectors to return unsupported extensions, improving error reporting and maintainability
This commit is contained in:
parent
e7beeb2a36
commit
3a1d700817
14 changed files with 344 additions and 160 deletions
|
|
@ -14,17 +14,23 @@ pytestmark = pytest.mark.unit
|
|||
|
||||
def test_folder_item_is_skipped():
|
||||
item = {".tag": "folder", "name": "My Folder"}
|
||||
assert should_skip_file(item) is True
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True
|
||||
assert ext is None
|
||||
|
||||
|
||||
def test_paper_file_is_not_skipped():
|
||||
item = {".tag": "file", "name": "notes.paper", "is_downloadable": False}
|
||||
assert should_skip_file(item) is False
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is False
|
||||
assert ext is None
|
||||
|
||||
|
||||
def test_non_downloadable_item_is_skipped():
|
||||
item = {".tag": "file", "name": "locked.gdoc", "is_downloadable": False}
|
||||
assert should_skip_file(item) is True
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True
|
||||
assert ext is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -49,7 +55,9 @@ def test_non_downloadable_item_is_skipped():
|
|||
def test_non_parseable_extensions_are_skipped(filename, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is True, f"{filename} should be skipped"
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True, f"{filename} should be skipped"
|
||||
assert ext is not None
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
|
@ -65,9 +73,9 @@ def test_parseable_documents_are_not_skipped(filename, mocker):
|
|||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is False, f"{filename} should NOT be skipped with {service}"
|
||||
assert ext is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
|
@ -79,9 +87,9 @@ def test_universal_images_are_not_skipped(filename, mocker):
|
|||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is False, f"{filename} should NOT be skipped with {service}"
|
||||
assert ext is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename,service,expected_skip", [
|
||||
|
|
@ -111,6 +119,20 @@ def test_universal_images_are_not_skipped(filename, mocker):
|
|||
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is expected_skip, (
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is expected_skip, (
|
||||
f"{filename} with {service}: expected skip={expected_skip}"
|
||||
)
|
||||
if expected_skip:
|
||||
assert ext is not None
|
||||
else:
|
||||
assert ext is None
|
||||
|
||||
|
||||
def test_returns_unsupported_extension(mocker):
|
||||
"""When a file is skipped due to unsupported extension, the ext string is returned."""
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
item = {".tag": "file", "name": "old.doc"}
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True
|
||||
assert ext == ".doc"
|
||||
|
|
|
|||
|
|
@ -14,7 +14,8 @@ def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mock
|
|||
"""Truly unsupported files are skipped no matter which ETL service is configured."""
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
assert should_skip_by_extension(filename) is True
|
||||
skip, ext = should_skip_by_extension(filename)
|
||||
assert skip is True
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [
|
||||
|
|
@ -25,9 +26,9 @@ def test_universal_extensions_are_not_skipped(filename, mocker):
|
|||
"""Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped."""
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
assert should_skip_by_extension(filename) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
skip, ext = should_skip_by_extension(filename)
|
||||
assert skip is False, f"{filename} should NOT be skipped with {service}"
|
||||
assert ext is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename,service,expected_skip", [
|
||||
|
|
@ -42,6 +43,19 @@ def test_universal_extensions_are_not_skipped(filename, mocker):
|
|||
])
|
||||
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
assert should_skip_by_extension(filename) is expected_skip, (
|
||||
skip, ext = should_skip_by_extension(filename)
|
||||
assert skip is expected_skip, (
|
||||
f"{filename} with {service}: expected skip={expected_skip}"
|
||||
)
|
||||
if expected_skip:
|
||||
assert ext is not None, "unsupported extension should be returned"
|
||||
else:
|
||||
assert ext is None
|
||||
|
||||
|
||||
def test_returns_unsupported_extension(mocker):
|
||||
"""When a file is skipped, the unsupported extension string is returned."""
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
skip, ext = should_skip_by_extension("macro.docm")
|
||||
assert skip is True
|
||||
assert ext == ".docm"
|
||||
|
|
|
|||
|
|
@ -14,22 +14,30 @@ pytestmark = pytest.mark.unit
|
|||
|
||||
def test_folder_is_skipped():
|
||||
item = {"folder": {}, "name": "My Folder"}
|
||||
assert should_skip_file(item) is True
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True
|
||||
assert ext is None
|
||||
|
||||
|
||||
def test_remote_item_is_skipped():
|
||||
item = {"remoteItem": {}, "name": "shared.docx"}
|
||||
assert should_skip_file(item) is True
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True
|
||||
assert ext is None
|
||||
|
||||
|
||||
def test_package_is_skipped():
|
||||
item = {"package": {}, "name": "notebook"}
|
||||
assert should_skip_file(item) is True
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True
|
||||
assert ext is None
|
||||
|
||||
|
||||
def test_onenote_is_skipped():
|
||||
item = {"name": "notes", "file": {"mimeType": "application/msonenote"}}
|
||||
assert should_skip_file(item) is True
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True
|
||||
assert ext is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -43,7 +51,9 @@ def test_onenote_is_skipped():
|
|||
def test_unsupported_extensions_are_skipped(filename, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
|
||||
assert should_skip_file(item) is True, f"{filename} should be skipped"
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True, f"{filename} should be skipped"
|
||||
assert ext is not None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [
|
||||
|
|
@ -54,9 +64,9 @@ def test_universal_files_are_not_skipped(filename, mocker):
|
|||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
|
||||
assert should_skip_file(item) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is False, f"{filename} should NOT be skipped with {service}"
|
||||
assert ext is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename,service,expected_skip", [
|
||||
|
|
@ -70,6 +80,20 @@ def test_universal_files_are_not_skipped(filename, mocker):
|
|||
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
|
||||
assert should_skip_file(item) is expected_skip, (
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is expected_skip, (
|
||||
f"{filename} with {service}: expected skip={expected_skip}"
|
||||
)
|
||||
if expected_skip:
|
||||
assert ext is not None
|
||||
else:
|
||||
assert ext is None
|
||||
|
||||
|
||||
def test_returns_unsupported_extension(mocker):
|
||||
"""When a file is skipped due to unsupported extension, the ext string is returned."""
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
item = {"name": "mail.eml", "file": {"mimeType": "application/octet-stream"}}
|
||||
skip, ext = should_skip_file(item)
|
||||
assert skip is True
|
||||
assert ext == ".eml"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue