refactor: unify file-skipping logic across the Dropbox, Google Drive, and OneDrive connectors. The per-connector classification checks are replaced with a centralized, service-based approach, improving maintainability and keeping file handling consistent.

This commit is contained in:
Anish Sarkar 2026-04-07 02:19:31 +05:30
parent f03bf05aaa
commit e7beeb2a36
13 changed files with 388 additions and 67 deletions

View file

@ -7,6 +7,11 @@ from app.connectors.dropbox.file_types import should_skip_file
pytestmark = pytest.mark.unit
# ---------------------------------------------------------------------------
# Structural skips (independent of ETL service)
# ---------------------------------------------------------------------------
def test_folder_item_is_skipped():
    """A Dropbox entry tagged as a folder is always skipped."""
    folder_entry = {".tag": "folder", "name": "My Folder"}
    skipped = should_skip_file(folder_entry)
    assert skipped is True
@ -22,13 +27,18 @@ def test_non_downloadable_item_is_skipped():
assert should_skip_file(item) is True
# ---------------------------------------------------------------------------
# Extension-based skips (require ETL service context)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize(
"filename",
[
"archive.zip", "backup.tar", "data.gz", "stuff.rar", "pack.7z",
"program.exe", "lib.dll", "module.so", "image.dmg", "disk.iso",
"movie.mov", "clip.avi", "video.mkv", "film.wmv", "stream.flv",
"icon.svg", "anim.gif", "photo.webp", "shot.heic", "favicon.ico",
"favicon.ico",
"raw.cr2", "photo.nef", "image.arw", "pic.dng",
"design.psd", "vector.ai", "mockup.sketch", "proto.fig",
"font.ttf", "font.otf", "font.woff", "font.woff2",
@ -36,7 +46,8 @@ def test_non_downloadable_item_is_skipped():
"local.db", "data.sqlite", "access.mdb",
],
)
def test_non_parseable_extensions_are_skipped(filename):
def test_non_parseable_extensions_are_skipped(filename, mocker):
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
item = {".tag": "file", "name": filename}
assert should_skip_file(item) is True, f"{filename} should be skipped"
@ -45,29 +56,61 @@ def test_non_parseable_extensions_are_skipped(filename):
"filename",
[
"report.pdf", "document.docx", "sheet.xlsx", "slides.pptx",
"old.doc", "legacy.xls", "deck.ppt",
"readme.txt", "data.csv", "page.html", "notes.md",
"config.json", "feed.xml",
],
)
def test_parseable_documents_are_not_skipped(filename):
item = {".tag": "file", "name": filename}
assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
def test_parseable_documents_are_not_skipped(filename, mocker):
"""Files in plaintext/direct_convert/universal document sets are never skipped."""
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {".tag": "file", "name": filename}
assert should_skip_file(item) is False, (
f"{filename} should NOT be skipped with {service}"
)
@pytest.mark.parametrize(
    "filename",
    ["photo.jpg", "image.jpeg", "screenshot.png", "scan.bmp", "page.tiff", "doc.tif"],
)
def test_universal_images_are_not_skipped(filename, mocker):
    """Images supported by all parsers are never skipped.

    The check is repeated with each ETL service patched into the config so a
    regression for any single service is reported with that service's name.
    """
    # The item depends only on the filename, so build it once outside the loop.
    item = {".tag": "file", "name": filename}
    for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
        mocker.patch("app.config.config.ETL_SERVICE", service)
        assert should_skip_file(item) is False, (
            f"{filename} should NOT be skipped with {service}"
        )
@pytest.mark.parametrize(
    "filename,service,expected_skip",
    [
        ("old.doc", "DOCLING", True),
        ("old.doc", "LLAMACLOUD", False),
        ("old.doc", "UNSTRUCTURED", False),
        ("legacy.xls", "DOCLING", True),
        ("legacy.xls", "LLAMACLOUD", False),
        ("legacy.xls", "UNSTRUCTURED", False),
        ("deck.ppt", "DOCLING", True),
        ("deck.ppt", "LLAMACLOUD", False),
        ("deck.ppt", "UNSTRUCTURED", False),
        ("icon.svg", "DOCLING", True),
        ("icon.svg", "LLAMACLOUD", False),
        ("anim.gif", "DOCLING", True),
        ("anim.gif", "LLAMACLOUD", False),
        ("photo.webp", "DOCLING", False),
        ("photo.webp", "LLAMACLOUD", False),
        ("photo.webp", "UNSTRUCTURED", True),
        ("live.heic", "DOCLING", True),
        ("live.heic", "UNSTRUCTURED", False),
        ("macro.docm", "DOCLING", True),
        ("macro.docm", "LLAMACLOUD", False),
        ("mail.eml", "DOCLING", True),
        ("mail.eml", "UNSTRUCTURED", False),
    ],
)
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
    """Skip decisions for parser-specific extensions follow the ETL service.

    Each row patches ``app.config.config.ETL_SERVICE`` to ``service`` and
    checks that a Dropbox file item named ``filename`` is skipped exactly
    when ``expected_skip`` is true.
    """
    mocker.patch("app.config.config.ETL_SERVICE", service)
    item = {".tag": "file", "name": filename}
    # Only the per-row expectation is asserted; an unconditional
    # "is True" check would fail every expected_skip=False row.
    assert should_skip_file(item) is expected_skip, (
        f"{filename} with {service}: expected skip={expected_skip}"
    )

View file

@ -10,13 +10,38 @@ pytestmark = pytest.mark.unit
@pytest.mark.parametrize("filename", [
    "malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
])
def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mocker):
    """Truly unsupported files are skipped no matter which ETL service is configured."""
    for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
        mocker.patch("app.config.config.ETL_SERVICE", service)
        # Name the service in the failure message so a failure inside the
        # loop identifies which configuration broke.
        assert should_skip_by_extension(filename) is True, (
            f"{filename} should be skipped with {service}"
        )
@pytest.mark.parametrize("filename", [
    "report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
    "readme.txt", "data.csv", "photo.png", "notes.md",
])
def test_universal_extensions_are_not_skipped(filename, mocker):
    """Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped.

    The assertion is repeated with each ETL service patched into the config;
    the failure message names the service that produced the wrong decision.
    """
    for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
        mocker.patch("app.config.config.ETL_SERVICE", service)
        assert should_skip_by_extension(filename) is False, (
            f"{filename} should NOT be skipped with {service}"
        )
@pytest.mark.parametrize(
    "filename,service,expected_skip",
    [
        ("macro.docm", "DOCLING", True),
        ("macro.docm", "LLAMACLOUD", False),
        ("mail.eml", "DOCLING", True),
        ("mail.eml", "UNSTRUCTURED", False),
        ("photo.gif", "DOCLING", True),
        ("photo.gif", "LLAMACLOUD", False),
        ("photo.heic", "UNSTRUCTURED", False),
        ("photo.heic", "DOCLING", True),
    ],
)
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
    """The extension-based skip decision matches the expectation for the patched ETL service."""
    mocker.patch("app.config.config.ETL_SERVICE", service)
    failure_note = f"{filename} with {service}: expected skip={expected_skip}"
    assert should_skip_by_extension(filename) is expected_skip, failure_note

View file

@ -7,6 +7,11 @@ from app.connectors.onedrive.file_types import should_skip_file
pytestmark = pytest.mark.unit
# ---------------------------------------------------------------------------
# Structural skips (independent of ETL service)
# ---------------------------------------------------------------------------
def test_folder_is_skipped():
    """A OneDrive item carrying a ``folder`` key is always skipped."""
    folder_entry = {"folder": {}, "name": "My Folder"}
    skipped = should_skip_file(folder_entry)
    assert skipped is True
@ -27,10 +32,16 @@ def test_onenote_is_skipped():
assert should_skip_file(item) is True
# ---------------------------------------------------------------------------
# Extension-based skips (require ETL service context)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize("filename", [
    "malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
])
def test_unsupported_extensions_are_skipped(filename, mocker):
    """Unsupported extensions are skipped for OneDrive file items.

    The ETL service is pinned to DOCLING so the extension check has a
    defined service context.
    """
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
    assert should_skip_file(item) is True, f"{filename} should be skipped"
@ -39,6 +50,26 @@ def test_unsupported_extensions_are_skipped(filename):
"report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
"readme.txt", "data.csv", "photo.png", "notes.md",
])
def test_parseable_files_are_not_skipped(filename):
def test_universal_files_are_not_skipped(filename, mocker):
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
assert should_skip_file(item) is False, (
f"{filename} should NOT be skipped with {service}"
)
@pytest.mark.parametrize(
    "filename,service,expected_skip",
    [
        ("macro.docm", "DOCLING", True),
        ("macro.docm", "LLAMACLOUD", False),
        ("mail.eml", "DOCLING", True),
        ("mail.eml", "UNSTRUCTURED", False),
        ("photo.heic", "UNSTRUCTURED", False),
        ("photo.heic", "DOCLING", True),
    ],
)
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
    """Skip decisions for parser-specific extensions follow the ETL service.

    Each row patches ``app.config.config.ETL_SERVICE`` to ``service`` and
    checks that a OneDrive file item named ``filename`` is skipped exactly
    when ``expected_skip`` is true.
    """
    mocker.patch("app.config.config.ETL_SERVICE", service)
    item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
    # Only the per-row expectation is asserted; an unconditional
    # "is False" check would fail every expected_skip=True row.
    assert should_skip_file(item) is expected_skip, (
        f"{filename} with {service}: expected skip={expected_skip}"
    )