refactor: unify file skipping logic across Dropbox, Google Drive, and OneDrive connectors by replacing classification checks with a centralized service-based approach, enhancing maintainability and consistency in file handling

This commit is contained in:
Anish Sarkar 2026-04-07 02:19:31 +05:30
parent f03bf05aaa
commit e7beeb2a36
13 changed files with 388 additions and 67 deletions

View file

@ -261,6 +261,8 @@ def full_scan_mocks(mock_dropbox_client, monkeypatch):
skip_results: dict[str, tuple[bool, str | None]] = {}
monkeypatch.setattr("app.config.config.ETL_SERVICE", "LLAMACLOUD")
async def _fake_skip(session, file, search_space_id):
from app.connectors.dropbox.file_types import should_skip_file as _skip
if _skip(file):

View file

@ -7,6 +7,11 @@ from app.connectors.dropbox.file_types import should_skip_file
pytestmark = pytest.mark.unit
# ---------------------------------------------------------------------------
# Structural skips (independent of ETL service)
# ---------------------------------------------------------------------------
def test_folder_item_is_skipped():
item = {".tag": "folder", "name": "My Folder"}
assert should_skip_file(item) is True
@ -22,13 +27,18 @@ def test_non_downloadable_item_is_skipped():
assert should_skip_file(item) is True
# ---------------------------------------------------------------------------
# Extension-based skips (require ETL service context)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize(
"filename",
[
"archive.zip", "backup.tar", "data.gz", "stuff.rar", "pack.7z",
"program.exe", "lib.dll", "module.so", "image.dmg", "disk.iso",
"movie.mov", "clip.avi", "video.mkv", "film.wmv", "stream.flv",
"icon.svg", "anim.gif", "photo.webp", "shot.heic", "favicon.ico",
"favicon.ico",
"raw.cr2", "photo.nef", "image.arw", "pic.dng",
"design.psd", "vector.ai", "mockup.sketch", "proto.fig",
"font.ttf", "font.otf", "font.woff", "font.woff2",
@ -36,7 +46,8 @@ def test_non_downloadable_item_is_skipped():
"local.db", "data.sqlite", "access.mdb",
],
)
def test_non_parseable_extensions_are_skipped(filename):
def test_non_parseable_extensions_are_skipped(filename, mocker):
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
item = {".tag": "file", "name": filename}
assert should_skip_file(item) is True, f"{filename} should be skipped"
@ -45,29 +56,61 @@ def test_non_parseable_extensions_are_skipped(filename):
"filename",
[
"report.pdf", "document.docx", "sheet.xlsx", "slides.pptx",
"old.doc", "legacy.xls", "deck.ppt",
"readme.txt", "data.csv", "page.html", "notes.md",
"config.json", "feed.xml",
],
)
def test_parseable_documents_are_not_skipped(filename):
item = {".tag": "file", "name": filename}
assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
def test_parseable_documents_are_not_skipped(filename, mocker):
"""Files in plaintext/direct_convert/universal document sets are never skipped."""
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {".tag": "file", "name": filename}
assert should_skip_file(item) is False, (
f"{filename} should NOT be skipped with {service}"
)
@pytest.mark.parametrize(
"filename",
["photo.jpg", "image.jpeg", "screenshot.png", "scan.bmp", "page.tiff", "doc.tif"],
)
def test_universal_images_are_not_skipped(filename):
item = {".tag": "file", "name": filename}
assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
def test_universal_images_are_not_skipped(filename, mocker):
"""Images supported by all parsers are never skipped."""
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {".tag": "file", "name": filename}
assert should_skip_file(item) is False, (
f"{filename} should NOT be skipped with {service}"
)
@pytest.mark.parametrize(
"filename",
["icon.svg", "anim.gif", "photo.webp", "live.heic"],
)
def test_non_universal_images_are_skipped(filename):
@pytest.mark.parametrize("filename,service,expected_skip", [
("old.doc", "DOCLING", True),
("old.doc", "LLAMACLOUD", False),
("old.doc", "UNSTRUCTURED", False),
("legacy.xls", "DOCLING", True),
("legacy.xls", "LLAMACLOUD", False),
("legacy.xls", "UNSTRUCTURED", False),
("deck.ppt", "DOCLING", True),
("deck.ppt", "LLAMACLOUD", False),
("deck.ppt", "UNSTRUCTURED", False),
("icon.svg", "DOCLING", True),
("icon.svg", "LLAMACLOUD", False),
("anim.gif", "DOCLING", True),
("anim.gif", "LLAMACLOUD", False),
("photo.webp", "DOCLING", False),
("photo.webp", "LLAMACLOUD", False),
("photo.webp", "UNSTRUCTURED", True),
("live.heic", "DOCLING", True),
("live.heic", "UNSTRUCTURED", False),
("macro.docm", "DOCLING", True),
("macro.docm", "LLAMACLOUD", False),
("mail.eml", "DOCLING", True),
("mail.eml", "UNSTRUCTURED", False),
])
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {".tag": "file", "name": filename}
assert should_skip_file(item) is True, f"{filename} should be skipped"
assert should_skip_file(item) is expected_skip, (
f"{filename} with {service}: expected skip={expected_skip}"
)

View file

@ -10,13 +10,38 @@ pytestmark = pytest.mark.unit
@pytest.mark.parametrize("filename", [
"malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
])
def test_unsupported_extensions_are_skipped(filename):
assert should_skip_by_extension(filename) is True
def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mocker):
"""Truly unsupported files are skipped no matter which ETL service is configured."""
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
assert should_skip_by_extension(filename) is True
@pytest.mark.parametrize("filename", [
"report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
"readme.txt", "data.csv", "photo.png", "notes.md",
])
def test_parseable_extensions_are_not_skipped(filename):
assert should_skip_by_extension(filename) is False
def test_universal_extensions_are_not_skipped(filename, mocker):
"""Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped."""
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
assert should_skip_by_extension(filename) is False, (
f"{filename} should NOT be skipped with {service}"
)
@pytest.mark.parametrize("filename,service,expected_skip", [
("macro.docm", "DOCLING", True),
("macro.docm", "LLAMACLOUD", False),
("mail.eml", "DOCLING", True),
("mail.eml", "UNSTRUCTURED", False),
("photo.gif", "DOCLING", True),
("photo.gif", "LLAMACLOUD", False),
("photo.heic", "UNSTRUCTURED", False),
("photo.heic", "DOCLING", True),
])
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
mocker.patch("app.config.config.ETL_SERVICE", service)
assert should_skip_by_extension(filename) is expected_skip, (
f"{filename} with {service}: expected skip={expected_skip}"
)

View file

@ -7,6 +7,11 @@ from app.connectors.onedrive.file_types import should_skip_file
pytestmark = pytest.mark.unit
# ---------------------------------------------------------------------------
# Structural skips (independent of ETL service)
# ---------------------------------------------------------------------------
def test_folder_is_skipped():
item = {"folder": {}, "name": "My Folder"}
assert should_skip_file(item) is True
@ -27,10 +32,16 @@ def test_onenote_is_skipped():
assert should_skip_file(item) is True
# ---------------------------------------------------------------------------
# Extension-based skips (require ETL service context)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize("filename", [
"malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
])
def test_unsupported_extensions_are_skipped(filename):
def test_unsupported_extensions_are_skipped(filename, mocker):
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
assert should_skip_file(item) is True, f"{filename} should be skipped"
@ -39,6 +50,26 @@ def test_unsupported_extensions_are_skipped(filename):
"report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
"readme.txt", "data.csv", "photo.png", "notes.md",
])
def test_parseable_files_are_not_skipped(filename):
def test_universal_files_are_not_skipped(filename, mocker):
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
assert should_skip_file(item) is False, (
f"{filename} should NOT be skipped with {service}"
)
@pytest.mark.parametrize("filename,service,expected_skip", [
("macro.docm", "DOCLING", True),
("macro.docm", "LLAMACLOUD", False),
("mail.eml", "DOCLING", True),
("mail.eml", "UNSTRUCTURED", False),
("photo.heic", "UNSTRUCTURED", False),
("photo.heic", "DOCLING", True),
])
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
assert should_skip_file(item) is expected_skip, (
f"{filename} with {service}: expected skip={expected_skip}"
)

View file

@ -377,3 +377,72 @@ async def test_extract_zip_raises_unsupported_error(tmp_path):
await EtlPipelineService().extract(
EtlRequest(file_path=str(zip_file), filename="archive.zip")
)
# ---------------------------------------------------------------------------
# Slice 14 should_skip_for_service (per-parser document filtering)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize("filename,etl_service,expected_skip", [
("file.eml", "DOCLING", True),
("file.eml", "UNSTRUCTURED", False),
("file.docm", "LLAMACLOUD", False),
("file.docm", "DOCLING", True),
("file.txt", "DOCLING", False),
("file.csv", "LLAMACLOUD", False),
("file.mp3", "UNSTRUCTURED", False),
("file.exe", "LLAMACLOUD", True),
("file.pdf", "DOCLING", False),
("file.webp", "DOCLING", False),
("file.webp", "UNSTRUCTURED", True),
("file.gif", "LLAMACLOUD", False),
("file.gif", "DOCLING", True),
("file.heic", "UNSTRUCTURED", False),
("file.heic", "DOCLING", True),
("file.svg", "LLAMACLOUD", False),
("file.svg", "DOCLING", True),
("file.p7s", "UNSTRUCTURED", False),
("file.p7s", "LLAMACLOUD", True),
])
def test_should_skip_for_service(filename, etl_service, expected_skip):
from app.etl_pipeline.file_classifier import should_skip_for_service
assert should_skip_for_service(filename, etl_service) is expected_skip, (
f"{filename} with {etl_service}: expected skip={expected_skip}"
)
# ---------------------------------------------------------------------------
# Slice 14b ETL pipeline rejects per-parser incompatible documents
# ---------------------------------------------------------------------------
async def test_extract_docm_with_docling_raises_unsupported(tmp_path, mocker):
"""Docling cannot parse .docm -- pipeline should reject before dispatching."""
from app.etl_pipeline.exceptions import EtlUnsupportedFileError
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
docm_file = tmp_path / "macro.docm"
docm_file.write_bytes(b"\x00" * 10)
with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"):
await EtlPipelineService().extract(
EtlRequest(file_path=str(docm_file), filename="macro.docm")
)
async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker):
"""Docling cannot parse .eml -- pipeline should reject before dispatching."""
from app.etl_pipeline.exceptions import EtlUnsupportedFileError
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
eml_file = tmp_path / "mail.eml"
eml_file.write_bytes(b"From: test@example.com")
with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"):
await EtlPipelineService().extract(
EtlRequest(file_path=str(eml_file), filename="mail.eml")
)

View file

@ -21,10 +21,17 @@ def test_exe_is_not_supported_document():
"report.pdf", "doc.docx", "old.doc",
"sheet.xlsx", "legacy.xls",
"slides.pptx", "deck.ppt",
"macro.docm", "macro.xlsm", "macro.pptm",
"photo.png", "photo.jpg", "photo.jpeg", "scan.bmp", "scan.tiff", "scan.tif",
"photo.webp", "anim.gif", "iphone.heic",
"manual.rtf", "book.epub",
"letter.odt", "data.ods", "presentation.odp",
"korean.hwpx",
"inbox.eml", "outlook.msg",
"korean.hwpx", "korean.hwp",
"template.dot", "template.dotm",
"template.pot", "template.potx",
"binary.xlsb", "workspace.xlw",
"vector.svg", "signature.p7s",
])
def test_document_extensions_are_supported(filename):
from app.utils.file_extensions import is_supported_document_extension
@ -40,3 +47,70 @@ def test_non_document_extensions_are_not_supported(filename):
from app.utils.file_extensions import is_supported_document_extension
assert is_supported_document_extension(filename) is False, f"{filename} should NOT be supported"
# ---------------------------------------------------------------------------
# Per-parser extension sets
# ---------------------------------------------------------------------------
def test_union_equals_all_three_sets():
from app.utils.file_extensions import (
DOCLING_DOCUMENT_EXTENSIONS,
DOCUMENT_EXTENSIONS,
LLAMAPARSE_DOCUMENT_EXTENSIONS,
UNSTRUCTURED_DOCUMENT_EXTENSIONS,
)
expected = (
DOCLING_DOCUMENT_EXTENSIONS
| LLAMAPARSE_DOCUMENT_EXTENSIONS
| UNSTRUCTURED_DOCUMENT_EXTENSIONS
)
assert DOCUMENT_EXTENSIONS == expected
def test_get_extensions_for_docling():
from app.utils.file_extensions import get_document_extensions_for_service
exts = get_document_extensions_for_service("DOCLING")
assert ".pdf" in exts
assert ".webp" in exts
assert ".docx" in exts
assert ".eml" not in exts
assert ".docm" not in exts
assert ".gif" not in exts
assert ".heic" not in exts
def test_get_extensions_for_llamacloud():
from app.utils.file_extensions import get_document_extensions_for_service
exts = get_document_extensions_for_service("LLAMACLOUD")
assert ".docm" in exts
assert ".gif" in exts
assert ".svg" in exts
assert ".hwp" in exts
assert ".eml" not in exts
assert ".heic" not in exts
def test_get_extensions_for_unstructured():
from app.utils.file_extensions import get_document_extensions_for_service
exts = get_document_extensions_for_service("UNSTRUCTURED")
assert ".eml" in exts
assert ".heic" in exts
assert ".p7s" in exts
assert ".docm" not in exts
assert ".gif" not in exts
assert ".svg" not in exts
def test_get_extensions_for_none_returns_union():
from app.utils.file_extensions import (
DOCUMENT_EXTENSIONS,
get_document_extensions_for_service,
)
assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS