refactor: unify file skipping logic across Dropbox, Google Drive, and OneDrive connectors by replacing classification checks with a centralized service-based approach, enhancing maintainability and consistency in file handling

This commit is contained in:
Anish Sarkar 2026-04-07 02:19:31 +05:30
parent f03bf05aaa
commit e7beeb2a36
13 changed files with 388 additions and 67 deletions

View file

@ -21,10 +21,17 @@ def test_exe_is_not_supported_document():
"report.pdf", "doc.docx", "old.doc",
"sheet.xlsx", "legacy.xls",
"slides.pptx", "deck.ppt",
"macro.docm", "macro.xlsm", "macro.pptm",
"photo.png", "photo.jpg", "photo.jpeg", "scan.bmp", "scan.tiff", "scan.tif",
"photo.webp", "anim.gif", "iphone.heic",
"manual.rtf", "book.epub",
"letter.odt", "data.ods", "presentation.odp",
"korean.hwpx",
"inbox.eml", "outlook.msg",
"korean.hwpx", "korean.hwp",
"template.dot", "template.dotm",
"template.pot", "template.potx",
"binary.xlsb", "workspace.xlw",
"vector.svg", "signature.p7s",
])
def test_document_extensions_are_supported(filename):
from app.utils.file_extensions import is_supported_document_extension
@ -40,3 +47,70 @@ def test_non_document_extensions_are_not_supported(filename):
from app.utils.file_extensions import is_supported_document_extension
assert is_supported_document_extension(filename) is False, f"{filename} should NOT be supported"
# ---------------------------------------------------------------------------
# Per-parser extension sets
# ---------------------------------------------------------------------------
def test_union_equals_all_three_sets():
    """DOCUMENT_EXTENSIONS must equal the union of the three per-parser sets."""
    from app.utils.file_extensions import (
        DOCLING_DOCUMENT_EXTENSIONS,
        DOCUMENT_EXTENSIONS,
        LLAMAPARSE_DOCUMENT_EXTENSIONS,
        UNSTRUCTURED_DOCUMENT_EXTENSIONS,
    )

    # Build the union one parser at a time, in the same left-to-right order.
    combined = DOCLING_DOCUMENT_EXTENSIONS | LLAMAPARSE_DOCUMENT_EXTENSIONS
    combined = combined | UNSTRUCTURED_DOCUMENT_EXTENSIONS

    assert DOCUMENT_EXTENSIONS == combined
def test_get_extensions_for_docling():
    """Check which extensions the "DOCLING" service set includes and excludes."""
    from app.utils.file_extensions import get_document_extensions_for_service

    docling_exts = get_document_extensions_for_service("DOCLING")

    # Extensions expected to be present for this service.
    for included in (".pdf", ".webp", ".docx"):
        assert included in docling_exts

    # Extensions expected to be absent for this service.
    for excluded in (".eml", ".docm", ".gif", ".heic"):
        assert excluded not in docling_exts
def test_get_extensions_for_llamacloud():
    """Check which extensions the "LLAMACLOUD" service set includes and excludes."""
    from app.utils.file_extensions import get_document_extensions_for_service

    llamacloud_exts = get_document_extensions_for_service("LLAMACLOUD")

    # Extensions expected to be present for this service.
    for included in (".docm", ".gif", ".svg", ".hwp"):
        assert included in llamacloud_exts

    # Extensions expected to be absent for this service.
    for excluded in (".eml", ".heic"):
        assert excluded not in llamacloud_exts
def test_get_extensions_for_unstructured():
    """Check which extensions the "UNSTRUCTURED" service set includes and excludes."""
    from app.utils.file_extensions import get_document_extensions_for_service

    unstructured_exts = get_document_extensions_for_service("UNSTRUCTURED")

    # Extensions expected to be present for this service.
    for included in (".eml", ".heic", ".p7s"):
        assert included in unstructured_exts

    # Extensions expected to be absent for this service.
    for excluded in (".docm", ".gif", ".svg"):
        assert excluded not in unstructured_exts
def test_get_extensions_for_none_returns_union():
    """Passing None as the service must yield DOCUMENT_EXTENSIONS."""
    from app.utils.file_extensions import (
        DOCUMENT_EXTENSIONS,
        get_document_extensions_for_service,
    )

    exts_without_service = get_document_extensions_for_service(None)

    assert exts_without_service == DOCUMENT_EXTENSIONS