refactor: unify file skipping logic across Dropbox, Google Drive, and OneDrive connectors by replacing classification checks with a centralized service-based approach, enhancing maintainability and consistency in file handling

This commit is contained in:
Anish Sarkar 2026-04-07 02:19:31 +05:30
parent f03bf05aaa
commit e7beeb2a36
13 changed files with 388 additions and 67 deletions

View file

@ -1,6 +1,6 @@
"""File type handlers for Dropbox.""" """File type handlers for Dropbox."""
from app.etl_pipeline.file_classifier import FileCategory, classify_file from app.etl_pipeline.file_classifier import should_skip_for_service
PAPER_EXTENSION = ".paper" PAPER_EXTENSION = ".paper"
@ -53,5 +53,7 @@ def should_skip_file(item: dict) -> bool:
return False return False
if not item.get("is_downloadable", True): if not item.get("is_downloadable", True):
return True return True
from app.config import config as app_config
name = item.get("name", "") name = item.get("name", "")
return classify_file(name) == FileCategory.UNSUPPORTED return should_skip_for_service(name, app_config.ETL_SERVICE)

View file

@ -1,6 +1,6 @@
"""File type handlers for Google Drive.""" """File type handlers for Google Drive."""
from app.etl_pipeline.file_classifier import FileCategory, classify_file from app.etl_pipeline.file_classifier import should_skip_for_service
GOOGLE_DOC = "application/vnd.google-apps.document" GOOGLE_DOC = "application/vnd.google-apps.document"
GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet" GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet"
@ -49,8 +49,10 @@ def should_skip_file(mime_type: str) -> bool:
def should_skip_by_extension(filename: str) -> bool: def should_skip_by_extension(filename: str) -> bool:
"""Return True if the file extension is not parseable by any ETL pipeline.""" """Return True if the file extension is not parseable by the configured ETL service."""
return classify_file(filename) == FileCategory.UNSUPPORTED from app.config import config as app_config
return should_skip_for_service(filename, app_config.ETL_SERVICE)
def get_export_mime_type(mime_type: str) -> str | None: def get_export_mime_type(mime_type: str) -> str | None:

View file

@ -1,6 +1,6 @@
"""File type handlers for Microsoft OneDrive.""" """File type handlers for Microsoft OneDrive."""
from app.etl_pipeline.file_classifier import FileCategory, classify_file from app.etl_pipeline.file_classifier import should_skip_for_service
ONEDRIVE_FOLDER_FACET = "folder" ONEDRIVE_FOLDER_FACET = "folder"
ONENOTE_MIME = "application/msonenote" ONENOTE_MIME = "application/msonenote"
@ -51,5 +51,7 @@ def should_skip_file(item: dict) -> bool:
mime = item.get("file", {}).get("mimeType", "") mime = item.get("file", {}).get("mimeType", "")
if mime in SKIP_MIME_TYPES: if mime in SKIP_MIME_TYPES:
return True return True
from app.config import config as app_config
name = item.get("name", "") name = item.get("name", "")
return classify_file(name) == FileCategory.UNSUPPORTED return should_skip_for_service(name, app_config.ETL_SERVICE)

View file

@ -45,6 +45,10 @@ class EtlPipelineService:
return await self._extract_document(request) return await self._extract_document(request)
async def _extract_document(self, request: EtlRequest) -> EtlResult: async def _extract_document(self, request: EtlRequest) -> EtlResult:
from pathlib import PurePosixPath
from app.utils.file_extensions import get_document_extensions_for_service
etl_service = app_config.ETL_SERVICE etl_service = app_config.ETL_SERVICE
if not etl_service: if not etl_service:
raise EtlServiceUnavailableError( raise EtlServiceUnavailableError(
@ -52,6 +56,13 @@ class EtlPipelineService:
"Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env" "Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env"
) )
ext = PurePosixPath(request.filename).suffix.lower()
supported = get_document_extensions_for_service(etl_service)
if ext not in supported:
raise EtlUnsupportedFileError(
f"File type {ext} is not supported by {etl_service}"
)
if etl_service == "DOCLING": if etl_service == "DOCLING":
from app.etl_pipeline.parsers.docling import parse_with_docling from app.etl_pipeline.parsers.docling import parse_with_docling

View file

@ -1,7 +1,7 @@
from enum import Enum from enum import Enum
from pathlib import PurePosixPath from pathlib import PurePosixPath
from app.utils.file_extensions import DOCUMENT_EXTENSIONS from app.utils.file_extensions import DOCUMENT_EXTENSIONS, get_document_extensions_for_service
PLAINTEXT_EXTENSIONS = frozenset( PLAINTEXT_EXTENSIONS = frozenset(
{ {
@ -29,7 +29,7 @@ AUDIO_EXTENSIONS = frozenset(
{".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"} {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"}
) )
DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm"}) DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm", ".xhtml"})
class FileCategory(Enum): class FileCategory(Enum):
@ -51,3 +51,18 @@ def classify_file(filename: str) -> FileCategory:
if suffix in DOCUMENT_EXTENSIONS: if suffix in DOCUMENT_EXTENSIONS:
return FileCategory.DOCUMENT return FileCategory.DOCUMENT
return FileCategory.UNSUPPORTED return FileCategory.UNSUPPORTED
def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
"""Return True if *filename* cannot be processed by *etl_service*.
Plaintext, audio, and direct-convert files are parser-agnostic and never
skipped. Document files are checked against the per-parser extension set.
"""
category = classify_file(filename)
if category == FileCategory.UNSUPPORTED:
return True
if category == FileCategory.DOCUMENT:
suffix = PurePosixPath(filename).suffix.lower()
return suffix not in get_document_extensions_for_service(etl_service)
return False

View file

@ -1,29 +1,69 @@
"""Allowlist of document extensions the ETL parsers can handle. """Per-parser document extension sets for the ETL pipeline.
Every consumer (file_classifier, connector-level skip checks) imports from Every consumer (file_classifier, connector-level skip checks, ETL pipeline
here so there is a single source of truth. Extensions already covered by validation) imports from here so there is a single source of truth.
PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or DIRECT_CONVERT_EXTENSIONS in
file_classifier are NOT repeated here -- this set is exclusively for the Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or
"document" ETL path (Docling / LlamaParse / Unstructured). DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
sets are exclusively for the "document" ETL path (Docling / LlamaParse /
Unstructured).
""" """
from pathlib import PurePosixPath from pathlib import PurePosixPath
DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({ # ---------------------------------------------------------------------------
# PDF # Per-parser document extension sets (from official documentation)
# ---------------------------------------------------------------------------
DOCLING_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
".pdf", ".pdf",
# Microsoft Office ".docx", ".xlsx", ".pptx",
".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt", ".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp",
# Images (raster: OCR / vision parsing)
".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif",
# Rich text / e-book
".rtf", ".epub",
# OpenDocument
".odt", ".ods", ".odp",
# Other (LlamaParse / Unstructured specific)
".hwpx",
}) })
LLAMAPARSE_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
".pdf",
".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
".docm", ".dot", ".dotm", ".pptm", ".pot", ".potx",
".xlsm", ".xlsb", ".xlw",
".rtf", ".epub",
".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".tif", ".webp", ".svg",
".odt", ".ods", ".odp",
".hwp", ".hwpx",
})
UNSTRUCTURED_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
".pdf",
".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif", ".heic",
".rtf", ".epub", ".odt",
".eml", ".msg", ".p7s",
})
# ---------------------------------------------------------------------------
# Union (used by classify_file for routing) + service lookup
# ---------------------------------------------------------------------------
DOCUMENT_EXTENSIONS: frozenset[str] = (
DOCLING_DOCUMENT_EXTENSIONS
| LLAMAPARSE_DOCUMENT_EXTENSIONS
| UNSTRUCTURED_DOCUMENT_EXTENSIONS
)
_SERVICE_MAP: dict[str, frozenset[str]] = {
"DOCLING": DOCLING_DOCUMENT_EXTENSIONS,
"LLAMACLOUD": LLAMAPARSE_DOCUMENT_EXTENSIONS,
"UNSTRUCTURED": UNSTRUCTURED_DOCUMENT_EXTENSIONS,
}
def get_document_extensions_for_service(etl_service: str | None) -> frozenset[str]:
"""Return the document extensions supported by *etl_service*.
Falls back to the full union when the service is ``None`` or unknown.
"""
return _SERVICE_MAP.get(etl_service or "", DOCUMENT_EXTENSIONS)
def is_supported_document_extension(filename: str) -> bool: def is_supported_document_extension(filename: str) -> bool:
"""Return True if the file's extension is in the supported document set.""" """Return True if the file's extension is in the supported document set."""

View file

@ -261,6 +261,8 @@ def full_scan_mocks(mock_dropbox_client, monkeypatch):
skip_results: dict[str, tuple[bool, str | None]] = {} skip_results: dict[str, tuple[bool, str | None]] = {}
monkeypatch.setattr("app.config.config.ETL_SERVICE", "LLAMACLOUD")
async def _fake_skip(session, file, search_space_id): async def _fake_skip(session, file, search_space_id):
from app.connectors.dropbox.file_types import should_skip_file as _skip from app.connectors.dropbox.file_types import should_skip_file as _skip
if _skip(file): if _skip(file):

View file

@ -7,6 +7,11 @@ from app.connectors.dropbox.file_types import should_skip_file
pytestmark = pytest.mark.unit pytestmark = pytest.mark.unit
# ---------------------------------------------------------------------------
# Structural skips (independent of ETL service)
# ---------------------------------------------------------------------------
def test_folder_item_is_skipped(): def test_folder_item_is_skipped():
item = {".tag": "folder", "name": "My Folder"} item = {".tag": "folder", "name": "My Folder"}
assert should_skip_file(item) is True assert should_skip_file(item) is True
@ -22,13 +27,18 @@ def test_non_downloadable_item_is_skipped():
assert should_skip_file(item) is True assert should_skip_file(item) is True
# ---------------------------------------------------------------------------
# Extension-based skips (require ETL service context)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize( @pytest.mark.parametrize(
"filename", "filename",
[ [
"archive.zip", "backup.tar", "data.gz", "stuff.rar", "pack.7z", "archive.zip", "backup.tar", "data.gz", "stuff.rar", "pack.7z",
"program.exe", "lib.dll", "module.so", "image.dmg", "disk.iso", "program.exe", "lib.dll", "module.so", "image.dmg", "disk.iso",
"movie.mov", "clip.avi", "video.mkv", "film.wmv", "stream.flv", "movie.mov", "clip.avi", "video.mkv", "film.wmv", "stream.flv",
"icon.svg", "anim.gif", "photo.webp", "shot.heic", "favicon.ico", "favicon.ico",
"raw.cr2", "photo.nef", "image.arw", "pic.dng", "raw.cr2", "photo.nef", "image.arw", "pic.dng",
"design.psd", "vector.ai", "mockup.sketch", "proto.fig", "design.psd", "vector.ai", "mockup.sketch", "proto.fig",
"font.ttf", "font.otf", "font.woff", "font.woff2", "font.ttf", "font.otf", "font.woff", "font.woff2",
@ -36,7 +46,8 @@ def test_non_downloadable_item_is_skipped():
"local.db", "data.sqlite", "access.mdb", "local.db", "data.sqlite", "access.mdb",
], ],
) )
def test_non_parseable_extensions_are_skipped(filename): def test_non_parseable_extensions_are_skipped(filename, mocker):
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
item = {".tag": "file", "name": filename} item = {".tag": "file", "name": filename}
assert should_skip_file(item) is True, f"{filename} should be skipped" assert should_skip_file(item) is True, f"{filename} should be skipped"
@ -45,29 +56,61 @@ def test_non_parseable_extensions_are_skipped(filename):
"filename", "filename",
[ [
"report.pdf", "document.docx", "sheet.xlsx", "slides.pptx", "report.pdf", "document.docx", "sheet.xlsx", "slides.pptx",
"old.doc", "legacy.xls", "deck.ppt",
"readme.txt", "data.csv", "page.html", "notes.md", "readme.txt", "data.csv", "page.html", "notes.md",
"config.json", "feed.xml", "config.json", "feed.xml",
], ],
) )
def test_parseable_documents_are_not_skipped(filename): def test_parseable_documents_are_not_skipped(filename, mocker):
"""Files in plaintext/direct_convert/universal document sets are never skipped."""
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {".tag": "file", "name": filename} item = {".tag": "file", "name": filename}
assert should_skip_file(item) is False, f"{filename} should NOT be skipped" assert should_skip_file(item) is False, (
f"{filename} should NOT be skipped with {service}"
)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"filename", "filename",
["photo.jpg", "image.jpeg", "screenshot.png", "scan.bmp", "page.tiff", "doc.tif"], ["photo.jpg", "image.jpeg", "screenshot.png", "scan.bmp", "page.tiff", "doc.tif"],
) )
def test_universal_images_are_not_skipped(filename): def test_universal_images_are_not_skipped(filename, mocker):
"""Images supported by all parsers are never skipped."""
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {".tag": "file", "name": filename} item = {".tag": "file", "name": filename}
assert should_skip_file(item) is False, f"{filename} should NOT be skipped" assert should_skip_file(item) is False, (
f"{filename} should NOT be skipped with {service}"
@pytest.mark.parametrize(
"filename",
["icon.svg", "anim.gif", "photo.webp", "live.heic"],
) )
def test_non_universal_images_are_skipped(filename):
@pytest.mark.parametrize("filename,service,expected_skip", [
("old.doc", "DOCLING", True),
("old.doc", "LLAMACLOUD", False),
("old.doc", "UNSTRUCTURED", False),
("legacy.xls", "DOCLING", True),
("legacy.xls", "LLAMACLOUD", False),
("legacy.xls", "UNSTRUCTURED", False),
("deck.ppt", "DOCLING", True),
("deck.ppt", "LLAMACLOUD", False),
("deck.ppt", "UNSTRUCTURED", False),
("icon.svg", "DOCLING", True),
("icon.svg", "LLAMACLOUD", False),
("anim.gif", "DOCLING", True),
("anim.gif", "LLAMACLOUD", False),
("photo.webp", "DOCLING", False),
("photo.webp", "LLAMACLOUD", False),
("photo.webp", "UNSTRUCTURED", True),
("live.heic", "DOCLING", True),
("live.heic", "UNSTRUCTURED", False),
("macro.docm", "DOCLING", True),
("macro.docm", "LLAMACLOUD", False),
("mail.eml", "DOCLING", True),
("mail.eml", "UNSTRUCTURED", False),
])
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {".tag": "file", "name": filename} item = {".tag": "file", "name": filename}
assert should_skip_file(item) is True, f"{filename} should be skipped" assert should_skip_file(item) is expected_skip, (
f"{filename} with {service}: expected skip={expected_skip}"
)

View file

@ -10,7 +10,10 @@ pytestmark = pytest.mark.unit
@pytest.mark.parametrize("filename", [ @pytest.mark.parametrize("filename", [
"malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend", "malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
]) ])
def test_unsupported_extensions_are_skipped(filename): def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mocker):
"""Truly unsupported files are skipped no matter which ETL service is configured."""
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
assert should_skip_by_extension(filename) is True assert should_skip_by_extension(filename) is True
@ -18,5 +21,27 @@ def test_unsupported_extensions_are_skipped(filename):
"report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx", "report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
"readme.txt", "data.csv", "photo.png", "notes.md", "readme.txt", "data.csv", "photo.png", "notes.md",
]) ])
def test_parseable_extensions_are_not_skipped(filename): def test_universal_extensions_are_not_skipped(filename, mocker):
assert should_skip_by_extension(filename) is False """Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped."""
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
assert should_skip_by_extension(filename) is False, (
f"{filename} should NOT be skipped with {service}"
)
@pytest.mark.parametrize("filename,service,expected_skip", [
("macro.docm", "DOCLING", True),
("macro.docm", "LLAMACLOUD", False),
("mail.eml", "DOCLING", True),
("mail.eml", "UNSTRUCTURED", False),
("photo.gif", "DOCLING", True),
("photo.gif", "LLAMACLOUD", False),
("photo.heic", "UNSTRUCTURED", False),
("photo.heic", "DOCLING", True),
])
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
mocker.patch("app.config.config.ETL_SERVICE", service)
assert should_skip_by_extension(filename) is expected_skip, (
f"{filename} with {service}: expected skip={expected_skip}"
)

View file

@ -7,6 +7,11 @@ from app.connectors.onedrive.file_types import should_skip_file
pytestmark = pytest.mark.unit pytestmark = pytest.mark.unit
# ---------------------------------------------------------------------------
# Structural skips (independent of ETL service)
# ---------------------------------------------------------------------------
def test_folder_is_skipped(): def test_folder_is_skipped():
item = {"folder": {}, "name": "My Folder"} item = {"folder": {}, "name": "My Folder"}
assert should_skip_file(item) is True assert should_skip_file(item) is True
@ -27,10 +32,16 @@ def test_onenote_is_skipped():
assert should_skip_file(item) is True assert should_skip_file(item) is True
# ---------------------------------------------------------------------------
# Extension-based skips (require ETL service context)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize("filename", [ @pytest.mark.parametrize("filename", [
"malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend", "malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
]) ])
def test_unsupported_extensions_are_skipped(filename): def test_unsupported_extensions_are_skipped(filename, mocker):
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}} item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
assert should_skip_file(item) is True, f"{filename} should be skipped" assert should_skip_file(item) is True, f"{filename} should be skipped"
@ -39,6 +50,26 @@ def test_unsupported_extensions_are_skipped(filename):
"report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx", "report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
"readme.txt", "data.csv", "photo.png", "notes.md", "readme.txt", "data.csv", "photo.png", "notes.md",
]) ])
def test_parseable_files_are_not_skipped(filename): def test_universal_files_are_not_skipped(filename, mocker):
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}} item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
assert should_skip_file(item) is False, f"{filename} should NOT be skipped" assert should_skip_file(item) is False, (
f"{filename} should NOT be skipped with {service}"
)
@pytest.mark.parametrize("filename,service,expected_skip", [
("macro.docm", "DOCLING", True),
("macro.docm", "LLAMACLOUD", False),
("mail.eml", "DOCLING", True),
("mail.eml", "UNSTRUCTURED", False),
("photo.heic", "UNSTRUCTURED", False),
("photo.heic", "DOCLING", True),
])
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
assert should_skip_file(item) is expected_skip, (
f"{filename} with {service}: expected skip={expected_skip}"
)

View file

@ -377,3 +377,72 @@ async def test_extract_zip_raises_unsupported_error(tmp_path):
await EtlPipelineService().extract( await EtlPipelineService().extract(
EtlRequest(file_path=str(zip_file), filename="archive.zip") EtlRequest(file_path=str(zip_file), filename="archive.zip")
) )
# ---------------------------------------------------------------------------
# Slice 14 — should_skip_for_service (per-parser document filtering)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize("filename,etl_service,expected_skip", [
("file.eml", "DOCLING", True),
("file.eml", "UNSTRUCTURED", False),
("file.docm", "LLAMACLOUD", False),
("file.docm", "DOCLING", True),
("file.txt", "DOCLING", False),
("file.csv", "LLAMACLOUD", False),
("file.mp3", "UNSTRUCTURED", False),
("file.exe", "LLAMACLOUD", True),
("file.pdf", "DOCLING", False),
("file.webp", "DOCLING", False),
("file.webp", "UNSTRUCTURED", True),
("file.gif", "LLAMACLOUD", False),
("file.gif", "DOCLING", True),
("file.heic", "UNSTRUCTURED", False),
("file.heic", "DOCLING", True),
("file.svg", "LLAMACLOUD", False),
("file.svg", "DOCLING", True),
("file.p7s", "UNSTRUCTURED", False),
("file.p7s", "LLAMACLOUD", True),
])
def test_should_skip_for_service(filename, etl_service, expected_skip):
from app.etl_pipeline.file_classifier import should_skip_for_service
assert should_skip_for_service(filename, etl_service) is expected_skip, (
f"{filename} with {etl_service}: expected skip={expected_skip}"
)
# ---------------------------------------------------------------------------
# Slice 14b — ETL pipeline rejects per-parser incompatible documents
# ---------------------------------------------------------------------------
async def test_extract_docm_with_docling_raises_unsupported(tmp_path, mocker):
"""Docling cannot parse .docm -- pipeline should reject before dispatching."""
from app.etl_pipeline.exceptions import EtlUnsupportedFileError
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
docm_file = tmp_path / "macro.docm"
docm_file.write_bytes(b"\x00" * 10)
with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"):
await EtlPipelineService().extract(
EtlRequest(file_path=str(docm_file), filename="macro.docm")
)
async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker):
"""Docling cannot parse .eml -- pipeline should reject before dispatching."""
from app.etl_pipeline.exceptions import EtlUnsupportedFileError
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
eml_file = tmp_path / "mail.eml"
eml_file.write_bytes(b"From: test@example.com")
with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"):
await EtlPipelineService().extract(
EtlRequest(file_path=str(eml_file), filename="mail.eml")
)

View file

@ -21,10 +21,17 @@ def test_exe_is_not_supported_document():
"report.pdf", "doc.docx", "old.doc", "report.pdf", "doc.docx", "old.doc",
"sheet.xlsx", "legacy.xls", "sheet.xlsx", "legacy.xls",
"slides.pptx", "deck.ppt", "slides.pptx", "deck.ppt",
"macro.docm", "macro.xlsm", "macro.pptm",
"photo.png", "photo.jpg", "photo.jpeg", "scan.bmp", "scan.tiff", "scan.tif", "photo.png", "photo.jpg", "photo.jpeg", "scan.bmp", "scan.tiff", "scan.tif",
"photo.webp", "anim.gif", "iphone.heic",
"manual.rtf", "book.epub", "manual.rtf", "book.epub",
"letter.odt", "data.ods", "presentation.odp", "letter.odt", "data.ods", "presentation.odp",
"korean.hwpx", "inbox.eml", "outlook.msg",
"korean.hwpx", "korean.hwp",
"template.dot", "template.dotm",
"template.pot", "template.potx",
"binary.xlsb", "workspace.xlw",
"vector.svg", "signature.p7s",
]) ])
def test_document_extensions_are_supported(filename): def test_document_extensions_are_supported(filename):
from app.utils.file_extensions import is_supported_document_extension from app.utils.file_extensions import is_supported_document_extension
@ -40,3 +47,70 @@ def test_non_document_extensions_are_not_supported(filename):
from app.utils.file_extensions import is_supported_document_extension from app.utils.file_extensions import is_supported_document_extension
assert is_supported_document_extension(filename) is False, f"{filename} should NOT be supported" assert is_supported_document_extension(filename) is False, f"{filename} should NOT be supported"
# ---------------------------------------------------------------------------
# Per-parser extension sets
# ---------------------------------------------------------------------------
def test_union_equals_all_three_sets():
from app.utils.file_extensions import (
DOCLING_DOCUMENT_EXTENSIONS,
DOCUMENT_EXTENSIONS,
LLAMAPARSE_DOCUMENT_EXTENSIONS,
UNSTRUCTURED_DOCUMENT_EXTENSIONS,
)
expected = (
DOCLING_DOCUMENT_EXTENSIONS
| LLAMAPARSE_DOCUMENT_EXTENSIONS
| UNSTRUCTURED_DOCUMENT_EXTENSIONS
)
assert DOCUMENT_EXTENSIONS == expected
def test_get_extensions_for_docling():
from app.utils.file_extensions import get_document_extensions_for_service
exts = get_document_extensions_for_service("DOCLING")
assert ".pdf" in exts
assert ".webp" in exts
assert ".docx" in exts
assert ".eml" not in exts
assert ".docm" not in exts
assert ".gif" not in exts
assert ".heic" not in exts
def test_get_extensions_for_llamacloud():
from app.utils.file_extensions import get_document_extensions_for_service
exts = get_document_extensions_for_service("LLAMACLOUD")
assert ".docm" in exts
assert ".gif" in exts
assert ".svg" in exts
assert ".hwp" in exts
assert ".eml" not in exts
assert ".heic" not in exts
def test_get_extensions_for_unstructured():
from app.utils.file_extensions import get_document_extensions_for_service
exts = get_document_extensions_for_service("UNSTRUCTURED")
assert ".eml" in exts
assert ".heic" in exts
assert ".p7s" in exts
assert ".docm" not in exts
assert ".gif" not in exts
assert ".svg" not in exts
def test_get_extensions_for_none_returns_union():
from app.utils.file_extensions import (
DOCUMENT_EXTENSIONS,
get_document_extensions_for_service,
)
assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS

View file

@ -85,7 +85,6 @@ const FILE_TYPE_CONFIG: Record<string, Record<string, string[]>> = {
"application/rtf": [".rtf"], "application/rtf": [".rtf"],
"application/xml": [".xml"], "application/xml": [".xml"],
"application/epub+zip": [".epub"], "application/epub+zip": [".epub"],
"text/html": [".html", ".htm", ".web"],
"image/gif": [".gif"], "image/gif": [".gif"],
"image/svg+xml": [".svg"], "image/svg+xml": [".svg"],
...audioFileTypes, ...audioFileTypes,
@ -472,8 +471,9 @@ export function DocumentUploadTab({
</button> </button>
)) ))
) : ( ) : (
<div <button
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer" type="button"
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent border-none"
onClick={() => { onClick={() => {
if (!isElectron) fileInputRef.current?.click(); if (!isElectron) fileInputRef.current?.click();
}} }}
@ -485,10 +485,11 @@ export function DocumentUploadTab({
</p> </p>
<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p> <p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
</div> </div>
<div className="w-full mt-1" onClick={(e) => e.stopPropagation()}> {/* biome-ignore lint/a11y/useSemanticElements: wrapper to stop click propagation to parent button */}
<div className="w-full mt-1" onClick={(e) => e.stopPropagation()} onKeyDown={(e) => e.stopPropagation()} role="group">
{renderBrowseButton({ fullWidth: true })} {renderBrowseButton({ fullWidth: true })}
</div> </div>
</div> </button>
)} )}
</div> </div>
@ -683,9 +684,13 @@ export function DocumentUploadTab({
</span> </span>
</AccordionTrigger> </AccordionTrigger>
<AccordionContent className="px-3 pb-3"> <AccordionContent className="px-3 pb-3">
<div className="flex flex-wrap gap-1"> <div className="flex flex-wrap gap-1.5">
{supportedExtensions.map((ext) => ( {supportedExtensions.map((ext) => (
<Badge key={ext} variant="outline" className="text-[10px] px-1.5 py-0"> <Badge
key={ext}
variant="secondary"
className="rounded border-0 bg-neutral-200/80 dark:bg-neutral-700/60 text-muted-foreground text-[10px] px-2 py-0.5 font-normal"
>
{ext} {ext}
</Badge> </Badge>
))} ))}