diff --git a/surfsense_backend/app/connectors/dropbox/file_types.py b/surfsense_backend/app/connectors/dropbox/file_types.py index 7b72c1857..13209ffd2 100644 --- a/surfsense_backend/app/connectors/dropbox/file_types.py +++ b/surfsense_backend/app/connectors/dropbox/file_types.py @@ -1,6 +1,6 @@ """File type handlers for Dropbox.""" -from app.etl_pipeline.file_classifier import FileCategory, classify_file +from app.etl_pipeline.file_classifier import should_skip_for_service PAPER_EXTENSION = ".paper" @@ -53,5 +53,7 @@ def should_skip_file(item: dict) -> bool: return False if not item.get("is_downloadable", True): return True + from app.config import config as app_config + name = item.get("name", "") - return classify_file(name) == FileCategory.UNSUPPORTED + return should_skip_for_service(name, app_config.ETL_SERVICE) diff --git a/surfsense_backend/app/connectors/google_drive/file_types.py b/surfsense_backend/app/connectors/google_drive/file_types.py index e0b8f001e..73f016ceb 100644 --- a/surfsense_backend/app/connectors/google_drive/file_types.py +++ b/surfsense_backend/app/connectors/google_drive/file_types.py @@ -1,6 +1,6 @@ """File type handlers for Google Drive.""" -from app.etl_pipeline.file_classifier import FileCategory, classify_file +from app.etl_pipeline.file_classifier import should_skip_for_service GOOGLE_DOC = "application/vnd.google-apps.document" GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet" @@ -49,8 +49,10 @@ def should_skip_file(mime_type: str) -> bool: def should_skip_by_extension(filename: str) -> bool: - """Return True if the file extension is not parseable by any ETL pipeline.""" - return classify_file(filename) == FileCategory.UNSUPPORTED + """Return True if the file extension is not parseable by the configured ETL service.""" + from app.config import config as app_config + + return should_skip_for_service(filename, app_config.ETL_SERVICE) def get_export_mime_type(mime_type: str) -> str | None: diff --git a/surfsense_backend/app/connectors/onedrive/file_types.py b/surfsense_backend/app/connectors/onedrive/file_types.py index bcd78b711..f9c147da8 100644 --- a/surfsense_backend/app/connectors/onedrive/file_types.py +++ b/surfsense_backend/app/connectors/onedrive/file_types.py @@ -1,6 +1,6 @@ """File type handlers for Microsoft OneDrive.""" -from app.etl_pipeline.file_classifier import FileCategory, classify_file +from app.etl_pipeline.file_classifier import should_skip_for_service ONEDRIVE_FOLDER_FACET = "folder" ONENOTE_MIME = "application/msonenote" @@ -51,5 +51,7 @@ def should_skip_file(item: dict) -> bool: mime = item.get("file", {}).get("mimeType", "") if mime in SKIP_MIME_TYPES: return True + from app.config import config as app_config + name = item.get("name", "") - return classify_file(name) == FileCategory.UNSUPPORTED + return should_skip_for_service(name, app_config.ETL_SERVICE) diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py index 7c67d2345..a0041c843 100644 --- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py +++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py @@ -45,6 +45,10 @@ class EtlPipelineService: return await self._extract_document(request) async def _extract_document(self, request: EtlRequest) -> EtlResult: + from pathlib import PurePosixPath + + from app.utils.file_extensions import get_document_extensions_for_service + etl_service = app_config.ETL_SERVICE if not etl_service: raise EtlServiceUnavailableError( @@ -52,6 +56,13 @@ class EtlPipelineService: "Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env" ) + ext = PurePosixPath(request.filename).suffix.lower() + supported = get_document_extensions_for_service(etl_service) + if ext not in supported: + raise EtlUnsupportedFileError( + f"File type {ext} is not supported by {etl_service}" + ) + if etl_service == "DOCLING": from app.etl_pipeline.parsers.docling import parse_with_docling diff --git a/surfsense_backend/app/etl_pipeline/file_classifier.py b/surfsense_backend/app/etl_pipeline/file_classifier.py index eea9cce22..bc7b4537c 100644 --- a/surfsense_backend/app/etl_pipeline/file_classifier.py +++ b/surfsense_backend/app/etl_pipeline/file_classifier.py @@ -1,7 +1,7 @@ from enum import Enum from pathlib import PurePosixPath -from app.utils.file_extensions import DOCUMENT_EXTENSIONS +from app.utils.file_extensions import DOCUMENT_EXTENSIONS, get_document_extensions_for_service PLAINTEXT_EXTENSIONS = frozenset( { @@ -29,7 +29,7 @@ AUDIO_EXTENSIONS = frozenset( {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"} ) -DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm"}) +DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm", ".xhtml"}) class FileCategory(Enum): @@ -51,3 +51,18 @@ def classify_file(filename: str) -> FileCategory: if suffix in DOCUMENT_EXTENSIONS: return FileCategory.DOCUMENT return FileCategory.UNSUPPORTED + + +def should_skip_for_service(filename: str, etl_service: str | None) -> bool: + """Return True if *filename* cannot be processed by *etl_service*. + + Plaintext, audio, and direct-convert files are parser-agnostic and never + skipped. Document files are checked against the per-parser extension set. + """ + category = classify_file(filename) + if category == FileCategory.UNSUPPORTED: + return True + if category == FileCategory.DOCUMENT: + suffix = PurePosixPath(filename).suffix.lower() + return suffix not in get_document_extensions_for_service(etl_service) + return False diff --git a/surfsense_backend/app/utils/file_extensions.py b/surfsense_backend/app/utils/file_extensions.py index b0a4c808c..5eed36872 100644 --- a/surfsense_backend/app/utils/file_extensions.py +++ b/surfsense_backend/app/utils/file_extensions.py @@ -1,29 +1,69 @@ -"""Allowlist of document extensions the ETL parsers can handle. +"""Per-parser document extension sets for the ETL pipeline. -Every consumer (file_classifier, connector-level skip checks) imports from -here so there is a single source of truth. Extensions already covered by -PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or DIRECT_CONVERT_EXTENSIONS in -file_classifier are NOT repeated here -- this set is exclusively for the -"document" ETL path (Docling / LlamaParse / Unstructured). +Every consumer (file_classifier, connector-level skip checks, ETL pipeline +validation) imports from here so there is a single source of truth. + +Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or +DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these +sets are exclusively for the "document" ETL path (Docling / LlamaParse / +Unstructured). """ from pathlib import PurePosixPath -DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({ - # PDF +# --------------------------------------------------------------------------- +# Per-parser document extension sets (from official documentation) +# --------------------------------------------------------------------------- + +DOCLING_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({ ".pdf", - # Microsoft Office - ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt", - # Images (raster: OCR / vision parsing) - ".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif", - # Rich text / e-book - ".rtf", ".epub", - # OpenDocument - ".odt", ".ods", ".odp", - # Other (LlamaParse / Unstructured specific) - ".hwpx", + ".docx", ".xlsx", ".pptx", + ".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp", }) +LLAMAPARSE_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({ + ".pdf", + ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt", + ".docm", ".dot", ".dotm", ".pptm", ".pot", ".potx", + ".xlsm", ".xlsb", ".xlw", + ".rtf", ".epub", + ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".tif", ".webp", ".svg", + ".odt", ".ods", ".odp", + ".hwp", ".hwpx", +}) + +UNSTRUCTURED_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({ + ".pdf", + ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt", + ".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif", ".heic", + ".rtf", ".epub", ".odt", + ".eml", ".msg", ".p7s", +}) + +# --------------------------------------------------------------------------- +# Union (used by classify_file for routing) + service lookup +# --------------------------------------------------------------------------- + +DOCUMENT_EXTENSIONS: frozenset[str] = ( + DOCLING_DOCUMENT_EXTENSIONS + | LLAMAPARSE_DOCUMENT_EXTENSIONS + | UNSTRUCTURED_DOCUMENT_EXTENSIONS +) + +_SERVICE_MAP: dict[str, frozenset[str]] = { + "DOCLING": DOCLING_DOCUMENT_EXTENSIONS, + "LLAMACLOUD": LLAMAPARSE_DOCUMENT_EXTENSIONS, + "UNSTRUCTURED": UNSTRUCTURED_DOCUMENT_EXTENSIONS, +} + + +def get_document_extensions_for_service(etl_service: str | None) -> frozenset[str]: + """Return the document extensions supported by *etl_service*. + + Falls back to the full union when the service is ``None`` or unknown. + """ + return _SERVICE_MAP.get(etl_service or "", DOCUMENT_EXTENSIONS) + def is_supported_document_extension(filename: str) -> bool: """Return True if the file's extension is in the supported document set.""" diff --git a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py index 7a828b9c4..8572fa8ea 100644 --- a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py +++ b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py @@ -261,6 +261,8 @@ def full_scan_mocks(mock_dropbox_client, monkeypatch): skip_results: dict[str, tuple[bool, str | None]] = {} + monkeypatch.setattr("app.config.config.ETL_SERVICE", "LLAMACLOUD") + async def _fake_skip(session, file, search_space_id): from app.connectors.dropbox.file_types import should_skip_file as _skip if _skip(file): diff --git a/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py b/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py index 5480d8c8a..e092872c5 100644 --- a/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py +++ b/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py @@ -7,6 +7,11 @@ from app.connectors.dropbox.file_types import should_skip_file pytestmark = pytest.mark.unit +# --------------------------------------------------------------------------- +# Structural skips (independent of ETL service) +# --------------------------------------------------------------------------- + + def test_folder_item_is_skipped(): item = {".tag": "folder", "name": "My Folder"} assert should_skip_file(item) is True @@ -22,13 +27,18 @@ def test_non_downloadable_item_is_skipped(): assert should_skip_file(item) is True +# --------------------------------------------------------------------------- +# Extension-based skips (require ETL service context) +# --------------------------------------------------------------------------- + + @pytest.mark.parametrize( "filename", [ "archive.zip", "backup.tar", "data.gz", "stuff.rar", "pack.7z", "program.exe", "lib.dll", "module.so", "image.dmg", "disk.iso", "movie.mov", "clip.avi", "video.mkv", "film.wmv", "stream.flv", - "icon.svg", "anim.gif", "photo.webp", "shot.heic", "favicon.ico", + "favicon.ico", "raw.cr2", "photo.nef", "image.arw", "pic.dng", "design.psd", "vector.ai", "mockup.sketch", "proto.fig", "font.ttf", "font.otf", "font.woff", "font.woff2", @@ -36,7 +46,8 @@ def test_non_downloadable_item_is_skipped(): "local.db", "data.sqlite", "access.mdb", ], ) -def test_non_parseable_extensions_are_skipped(filename): +def test_non_parseable_extensions_are_skipped(filename, mocker): + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") item = {".tag": "file", "name": filename} assert should_skip_file(item) is True, f"{filename} should be skipped" @@ -45,29 +56,61 @@ def test_non_parseable_extensions_are_skipped(filename): "filename", [ "report.pdf", "document.docx", "sheet.xlsx", "slides.pptx", - "old.doc", "legacy.xls", "deck.ppt", "readme.txt", "data.csv", "page.html", "notes.md", "config.json", "feed.xml", ], ) -def test_parseable_documents_are_not_skipped(filename): - item = {".tag": "file", "name": filename} - assert should_skip_file(item) is False, f"{filename} should NOT be skipped" +def test_parseable_documents_are_not_skipped(filename, mocker): + """Files in plaintext/direct_convert/universal document sets are never skipped.""" + for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"): + mocker.patch("app.config.config.ETL_SERVICE", service) + item = {".tag": "file", "name": filename} + assert should_skip_file(item) is False, ( + f"{filename} should NOT be skipped with {service}" + ) @pytest.mark.parametrize( "filename", ["photo.jpg", "image.jpeg", "screenshot.png", "scan.bmp", "page.tiff", "doc.tif"], ) -def test_universal_images_are_not_skipped(filename): - item = {".tag": "file", "name": filename} - assert should_skip_file(item) is False, f"{filename} should NOT be skipped" +def test_universal_images_are_not_skipped(filename, mocker): + """Images supported by all parsers are never skipped.""" + for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"): + mocker.patch("app.config.config.ETL_SERVICE", service) + item = {".tag": "file", "name": filename} + assert should_skip_file(item) is False, ( + f"{filename} should NOT be skipped with {service}" + ) -@pytest.mark.parametrize( - "filename", - ["icon.svg", "anim.gif", "photo.webp", "live.heic"], -) -def test_non_universal_images_are_skipped(filename): +@pytest.mark.parametrize("filename,service,expected_skip", [ + ("old.doc", "DOCLING", True), + ("old.doc", "LLAMACLOUD", False), + ("old.doc", "UNSTRUCTURED", False), + ("legacy.xls", "DOCLING", True), + ("legacy.xls", "LLAMACLOUD", False), + ("legacy.xls", "UNSTRUCTURED", False), + ("deck.ppt", "DOCLING", True), + ("deck.ppt", "LLAMACLOUD", False), + ("deck.ppt", "UNSTRUCTURED", False), + ("icon.svg", "DOCLING", True), + ("icon.svg", "LLAMACLOUD", False), + ("anim.gif", "DOCLING", True), + ("anim.gif", "LLAMACLOUD", False), + ("photo.webp", "DOCLING", False), + ("photo.webp", "LLAMACLOUD", False), + ("photo.webp", "UNSTRUCTURED", True), + ("live.heic", "DOCLING", True), + ("live.heic", "UNSTRUCTURED", False), + ("macro.docm", "DOCLING", True), + ("macro.docm", "LLAMACLOUD", False), + ("mail.eml", "DOCLING", True), + ("mail.eml", "UNSTRUCTURED", False), +]) +def test_parser_specific_extensions(filename, service, expected_skip, mocker): + mocker.patch("app.config.config.ETL_SERVICE", service) item = {".tag": "file", "name": filename} - assert should_skip_file(item) is True, f"{filename} should be skipped" + assert should_skip_file(item) is expected_skip, ( + f"{filename} with {service}: expected skip={expected_skip}" + ) diff --git a/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py b/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py index adbad74c2..4ed7eb4db 100644 --- a/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py +++ b/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py @@ -10,13 +10,38 @@ pytestmark = pytest.mark.unit @pytest.mark.parametrize("filename", [ "malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend", ]) -def test_unsupported_extensions_are_skipped(filename): - assert should_skip_by_extension(filename) is True +def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mocker): + """Truly unsupported files are skipped no matter which ETL service is configured.""" + for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"): + mocker.patch("app.config.config.ETL_SERVICE", service) + assert should_skip_by_extension(filename) is True @pytest.mark.parametrize("filename", [ "report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx", "readme.txt", "data.csv", "photo.png", "notes.md", ]) -def test_parseable_extensions_are_not_skipped(filename): - assert should_skip_by_extension(filename) is False +def test_universal_extensions_are_not_skipped(filename, mocker): + """Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped.""" + for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"): + mocker.patch("app.config.config.ETL_SERVICE", service) + assert should_skip_by_extension(filename) is False, ( + f"{filename} should NOT be skipped with {service}" + ) + + +@pytest.mark.parametrize("filename,service,expected_skip", [ + ("macro.docm", "DOCLING", True), + ("macro.docm", "LLAMACLOUD", False), + ("mail.eml", "DOCLING", True), + ("mail.eml", "UNSTRUCTURED", False), + ("photo.gif", "DOCLING", True), + ("photo.gif", "LLAMACLOUD", False), + ("photo.heic", "UNSTRUCTURED", False), + ("photo.heic", "DOCLING", True), +]) +def test_parser_specific_extensions(filename, service, expected_skip, mocker): + mocker.patch("app.config.config.ETL_SERVICE", service) + assert should_skip_by_extension(filename) is expected_skip, ( + f"{filename} with {service}: expected skip={expected_skip}" + ) diff --git a/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py b/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py index a2491257d..e73f799e2 100644 --- a/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py +++ b/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py @@ -7,6 +7,11 @@ from app.connectors.onedrive.file_types import should_skip_file pytestmark = pytest.mark.unit +# --------------------------------------------------------------------------- +# Structural skips (independent of ETL service) +# --------------------------------------------------------------------------- + + def test_folder_is_skipped(): item = {"folder": {}, "name": "My Folder"} assert should_skip_file(item) is True @@ -27,10 +32,16 @@ def test_onenote_is_skipped(): assert should_skip_file(item) is True +# --------------------------------------------------------------------------- +# Extension-based skips (require ETL service context) +# --------------------------------------------------------------------------- + + @pytest.mark.parametrize("filename", [ "malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend", ]) -def test_unsupported_extensions_are_skipped(filename): +def test_unsupported_extensions_are_skipped(filename, mocker): + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") item = {"name": filename, "file": {"mimeType": "application/octet-stream"}} assert should_skip_file(item) is True, f"{filename} should be skipped" @@ -39,6 +50,26 @@ def test_unsupported_extensions_are_skipped(filename): "report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx", "readme.txt", "data.csv", "photo.png", "notes.md", ]) -def test_parseable_files_are_not_skipped(filename): +def test_universal_files_are_not_skipped(filename, mocker): + for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"): + mocker.patch("app.config.config.ETL_SERVICE", service) + item = {"name": filename, "file": {"mimeType": "application/octet-stream"}} + assert should_skip_file(item) is False, ( + f"{filename} should NOT be skipped with {service}" + ) + + +@pytest.mark.parametrize("filename,service,expected_skip", [ + ("macro.docm", "DOCLING", True), + ("macro.docm", "LLAMACLOUD", False), + ("mail.eml", "DOCLING", True), + ("mail.eml", "UNSTRUCTURED", False), + ("photo.heic", "UNSTRUCTURED", False), + ("photo.heic", "DOCLING", True), +]) +def test_parser_specific_extensions(filename, service, expected_skip, mocker): + mocker.patch("app.config.config.ETL_SERVICE", service) item = {"name": filename, "file": {"mimeType": "application/octet-stream"}} - assert should_skip_file(item) is False, f"{filename} should NOT be skipped" + assert should_skip_file(item) is expected_skip, ( + f"{filename} with {service}: expected skip={expected_skip}" + ) diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py index facf15eab..e90847e3a 100644 --- a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py +++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py @@ -377,3 +377,72 @@ async def test_extract_zip_raises_unsupported_error(tmp_path): await EtlPipelineService().extract( EtlRequest(file_path=str(zip_file), filename="archive.zip") ) + + +# --------------------------------------------------------------------------- +# Slice 14 – should_skip_for_service (per-parser document filtering) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("filename,etl_service,expected_skip", [ + ("file.eml", "DOCLING", True), + ("file.eml", "UNSTRUCTURED", False), + ("file.docm", "LLAMACLOUD", False), + ("file.docm", "DOCLING", True), + ("file.txt", "DOCLING", False), + ("file.csv", "LLAMACLOUD", False), + ("file.mp3", "UNSTRUCTURED", False), + ("file.exe", "LLAMACLOUD", True), + ("file.pdf", "DOCLING", False), + ("file.webp", "DOCLING", False), + ("file.webp", "UNSTRUCTURED", True), + ("file.gif", "LLAMACLOUD", False), + ("file.gif", "DOCLING", True), + ("file.heic", "UNSTRUCTURED", False), + ("file.heic", "DOCLING", True), + ("file.svg", "LLAMACLOUD", False), + ("file.svg", "DOCLING", True), + ("file.p7s", "UNSTRUCTURED", False), + ("file.p7s", "LLAMACLOUD", True), +]) +def test_should_skip_for_service(filename, etl_service, expected_skip): + from app.etl_pipeline.file_classifier import should_skip_for_service + + assert should_skip_for_service(filename, etl_service) is expected_skip, ( + f"{filename} with {etl_service}: expected skip={expected_skip}" + ) + + +# --------------------------------------------------------------------------- +# Slice 14b – ETL pipeline rejects per-parser incompatible documents +# --------------------------------------------------------------------------- + + +async def test_extract_docm_with_docling_raises_unsupported(tmp_path, mocker): + """Docling cannot parse .docm -- pipeline should reject before dispatching.""" + from app.etl_pipeline.exceptions import EtlUnsupportedFileError + + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + + docm_file = tmp_path / "macro.docm" + docm_file.write_bytes(b"\x00" * 10) + + with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"): + await EtlPipelineService().extract( + EtlRequest(file_path=str(docm_file), filename="macro.docm") + ) + + +async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker): + """Docling cannot parse .eml -- pipeline should reject before dispatching.""" + from app.etl_pipeline.exceptions import EtlUnsupportedFileError + + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + + eml_file = tmp_path / "mail.eml" + eml_file.write_bytes(b"From: test@example.com") + + with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"): + await EtlPipelineService().extract( + EtlRequest(file_path=str(eml_file), filename="mail.eml") + ) diff --git a/surfsense_backend/tests/unit/utils/test_file_extensions.py b/surfsense_backend/tests/unit/utils/test_file_extensions.py index a376f44bd..acd8945ce 100644 --- a/surfsense_backend/tests/unit/utils/test_file_extensions.py +++ b/surfsense_backend/tests/unit/utils/test_file_extensions.py @@ -21,10 +21,17 @@ def test_exe_is_not_supported_document(): "report.pdf", "doc.docx", "old.doc", "sheet.xlsx", "legacy.xls", "slides.pptx", "deck.ppt", + "macro.docm", "macro.xlsm", "macro.pptm", "photo.png", "photo.jpg", "photo.jpeg", "scan.bmp", "scan.tiff", "scan.tif", + "photo.webp", "anim.gif", "iphone.heic", "manual.rtf", "book.epub", "letter.odt", "data.ods", "presentation.odp", - "korean.hwpx", + "inbox.eml", "outlook.msg", + "korean.hwpx", "korean.hwp", + "template.dot", "template.dotm", + "template.pot", "template.potx", + "binary.xlsb", "workspace.xlw", + "vector.svg", "signature.p7s", ]) def test_document_extensions_are_supported(filename): from app.utils.file_extensions import is_supported_document_extension @@ -40,3 +47,70 @@ def test_non_document_extensions_are_not_supported(filename): from app.utils.file_extensions import is_supported_document_extension assert is_supported_document_extension(filename) is False, f"{filename} should NOT be supported" + + +# --------------------------------------------------------------------------- +# Per-parser extension sets +# --------------------------------------------------------------------------- + + +def test_union_equals_all_three_sets(): + from app.utils.file_extensions import ( + DOCLING_DOCUMENT_EXTENSIONS, + DOCUMENT_EXTENSIONS, + LLAMAPARSE_DOCUMENT_EXTENSIONS, + UNSTRUCTURED_DOCUMENT_EXTENSIONS, + ) + + expected = ( + DOCLING_DOCUMENT_EXTENSIONS + | LLAMAPARSE_DOCUMENT_EXTENSIONS + | UNSTRUCTURED_DOCUMENT_EXTENSIONS + ) + assert DOCUMENT_EXTENSIONS == expected + + +def test_get_extensions_for_docling(): + from app.utils.file_extensions import get_document_extensions_for_service + + exts = get_document_extensions_for_service("DOCLING") + assert ".pdf" in exts + assert ".webp" in exts + assert ".docx" in exts + assert ".eml" not in exts + assert ".docm" not in exts + assert ".gif" not in exts + assert ".heic" not in exts + + +def test_get_extensions_for_llamacloud(): + from app.utils.file_extensions import get_document_extensions_for_service + + exts = get_document_extensions_for_service("LLAMACLOUD") + assert ".docm" in exts + assert ".gif" in exts + assert ".svg" in exts + assert ".hwp" in exts + assert ".eml" not in exts + assert ".heic" not in exts + + +def test_get_extensions_for_unstructured(): + from app.utils.file_extensions import get_document_extensions_for_service + + exts = get_document_extensions_for_service("UNSTRUCTURED") + assert ".eml" in exts + assert ".heic" in exts + assert ".p7s" in exts + assert ".docm" not in exts + assert ".gif" not in exts + assert ".svg" not in exts + + +def test_get_extensions_for_none_returns_union(): + from app.utils.file_extensions import ( + DOCUMENT_EXTENSIONS, + get_document_extensions_for_service, + ) + + assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 6b59f8ef6..c8ce195aa 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -85,7 +85,6 @@ const FILE_TYPE_CONFIG: Record> = { "application/rtf": [".rtf"], "application/xml": [".xml"], "application/epub+zip": [".epub"], - "text/html": [".html", ".htm", ".web"], "image/gif": [".gif"], "image/svg+xml": [".svg"], ...audioFileTypes, @@ -472,12 +471,13 @@ export function DocumentUploadTab({ )) ) : ( -
{ - if (!isElectron) fileInputRef.current?.click(); - }} - > + )}
@@ -683,13 +684,17 @@ export function DocumentUploadTab({ -
- {supportedExtensions.map((ext) => ( - - {ext} - - ))} -
+
+ {supportedExtensions.map((ext) => ( + + {ext} + + ))} +