refactor: unify file skipping logic across Dropbox, Google Drive, and OneDrive connectors by replacing classification checks with a centralized service-based approach, enhancing maintainability and consistency in file handling

This commit is contained in:
Anish Sarkar 2026-04-07 02:19:31 +05:30
parent f03bf05aaa
commit e7beeb2a36
13 changed files with 388 additions and 67 deletions

View file

@ -1,6 +1,6 @@
"""File type handlers for Dropbox.""" """File type handlers for Dropbox."""
from app.etl_pipeline.file_classifier import FileCategory, classify_file from app.etl_pipeline.file_classifier import should_skip_for_service
PAPER_EXTENSION = ".paper" PAPER_EXTENSION = ".paper"
@ -53,5 +53,7 @@ def should_skip_file(item: dict) -> bool:
return False return False
if not item.get("is_downloadable", True): if not item.get("is_downloadable", True):
return True return True
from app.config import config as app_config
name = item.get("name", "") name = item.get("name", "")
return classify_file(name) == FileCategory.UNSUPPORTED return should_skip_for_service(name, app_config.ETL_SERVICE)

View file

@ -1,6 +1,6 @@
"""File type handlers for Google Drive.""" """File type handlers for Google Drive."""
from app.etl_pipeline.file_classifier import FileCategory, classify_file from app.etl_pipeline.file_classifier import should_skip_for_service
GOOGLE_DOC = "application/vnd.google-apps.document" GOOGLE_DOC = "application/vnd.google-apps.document"
GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet" GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet"
@ -49,8 +49,10 @@ def should_skip_file(mime_type: str) -> bool:
def should_skip_by_extension(filename: str) -> bool: def should_skip_by_extension(filename: str) -> bool:
"""Return True if the file extension is not parseable by any ETL pipeline.""" """Return True if the file extension is not parseable by the configured ETL service."""
return classify_file(filename) == FileCategory.UNSUPPORTED from app.config import config as app_config
return should_skip_for_service(filename, app_config.ETL_SERVICE)
def get_export_mime_type(mime_type: str) -> str | None: def get_export_mime_type(mime_type: str) -> str | None:

View file

@ -1,6 +1,6 @@
"""File type handlers for Microsoft OneDrive.""" """File type handlers for Microsoft OneDrive."""
from app.etl_pipeline.file_classifier import FileCategory, classify_file from app.etl_pipeline.file_classifier import should_skip_for_service
ONEDRIVE_FOLDER_FACET = "folder" ONEDRIVE_FOLDER_FACET = "folder"
ONENOTE_MIME = "application/msonenote" ONENOTE_MIME = "application/msonenote"
@ -51,5 +51,7 @@ def should_skip_file(item: dict) -> bool:
mime = item.get("file", {}).get("mimeType", "") mime = item.get("file", {}).get("mimeType", "")
if mime in SKIP_MIME_TYPES: if mime in SKIP_MIME_TYPES:
return True return True
from app.config import config as app_config
name = item.get("name", "") name = item.get("name", "")
return classify_file(name) == FileCategory.UNSUPPORTED return should_skip_for_service(name, app_config.ETL_SERVICE)

View file

@ -45,6 +45,10 @@ class EtlPipelineService:
return await self._extract_document(request) return await self._extract_document(request)
async def _extract_document(self, request: EtlRequest) -> EtlResult: async def _extract_document(self, request: EtlRequest) -> EtlResult:
from pathlib import PurePosixPath
from app.utils.file_extensions import get_document_extensions_for_service
etl_service = app_config.ETL_SERVICE etl_service = app_config.ETL_SERVICE
if not etl_service: if not etl_service:
raise EtlServiceUnavailableError( raise EtlServiceUnavailableError(
@ -52,6 +56,13 @@ class EtlPipelineService:
"Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env" "Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env"
) )
ext = PurePosixPath(request.filename).suffix.lower()
supported = get_document_extensions_for_service(etl_service)
if ext not in supported:
raise EtlUnsupportedFileError(
f"File type {ext} is not supported by {etl_service}"
)
if etl_service == "DOCLING": if etl_service == "DOCLING":
from app.etl_pipeline.parsers.docling import parse_with_docling from app.etl_pipeline.parsers.docling import parse_with_docling

View file

@ -1,7 +1,7 @@
from enum import Enum from enum import Enum
from pathlib import PurePosixPath from pathlib import PurePosixPath
from app.utils.file_extensions import DOCUMENT_EXTENSIONS from app.utils.file_extensions import DOCUMENT_EXTENSIONS, get_document_extensions_for_service
PLAINTEXT_EXTENSIONS = frozenset( PLAINTEXT_EXTENSIONS = frozenset(
{ {
@ -29,7 +29,7 @@ AUDIO_EXTENSIONS = frozenset(
{".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"} {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"}
) )
DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm"}) DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm", ".xhtml"})
class FileCategory(Enum): class FileCategory(Enum):
@ -51,3 +51,18 @@ def classify_file(filename: str) -> FileCategory:
if suffix in DOCUMENT_EXTENSIONS: if suffix in DOCUMENT_EXTENSIONS:
return FileCategory.DOCUMENT return FileCategory.DOCUMENT
return FileCategory.UNSUPPORTED return FileCategory.UNSUPPORTED
def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
"""Return True if *filename* cannot be processed by *etl_service*.
Plaintext, audio, and direct-convert files are parser-agnostic and never
skipped. Document files are checked against the per-parser extension set.
"""
category = classify_file(filename)
if category == FileCategory.UNSUPPORTED:
return True
if category == FileCategory.DOCUMENT:
suffix = PurePosixPath(filename).suffix.lower()
return suffix not in get_document_extensions_for_service(etl_service)
return False

View file

@ -1,29 +1,69 @@
"""Allowlist of document extensions the ETL parsers can handle. """Per-parser document extension sets for the ETL pipeline.
Every consumer (file_classifier, connector-level skip checks) imports from Every consumer (file_classifier, connector-level skip checks, ETL pipeline
here so there is a single source of truth. Extensions already covered by validation) imports from here so there is a single source of truth.
PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or DIRECT_CONVERT_EXTENSIONS in
file_classifier are NOT repeated here -- this set is exclusively for the Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or
"document" ETL path (Docling / LlamaParse / Unstructured). DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
sets are exclusively for the "document" ETL path (Docling / LlamaParse /
Unstructured).
""" """
from pathlib import PurePosixPath from pathlib import PurePosixPath
DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({ # ---------------------------------------------------------------------------
# PDF # Per-parser document extension sets (from official documentation)
# ---------------------------------------------------------------------------
DOCLING_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
".pdf", ".pdf",
# Microsoft Office ".docx", ".xlsx", ".pptx",
".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt", ".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp",
# Images (raster: OCR / vision parsing)
".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif",
# Rich text / e-book
".rtf", ".epub",
# OpenDocument
".odt", ".ods", ".odp",
# Other (LlamaParse / Unstructured specific)
".hwpx",
}) })
LLAMAPARSE_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
".pdf",
".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
".docm", ".dot", ".dotm", ".pptm", ".pot", ".potx",
".xlsm", ".xlsb", ".xlw",
".rtf", ".epub",
".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".tif", ".webp", ".svg",
".odt", ".ods", ".odp",
".hwp", ".hwpx",
})
UNSTRUCTURED_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
".pdf",
".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif", ".heic",
".rtf", ".epub", ".odt",
".eml", ".msg", ".p7s",
})
# ---------------------------------------------------------------------------
# Union (used by classify_file for routing) + service lookup
# ---------------------------------------------------------------------------
DOCUMENT_EXTENSIONS: frozenset[str] = (
DOCLING_DOCUMENT_EXTENSIONS
| LLAMAPARSE_DOCUMENT_EXTENSIONS
| UNSTRUCTURED_DOCUMENT_EXTENSIONS
)
_SERVICE_MAP: dict[str, frozenset[str]] = {
"DOCLING": DOCLING_DOCUMENT_EXTENSIONS,
"LLAMACLOUD": LLAMAPARSE_DOCUMENT_EXTENSIONS,
"UNSTRUCTURED": UNSTRUCTURED_DOCUMENT_EXTENSIONS,
}
def get_document_extensions_for_service(etl_service: str | None) -> frozenset[str]:
"""Return the document extensions supported by *etl_service*.
Falls back to the full union when the service is ``None`` or unknown.
"""
return _SERVICE_MAP.get(etl_service or "", DOCUMENT_EXTENSIONS)
def is_supported_document_extension(filename: str) -> bool: def is_supported_document_extension(filename: str) -> bool:
"""Return True if the file's extension is in the supported document set.""" """Return True if the file's extension is in the supported document set."""

View file

@ -261,6 +261,8 @@ def full_scan_mocks(mock_dropbox_client, monkeypatch):
skip_results: dict[str, tuple[bool, str | None]] = {} skip_results: dict[str, tuple[bool, str | None]] = {}
monkeypatch.setattr("app.config.config.ETL_SERVICE", "LLAMACLOUD")
async def _fake_skip(session, file, search_space_id): async def _fake_skip(session, file, search_space_id):
from app.connectors.dropbox.file_types import should_skip_file as _skip from app.connectors.dropbox.file_types import should_skip_file as _skip
if _skip(file): if _skip(file):

View file

@ -7,6 +7,11 @@ from app.connectors.dropbox.file_types import should_skip_file
pytestmark = pytest.mark.unit pytestmark = pytest.mark.unit
# ---------------------------------------------------------------------------
# Structural skips (independent of ETL service)
# ---------------------------------------------------------------------------
def test_folder_item_is_skipped(): def test_folder_item_is_skipped():
item = {".tag": "folder", "name": "My Folder"} item = {".tag": "folder", "name": "My Folder"}
assert should_skip_file(item) is True assert should_skip_file(item) is True
@ -22,13 +27,18 @@ def test_non_downloadable_item_is_skipped():
assert should_skip_file(item) is True assert should_skip_file(item) is True
# ---------------------------------------------------------------------------
# Extension-based skips (require ETL service context)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize( @pytest.mark.parametrize(
"filename", "filename",
[ [
"archive.zip", "backup.tar", "data.gz", "stuff.rar", "pack.7z", "archive.zip", "backup.tar", "data.gz", "stuff.rar", "pack.7z",
"program.exe", "lib.dll", "module.so", "image.dmg", "disk.iso", "program.exe", "lib.dll", "module.so", "image.dmg", "disk.iso",
"movie.mov", "clip.avi", "video.mkv", "film.wmv", "stream.flv", "movie.mov", "clip.avi", "video.mkv", "film.wmv", "stream.flv",
"icon.svg", "anim.gif", "photo.webp", "shot.heic", "favicon.ico", "favicon.ico",
"raw.cr2", "photo.nef", "image.arw", "pic.dng", "raw.cr2", "photo.nef", "image.arw", "pic.dng",
"design.psd", "vector.ai", "mockup.sketch", "proto.fig", "design.psd", "vector.ai", "mockup.sketch", "proto.fig",
"font.ttf", "font.otf", "font.woff", "font.woff2", "font.ttf", "font.otf", "font.woff", "font.woff2",
@ -36,7 +46,8 @@ def test_non_downloadable_item_is_skipped():
"local.db", "data.sqlite", "access.mdb", "local.db", "data.sqlite", "access.mdb",
], ],
) )
def test_non_parseable_extensions_are_skipped(filename): def test_non_parseable_extensions_are_skipped(filename, mocker):
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
item = {".tag": "file", "name": filename} item = {".tag": "file", "name": filename}
assert should_skip_file(item) is True, f"{filename} should be skipped" assert should_skip_file(item) is True, f"{filename} should be skipped"
@ -45,29 +56,61 @@ def test_non_parseable_extensions_are_skipped(filename):
"filename", "filename",
[ [
"report.pdf", "document.docx", "sheet.xlsx", "slides.pptx", "report.pdf", "document.docx", "sheet.xlsx", "slides.pptx",
"old.doc", "legacy.xls", "deck.ppt",
"readme.txt", "data.csv", "page.html", "notes.md", "readme.txt", "data.csv", "page.html", "notes.md",
"config.json", "feed.xml", "config.json", "feed.xml",
], ],
) )
def test_parseable_documents_are_not_skipped(filename): def test_parseable_documents_are_not_skipped(filename, mocker):
"""Files in plaintext/direct_convert/universal document sets are never skipped."""
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {".tag": "file", "name": filename} item = {".tag": "file", "name": filename}
assert should_skip_file(item) is False, f"{filename} should NOT be skipped" assert should_skip_file(item) is False, (
f"{filename} should NOT be skipped with {service}"
)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"filename", "filename",
["photo.jpg", "image.jpeg", "screenshot.png", "scan.bmp", "page.tiff", "doc.tif"], ["photo.jpg", "image.jpeg", "screenshot.png", "scan.bmp", "page.tiff", "doc.tif"],
) )
def test_universal_images_are_not_skipped(filename): def test_universal_images_are_not_skipped(filename, mocker):
"""Images supported by all parsers are never skipped."""
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {".tag": "file", "name": filename} item = {".tag": "file", "name": filename}
assert should_skip_file(item) is False, f"{filename} should NOT be skipped" assert should_skip_file(item) is False, (
f"{filename} should NOT be skipped with {service}"
@pytest.mark.parametrize(
"filename",
["icon.svg", "anim.gif", "photo.webp", "live.heic"],
) )
def test_non_universal_images_are_skipped(filename):
@pytest.mark.parametrize("filename,service,expected_skip", [
("old.doc", "DOCLING", True),
("old.doc", "LLAMACLOUD", False),
("old.doc", "UNSTRUCTURED", False),
("legacy.xls", "DOCLING", True),
("legacy.xls", "LLAMACLOUD", False),
("legacy.xls", "UNSTRUCTURED", False),
("deck.ppt", "DOCLING", True),
("deck.ppt", "LLAMACLOUD", False),
("deck.ppt", "UNSTRUCTURED", False),
("icon.svg", "DOCLING", True),
("icon.svg", "LLAMACLOUD", False),
("anim.gif", "DOCLING", True),
("anim.gif", "LLAMACLOUD", False),
("photo.webp", "DOCLING", False),
("photo.webp", "LLAMACLOUD", False),
("photo.webp", "UNSTRUCTURED", True),
("live.heic", "DOCLING", True),
("live.heic", "UNSTRUCTURED", False),
("macro.docm", "DOCLING", True),
("macro.docm", "LLAMACLOUD", False),
("mail.eml", "DOCLING", True),
("mail.eml", "UNSTRUCTURED", False),
])
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {".tag": "file", "name": filename} item = {".tag": "file", "name": filename}
assert should_skip_file(item) is True, f"{filename} should be skipped" assert should_skip_file(item) is expected_skip, (
f"{filename} with {service}: expected skip={expected_skip}"
)

View file

@ -10,7 +10,10 @@ pytestmark = pytest.mark.unit
@pytest.mark.parametrize("filename", [ @pytest.mark.parametrize("filename", [
"malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend", "malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
]) ])
def test_unsupported_extensions_are_skipped(filename): def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mocker):
"""Truly unsupported files are skipped no matter which ETL service is configured."""
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
assert should_skip_by_extension(filename) is True assert should_skip_by_extension(filename) is True
@ -18,5 +21,27 @@ def test_unsupported_extensions_are_skipped(filename):
"report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx", "report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
"readme.txt", "data.csv", "photo.png", "notes.md", "readme.txt", "data.csv", "photo.png", "notes.md",
]) ])
def test_parseable_extensions_are_not_skipped(filename): def test_universal_extensions_are_not_skipped(filename, mocker):
assert should_skip_by_extension(filename) is False """Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped."""
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
assert should_skip_by_extension(filename) is False, (
f"{filename} should NOT be skipped with {service}"
)
@pytest.mark.parametrize("filename,service,expected_skip", [
("macro.docm", "DOCLING", True),
("macro.docm", "LLAMACLOUD", False),
("mail.eml", "DOCLING", True),
("mail.eml", "UNSTRUCTURED", False),
("photo.gif", "DOCLING", True),
("photo.gif", "LLAMACLOUD", False),
("photo.heic", "UNSTRUCTURED", False),
("photo.heic", "DOCLING", True),
])
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
mocker.patch("app.config.config.ETL_SERVICE", service)
assert should_skip_by_extension(filename) is expected_skip, (
f"{filename} with {service}: expected skip={expected_skip}"
)

View file

@ -7,6 +7,11 @@ from app.connectors.onedrive.file_types import should_skip_file
pytestmark = pytest.mark.unit pytestmark = pytest.mark.unit
# ---------------------------------------------------------------------------
# Structural skips (independent of ETL service)
# ---------------------------------------------------------------------------
def test_folder_is_skipped(): def test_folder_is_skipped():
item = {"folder": {}, "name": "My Folder"} item = {"folder": {}, "name": "My Folder"}
assert should_skip_file(item) is True assert should_skip_file(item) is True
@ -27,10 +32,16 @@ def test_onenote_is_skipped():
assert should_skip_file(item) is True assert should_skip_file(item) is True
# ---------------------------------------------------------------------------
# Extension-based skips (require ETL service context)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize("filename", [ @pytest.mark.parametrize("filename", [
"malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend", "malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
]) ])
def test_unsupported_extensions_are_skipped(filename): def test_unsupported_extensions_are_skipped(filename, mocker):
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}} item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
assert should_skip_file(item) is True, f"{filename} should be skipped" assert should_skip_file(item) is True, f"{filename} should be skipped"
@ -39,6 +50,26 @@ def test_unsupported_extensions_are_skipped(filename):
"report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx", "report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
"readme.txt", "data.csv", "photo.png", "notes.md", "readme.txt", "data.csv", "photo.png", "notes.md",
]) ])
def test_parseable_files_are_not_skipped(filename): def test_universal_files_are_not_skipped(filename, mocker):
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}} item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
assert should_skip_file(item) is False, f"{filename} should NOT be skipped" assert should_skip_file(item) is False, (
f"{filename} should NOT be skipped with {service}"
)
@pytest.mark.parametrize("filename,service,expected_skip", [
("macro.docm", "DOCLING", True),
("macro.docm", "LLAMACLOUD", False),
("mail.eml", "DOCLING", True),
("mail.eml", "UNSTRUCTURED", False),
("photo.heic", "UNSTRUCTURED", False),
("photo.heic", "DOCLING", True),
])
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
assert should_skip_file(item) is expected_skip, (
f"{filename} with {service}: expected skip={expected_skip}"
)

View file

@ -377,3 +377,72 @@ async def test_extract_zip_raises_unsupported_error(tmp_path):
await EtlPipelineService().extract( await EtlPipelineService().extract(
EtlRequest(file_path=str(zip_file), filename="archive.zip") EtlRequest(file_path=str(zip_file), filename="archive.zip")
) )
# ---------------------------------------------------------------------------
# Slice 14 — should_skip_for_service (per-parser document filtering)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize("filename,etl_service,expected_skip", [
("file.eml", "DOCLING", True),
("file.eml", "UNSTRUCTURED", False),
("file.docm", "LLAMACLOUD", False),
("file.docm", "DOCLING", True),
("file.txt", "DOCLING", False),
("file.csv", "LLAMACLOUD", False),
("file.mp3", "UNSTRUCTURED", False),
("file.exe", "LLAMACLOUD", True),
("file.pdf", "DOCLING", False),
("file.webp", "DOCLING", False),
("file.webp", "UNSTRUCTURED", True),
("file.gif", "LLAMACLOUD", False),
("file.gif", "DOCLING", True),
("file.heic", "UNSTRUCTURED", False),
("file.heic", "DOCLING", True),
("file.svg", "LLAMACLOUD", False),
("file.svg", "DOCLING", True),
("file.p7s", "UNSTRUCTURED", False),
("file.p7s", "LLAMACLOUD", True),
])
def test_should_skip_for_service(filename, etl_service, expected_skip):
from app.etl_pipeline.file_classifier import should_skip_for_service
assert should_skip_for_service(filename, etl_service) is expected_skip, (
f"{filename} with {etl_service}: expected skip={expected_skip}"
)
# ---------------------------------------------------------------------------
# Slice 14b — ETL pipeline rejects per-parser incompatible documents
# ---------------------------------------------------------------------------
async def test_extract_docm_with_docling_raises_unsupported(tmp_path, mocker):
"""Docling cannot parse .docm -- pipeline should reject before dispatching."""
from app.etl_pipeline.exceptions import EtlUnsupportedFileError
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
docm_file = tmp_path / "macro.docm"
docm_file.write_bytes(b"\x00" * 10)
with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"):
await EtlPipelineService().extract(
EtlRequest(file_path=str(docm_file), filename="macro.docm")
)
async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker):
"""Docling cannot parse .eml -- pipeline should reject before dispatching."""
from app.etl_pipeline.exceptions import EtlUnsupportedFileError
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
eml_file = tmp_path / "mail.eml"
eml_file.write_bytes(b"From: test@example.com")
with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"):
await EtlPipelineService().extract(
EtlRequest(file_path=str(eml_file), filename="mail.eml")
)

View file

@ -21,10 +21,17 @@ def test_exe_is_not_supported_document():
"report.pdf", "doc.docx", "old.doc", "report.pdf", "doc.docx", "old.doc",
"sheet.xlsx", "legacy.xls", "sheet.xlsx", "legacy.xls",
"slides.pptx", "deck.ppt", "slides.pptx", "deck.ppt",
"macro.docm", "macro.xlsm", "macro.pptm",
"photo.png", "photo.jpg", "photo.jpeg", "scan.bmp", "scan.tiff", "scan.tif", "photo.png", "photo.jpg", "photo.jpeg", "scan.bmp", "scan.tiff", "scan.tif",
"photo.webp", "anim.gif", "iphone.heic",
"manual.rtf", "book.epub", "manual.rtf", "book.epub",
"letter.odt", "data.ods", "presentation.odp", "letter.odt", "data.ods", "presentation.odp",
"korean.hwpx", "inbox.eml", "outlook.msg",
"korean.hwpx", "korean.hwp",
"template.dot", "template.dotm",
"template.pot", "template.potx",
"binary.xlsb", "workspace.xlw",
"vector.svg", "signature.p7s",
]) ])
def test_document_extensions_are_supported(filename): def test_document_extensions_are_supported(filename):
from app.utils.file_extensions import is_supported_document_extension from app.utils.file_extensions import is_supported_document_extension
@ -40,3 +47,70 @@ def test_non_document_extensions_are_not_supported(filename):
from app.utils.file_extensions import is_supported_document_extension from app.utils.file_extensions import is_supported_document_extension
assert is_supported_document_extension(filename) is False, f"{filename} should NOT be supported" assert is_supported_document_extension(filename) is False, f"{filename} should NOT be supported"
# ---------------------------------------------------------------------------
# Per-parser extension sets
# ---------------------------------------------------------------------------
def test_union_equals_all_three_sets():
from app.utils.file_extensions import (
DOCLING_DOCUMENT_EXTENSIONS,
DOCUMENT_EXTENSIONS,
LLAMAPARSE_DOCUMENT_EXTENSIONS,
UNSTRUCTURED_DOCUMENT_EXTENSIONS,
)
expected = (
DOCLING_DOCUMENT_EXTENSIONS
| LLAMAPARSE_DOCUMENT_EXTENSIONS
| UNSTRUCTURED_DOCUMENT_EXTENSIONS
)
assert DOCUMENT_EXTENSIONS == expected
def test_get_extensions_for_docling():
from app.utils.file_extensions import get_document_extensions_for_service
exts = get_document_extensions_for_service("DOCLING")
assert ".pdf" in exts
assert ".webp" in exts
assert ".docx" in exts
assert ".eml" not in exts
assert ".docm" not in exts
assert ".gif" not in exts
assert ".heic" not in exts
def test_get_extensions_for_llamacloud():
from app.utils.file_extensions import get_document_extensions_for_service
exts = get_document_extensions_for_service("LLAMACLOUD")
assert ".docm" in exts
assert ".gif" in exts
assert ".svg" in exts
assert ".hwp" in exts
assert ".eml" not in exts
assert ".heic" not in exts
def test_get_extensions_for_unstructured():
from app.utils.file_extensions import get_document_extensions_for_service
exts = get_document_extensions_for_service("UNSTRUCTURED")
assert ".eml" in exts
assert ".heic" in exts
assert ".p7s" in exts
assert ".docm" not in exts
assert ".gif" not in exts
assert ".svg" not in exts
def test_get_extensions_for_none_returns_union():
from app.utils.file_extensions import (
DOCUMENT_EXTENSIONS,
get_document_extensions_for_service,
)
assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS

View file

@ -85,7 +85,6 @@ const FILE_TYPE_CONFIG: Record<string, Record<string, string[]>> = {
"application/rtf": [".rtf"], "application/rtf": [".rtf"],
"application/xml": [".xml"], "application/xml": [".xml"],
"application/epub+zip": [".epub"], "application/epub+zip": [".epub"],
"text/html": [".html", ".htm", ".web"],
"image/gif": [".gif"], "image/gif": [".gif"],
"image/svg+xml": [".svg"], "image/svg+xml": [".svg"],
...audioFileTypes, ...audioFileTypes,
@ -472,8 +471,9 @@ export function DocumentUploadTab({
</button> </button>
)) ))
) : ( ) : (
<div <button
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer" type="button"
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent border-none"
onClick={() => { onClick={() => {
if (!isElectron) fileInputRef.current?.click(); if (!isElectron) fileInputRef.current?.click();
}} }}
@ -485,10 +485,11 @@ export function DocumentUploadTab({
</p> </p>
<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p> <p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
</div> </div>
<div className="w-full mt-1" onClick={(e) => e.stopPropagation()}> {/* biome-ignore lint/a11y/useSemanticElements: wrapper to stop click propagation to parent button */}
<div className="w-full mt-1" onClick={(e) => e.stopPropagation()} onKeyDown={(e) => e.stopPropagation()} role="group">
{renderBrowseButton({ fullWidth: true })} {renderBrowseButton({ fullWidth: true })}
</div> </div>
</div> </button>
)} )}
</div> </div>
@ -683,9 +684,13 @@ export function DocumentUploadTab({
</span> </span>
</AccordionTrigger> </AccordionTrigger>
<AccordionContent className="px-3 pb-3"> <AccordionContent className="px-3 pb-3">
<div className="flex flex-wrap gap-1"> <div className="flex flex-wrap gap-1.5">
{supportedExtensions.map((ext) => ( {supportedExtensions.map((ext) => (
<Badge key={ext} variant="outline" className="text-[10px] px-1.5 py-0"> <Badge
key={ext}
variant="secondary"
className="rounded border-0 bg-neutral-200/80 dark:bg-neutral-700/60 text-muted-foreground text-[10px] px-2 py-0.5 font-normal"
>
{ext} {ext}
</Badge> </Badge>
))} ))}