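refactor: unify file-skipping logic across the Dropbox, Google Drive, and OneDrive connectors by replacing the parser-agnostic classify_file checks with a centralized should_skip_for_service helper that checks each file against the extensions supported by the configured ETL service, enhancing maintainability and consistency in file handling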

2026-05-12 01:02:39 +02:00 · 2026-04-07 02:19:31 +05:30 · 2026-04-07 02:19:31 +05:30 · e7beeb2a36
commit e7beeb2a36
parent f03bf05aaa
13 changed files with 388 additions and 67 deletions
--- a/surfsense_backend/app/connectors/dropbox/file_types.py
+++ b/surfsense_backend/app/connectors/dropbox/file_types.py
@ -1,6 +1,6 @@
 """File type handlers for Dropbox."""

-from app.etl_pipeline.file_classifier import FileCategory, classify_file
+from app.etl_pipeline.file_classifier import should_skip_for_service

 PAPER_EXTENSION = ".paper"

@ -53,5 +53,7 @@ def should_skip_file(item: dict) -> bool:
        return False
    if not item.get("is_downloadable", True):
        return True
+    from app.config import config as app_config
+
    name = item.get("name", "")
-    return classify_file(name) == FileCategory.UNSUPPORTED
+    return should_skip_for_service(name, app_config.ETL_SERVICE)
--- a/surfsense_backend/app/connectors/google_drive/file_types.py
+++ b/surfsense_backend/app/connectors/google_drive/file_types.py
@ -1,6 +1,6 @@
 """File type handlers for Google Drive."""

-from app.etl_pipeline.file_classifier import FileCategory, classify_file
+from app.etl_pipeline.file_classifier import should_skip_for_service

 GOOGLE_DOC = "application/vnd.google-apps.document"
 GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet"
@ -49,8 +49,10 @@ def should_skip_file(mime_type: str) -> bool:


 def should_skip_by_extension(filename: str) -> bool:
-    """Return True if the file extension is not parseable by any ETL pipeline."""
-    return classify_file(filename) == FileCategory.UNSUPPORTED
+    """Return True if the file extension is not parseable by the configured ETL service."""
+    from app.config import config as app_config
+
+    return should_skip_for_service(filename, app_config.ETL_SERVICE)


 def get_export_mime_type(mime_type: str) -> str | None:
--- a/surfsense_backend/app/connectors/onedrive/file_types.py
+++ b/surfsense_backend/app/connectors/onedrive/file_types.py
@ -1,6 +1,6 @@
 """File type handlers for Microsoft OneDrive."""

-from app.etl_pipeline.file_classifier import FileCategory, classify_file
+from app.etl_pipeline.file_classifier import should_skip_for_service

 ONEDRIVE_FOLDER_FACET = "folder"
 ONENOTE_MIME = "application/msonenote"
@ -51,5 +51,7 @@ def should_skip_file(item: dict) -> bool:
    mime = item.get("file", {}).get("mimeType", "")
    if mime in SKIP_MIME_TYPES:
        return True
+    from app.config import config as app_config
+
    name = item.get("name", "")
-    return classify_file(name) == FileCategory.UNSUPPORTED
+    return should_skip_for_service(name, app_config.ETL_SERVICE)
--- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
+++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
@ -45,6 +45,10 @@ class EtlPipelineService:
        return await self._extract_document(request)

    async def _extract_document(self, request: EtlRequest) -> EtlResult:
+        from pathlib import PurePosixPath
+
+        from app.utils.file_extensions import get_document_extensions_for_service
+
        etl_service = app_config.ETL_SERVICE
        if not etl_service:
            raise EtlServiceUnavailableError(
@ -52,6 +56,13 @@ class EtlPipelineService:
                "Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env"
            )

+        ext = PurePosixPath(request.filename).suffix.lower()
+        supported = get_document_extensions_for_service(etl_service)
+        if ext not in supported:
+            raise EtlUnsupportedFileError(
+                f"File type {ext} is not supported by {etl_service}"
+            )
+
        if etl_service == "DOCLING":
            from app.etl_pipeline.parsers.docling import parse_with_docling

--- a/surfsense_backend/app/etl_pipeline/file_classifier.py
+++ b/surfsense_backend/app/etl_pipeline/file_classifier.py
@ -1,7 +1,7 @@
 from enum import Enum
 from pathlib import PurePosixPath

-from app.utils.file_extensions import DOCUMENT_EXTENSIONS
+from app.utils.file_extensions import DOCUMENT_EXTENSIONS, get_document_extensions_for_service

 PLAINTEXT_EXTENSIONS = frozenset(
    {
@ -29,7 +29,7 @@ AUDIO_EXTENSIONS = frozenset(
    {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"}
 )

-DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm"})
+DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm", ".xhtml"})


 class FileCategory(Enum):
@ -51,3 +51,18 @@ def classify_file(filename: str) -> FileCategory:
    if suffix in DOCUMENT_EXTENSIONS:
        return FileCategory.DOCUMENT
    return FileCategory.UNSUPPORTED
+
+
+def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
+    """Return True if *filename* cannot be processed by *etl_service*.
+
+    Plaintext, audio, and direct-convert files are parser-agnostic and never
+    skipped.  Document files are checked against the per-parser extension set.
+    """
+    category = classify_file(filename)
+    if category == FileCategory.UNSUPPORTED:
+        return True
+    if category == FileCategory.DOCUMENT:
+        suffix = PurePosixPath(filename).suffix.lower()
+        return suffix not in get_document_extensions_for_service(etl_service)
+    return False
--- a/surfsense_backend/app/utils/file_extensions.py
+++ b/surfsense_backend/app/utils/file_extensions.py
@ -1,29 +1,69 @@
-"""Allowlist of document extensions the ETL parsers can handle.
+"""Per-parser document extension sets for the ETL pipeline.

-Every consumer (file_classifier, connector-level skip checks) imports from
-here so there is a single source of truth.  Extensions already covered by
-PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or DIRECT_CONVERT_EXTENSIONS in
-file_classifier are NOT repeated here -- this set is exclusively for the
-"document" ETL path (Docling / LlamaParse / Unstructured).
+Every consumer (file_classifier, connector-level skip checks, ETL pipeline
+validation) imports from here so there is a single source of truth.
+
+Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or
+DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
+sets are exclusively for the "document" ETL path (Docling / LlamaParse /
+Unstructured).
 """

 from pathlib import PurePosixPath

-DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
-    # PDF
+# ---------------------------------------------------------------------------
+# Per-parser document extension sets (from official documentation)
+# ---------------------------------------------------------------------------
+
+DOCLING_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
    ".pdf",
-    # Microsoft Office
-    ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
-    # Images (raster: OCR / vision parsing)
-    ".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif",
-    # Rich text / e-book
-    ".rtf", ".epub",
-    # OpenDocument
-    ".odt", ".ods", ".odp",
-    # Other (LlamaParse / Unstructured specific)
-    ".hwpx",
+    ".docx", ".xlsx", ".pptx",
+    ".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp",
 })

+LLAMAPARSE_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
+    ".pdf",
+    ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
+    ".docm", ".dot", ".dotm", ".pptm", ".pot", ".potx",
+    ".xlsm", ".xlsb", ".xlw",
+    ".rtf", ".epub",
+    ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".tif", ".webp", ".svg",
+    ".odt", ".ods", ".odp",
+    ".hwp", ".hwpx",
+})
+
+UNSTRUCTURED_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
+    ".pdf",
+    ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
+    ".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif", ".heic",
+    ".rtf", ".epub", ".odt",
+    ".eml", ".msg", ".p7s",
+})
+
+# ---------------------------------------------------------------------------
+# Union (used by classify_file for routing) + service lookup
+# ---------------------------------------------------------------------------
+
+DOCUMENT_EXTENSIONS: frozenset[str] = (
+    DOCLING_DOCUMENT_EXTENSIONS
+    | LLAMAPARSE_DOCUMENT_EXTENSIONS
+    | UNSTRUCTURED_DOCUMENT_EXTENSIONS
+)
+
+_SERVICE_MAP: dict[str, frozenset[str]] = {
+    "DOCLING": DOCLING_DOCUMENT_EXTENSIONS,
+    "LLAMACLOUD": LLAMAPARSE_DOCUMENT_EXTENSIONS,
+    "UNSTRUCTURED": UNSTRUCTURED_DOCUMENT_EXTENSIONS,
+}
+
+
+def get_document_extensions_for_service(etl_service: str | None) -> frozenset[str]:
+    """Return the document extensions supported by *etl_service*.
+
+    Falls back to the full union when the service is ``None`` or unknown.
+    """
+    return _SERVICE_MAP.get(etl_service or "", DOCUMENT_EXTENSIONS)
+

 def is_supported_document_extension(filename: str) -> bool:
    """Return True if the file's extension is in the supported document set."""