refactor: implement file type classification for supported extensions across Dropbox, Google Drive, and OneDrive connectors, enhancing file handling and error management

2026-07-02 22:01:05 +02:00 · 2026-04-06 22:03:47 +05:30 · 2026-04-06 22:03:47 +05:30 · dc7047f64d
commit dc7047f64d
parent 47f4be08d9
14 changed files with 250 additions and 27 deletions
--- a/surfsense_backend/app/utils/file_extensions.py
+++ b/surfsense_backend/app/utils/file_extensions.py
@ -0,0 +1,31 @@
+"""Allowlist of document extensions the ETL parsers can handle.
+
+Every consumer (file_classifier, connector-level skip checks) imports from
+here so there is a single source of truth.  Extensions already covered by
+PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or DIRECT_CONVERT_EXTENSIONS in
+file_classifier are NOT repeated here -- this set is exclusively for the
+"document" ETL path (Docling / LlamaParse / Unstructured).
+"""
+
+from pathlib import PurePosixPath
+
+DOCUMENT_EXTENSIONS: frozenset[str] = frozenset((
+    # Portable Document Format
+    ".pdf",
+    # Microsoft Office (legacy binary and OOXML variants)
+    ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
+    # Raster images -- routed through OCR / vision parsing
+    ".bmp", ".jpeg", ".jpg", ".png", ".tif", ".tiff",
+    # E-book / rich text
+    ".epub", ".rtf",
+    # OpenDocument family
+    ".odp", ".ods", ".odt",
+    # Other (LlamaParse / Unstructured specific)
+    ".hwpx",
+))
+
+
+def is_supported_document_extension(filename: str) -> bool:
+    """Check, ignoring case, whether filename's suffix is in DOCUMENT_EXTENSIONS."""
+    extension = PurePosixPath(filename).suffix
+    return extension.lower() in DOCUMENT_EXTENSIONS