mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 01:06:23 +02:00
refactor: implement file type classification for supported extensions across Dropbox, Google Drive, and OneDrive connectors, enhancing file handling and error management
This commit is contained in:
parent
47f4be08d9
commit
dc7047f64d
14 changed files with 250 additions and 27 deletions
31
surfsense_backend/app/utils/file_extensions.py
Normal file
31
surfsense_backend/app/utils/file_extensions.py
Normal file
|
|
@@ -0,0 +1,31 @@
|
|||
"""Allowlist of document extensions the ETL parsers can handle.
|
||||
|
||||
Every consumer (file_classifier, connector-level skip checks) imports from
here so there is a single source of truth. Extensions already covered by
PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or DIRECT_CONVERT_EXTENSIONS in
file_classifier are NOT repeated here -- this set is exclusively for the
"document" ETL path (Docling / LlamaParse / Unstructured).
|
||||
"""
|
||||
|
||||
from pathlib import PurePosixPath
|
||||
|
||||
# Allowlist of file extensions routed to the "document" ETL path.
# Every entry is lowercase and includes the leading dot, so callers must
# normalise a suffix (e.g. with ``.lower()``) before testing membership.
DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
    # PDF
    ".pdf",
    # Microsoft Office
    ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
    # Images (raster -- OCR / vision parsing)
    ".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif",
    # Rich text / e-book
    ".rtf", ".epub",
    # OpenDocument
    ".odt", ".ods", ".odp",
    # Other (LlamaParse / Unstructured specific)
    ".hwpx",
})
|
||||
|
||||
|
||||
def is_supported_document_extension(filename: str) -> bool:
    """Return True if the file's extension is in the supported document set.

    The check is case-insensitive and looks only at the final suffix of the
    last path component, so ``"Report.PDF"`` matches while
    ``"archive.pdf.gz"`` does not. A name with no suffix (including a
    dotfile such as ``".pdf"``) yields False.

    Args:
        filename: A bare file name or a ``/``-separated path.

    Returns:
        True when the lowercased suffix is a member of DOCUMENT_EXTENSIONS.
    """
    # PurePosixPath does pure string parsing (no filesystem access) and
    # behaves the same regardless of the host operating system.
    suffix = PurePosixPath(filename).suffix.lower()
    return suffix in DOCUMENT_EXTENSIONS
|
||||
Loading…
Add table
Add a link
Reference in a new issue