SurfSense/surfsense_backend/app/utils/file_extensions.py

"""Allowlist of document extensions the ETL parsers can handle.

Every consumer (file_classifier, connector-level skip checks) imports from
here so there is a single source of truth.  Extensions already covered by
PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or DIRECT_CONVERT_EXTENSIONS in
file_classifier are NOT repeated here -- this set is exclusively for the
"document" ETL path (Docling / LlamaParse / Unstructured).
"""

from pathlib import PurePosixPath

DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
    # PDF
    ".pdf",
    # Microsoft Office
    ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
    # Images (raster -- OCR / vision parsing)
    ".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif",
    # Rich text / e-book
    ".rtf", ".epub",
    # OpenDocument
    ".odt", ".ods", ".odp",
    # Other (LlamaParse / Unstructured specific)
    ".hwpx",
})


def is_supported_document_extension(filename: str) -> bool:
    """Return True if the file's extension is in the supported document set."""
    suffix = PurePosixPath(filename).suffix.lower()
    return suffix in DOCUMENT_EXTENSIONS