refactor: implement file type classification for supported extensions across Dropbox, Google Drive, and OneDrive connectors, enhancing file handling and error management

This commit is contained in:
Anish Sarkar 2026-04-06 22:03:47 +05:30
parent 47f4be08d9
commit dc7047f64d
14 changed files with 250 additions and 27 deletions

View file

@ -1,25 +1,8 @@
"""File type handlers for Dropbox."""
PAPER_EXTENSION = ".paper"
from app.etl_pipeline.file_classifier import FileCategory, classify_file
# File extensions (leading dot included) that the extension-based check in
# should_skip_file rejects via `ext in SKIP_EXTENSIONS`. That check lowercases
# the extension before the lookup, so every entry here must be lowercase.
# NOTE(review): in this view the lookup also appears alongside
# `classify_file(name) == FileCategory.UNSUPPORTED` — confirm which of the two
# is live before adding or removing entries.
SKIP_EXTENSIONS: frozenset[str] = frozenset({
# Non-universal images (not supported by all 3 ETL pipelines)
".svg", ".gif", ".webp", ".heic", ".ico",
".raw", ".cr2", ".nef", ".arw", ".dng",
".psd", ".ai", ".sketch", ".fig",
# Video
".mov", ".avi", ".mkv", ".wmv", ".flv",
# Binaries / executables
".exe", ".dll", ".so", ".dylib", ".bin", ".app", ".dmg", ".iso",
# Archives
".zip", ".tar", ".gz", ".rar", ".7z", ".bz2",
# Fonts
".ttf", ".otf", ".woff", ".woff2",
# 3D / CAD
".stl", ".obj", ".fbx", ".blend",
# Database
".db", ".sqlite", ".mdb",
})
PAPER_EXTENSION = ".paper"
MIME_TO_EXTENSION: dict[str, str] = {
"application/pdf": ".pdf",
@ -71,5 +54,4 @@ def should_skip_file(item: dict) -> bool:
if not item.get("is_downloadable", True):
return True
name = item.get("name", "")
ext = get_extension_from_name(name).lower()
return ext in SKIP_EXTENSIONS
return classify_file(name) == FileCategory.UNSUPPORTED