mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-17 18:35:19 +02:00
refactor: implement file type classification for supported extensions across Dropbox, Google Drive, and OneDrive connectors, enhancing file handling and error management
This commit is contained in:
parent
47f4be08d9
commit
dc7047f64d
14 changed files with 250 additions and 27 deletions
|
|
@ -1,25 +1,8 @@
|
|||
"""File type handlers for Dropbox."""
|
||||
|
||||
PAPER_EXTENSION = ".paper"
|
||||
from app.etl_pipeline.file_classifier import FileCategory, classify_file
|
||||
|
||||
SKIP_EXTENSIONS: frozenset[str] = frozenset({
|
||||
# Non-universal images (not supported by all 3 ETL pipelines)
|
||||
".svg", ".gif", ".webp", ".heic", ".ico",
|
||||
".raw", ".cr2", ".nef", ".arw", ".dng",
|
||||
".psd", ".ai", ".sketch", ".fig",
|
||||
# Video
|
||||
".mov", ".avi", ".mkv", ".wmv", ".flv",
|
||||
# Binaries / executables
|
||||
".exe", ".dll", ".so", ".dylib", ".bin", ".app", ".dmg", ".iso",
|
||||
# Archives
|
||||
".zip", ".tar", ".gz", ".rar", ".7z", ".bz2",
|
||||
# Fonts
|
||||
".ttf", ".otf", ".woff", ".woff2",
|
||||
# 3D / CAD
|
||||
".stl", ".obj", ".fbx", ".blend",
|
||||
# Database
|
||||
".db", ".sqlite", ".mdb",
|
||||
})
|
||||
PAPER_EXTENSION = ".paper"
|
||||
|
||||
MIME_TO_EXTENSION: dict[str, str] = {
|
||||
"application/pdf": ".pdf",
|
||||
|
|
@ -71,5 +54,4 @@ def should_skip_file(item: dict) -> bool:
|
|||
if not item.get("is_downloadable", True):
|
||||
return True
|
||||
name = item.get("name", "")
|
||||
ext = get_extension_from_name(name).lower()
|
||||
return ext in SKIP_EXTENSIONS
|
||||
return classify_file(name) == FileCategory.UNSUPPORTED
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ from .file_types import (
|
|||
get_export_mime_type,
|
||||
get_extension_from_mime,
|
||||
is_google_workspace_file,
|
||||
should_skip_by_extension,
|
||||
should_skip_file,
|
||||
)
|
||||
|
||||
|
|
@ -42,6 +43,9 @@ async def download_and_extract_content(
|
|||
if should_skip_file(mime_type):
|
||||
return None, {}, f"Skipping {mime_type}"
|
||||
|
||||
if should_skip_by_extension(file_name):
|
||||
return None, {}, f"Skipping unsupported extension: {file_name}"
|
||||
|
||||
logger.info(f"Downloading file for content extraction: {file_name} ({mime_type})")
|
||||
|
||||
drive_metadata: dict[str, Any] = {
|
||||
|
|
@ -148,10 +152,12 @@ async def download_and_process_file(
|
|||
file_name = file.get("name", "Unknown")
|
||||
mime_type = file.get("mimeType", "")
|
||||
|
||||
# Skip folders and shortcuts
|
||||
if should_skip_file(mime_type):
|
||||
return None, f"Skipping {mime_type}", None
|
||||
|
||||
if should_skip_by_extension(file_name):
|
||||
return None, f"Skipping unsupported extension: {file_name}", None
|
||||
|
||||
logger.info(f"Downloading file: {file_name} ({mime_type})")
|
||||
|
||||
temp_file_path = None
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
"""File type handlers for Google Drive."""
|
||||
|
||||
from app.etl_pipeline.file_classifier import FileCategory, classify_file
|
||||
|
||||
GOOGLE_DOC = "application/vnd.google-apps.document"
|
||||
GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet"
|
||||
GOOGLE_SLIDE = "application/vnd.google-apps.presentation"
|
||||
|
|
@ -46,6 +48,11 @@ def should_skip_file(mime_type: str) -> bool:
|
|||
return mime_type in [GOOGLE_FOLDER, GOOGLE_SHORTCUT]
|
||||
|
||||
|
||||
def should_skip_by_extension(filename: str) -> bool:
|
||||
"""Return True if the file extension is not parseable by any ETL pipeline."""
|
||||
return classify_file(filename) == FileCategory.UNSUPPORTED
|
||||
|
||||
|
||||
def get_export_mime_type(mime_type: str) -> str | None:
|
||||
"""Get export MIME type for Google Workspace files."""
|
||||
return EXPORT_FORMATS.get(mime_type)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
"""File type handlers for Microsoft OneDrive."""
|
||||
|
||||
from app.etl_pipeline.file_classifier import FileCategory, classify_file
|
||||
|
||||
ONEDRIVE_FOLDER_FACET = "folder"
|
||||
ONENOTE_MIME = "application/msonenote"
|
||||
|
||||
|
|
@ -39,7 +41,7 @@ def is_folder(item: dict) -> bool:
|
|||
|
||||
|
||||
def should_skip_file(item: dict) -> bool:
|
||||
"""Skip folders, OneNote files, remote items (shared links), and packages."""
|
||||
"""Skip folders, OneNote files, remote items (shared links), packages, and unsupported extensions."""
|
||||
if is_folder(item):
|
||||
return True
|
||||
if "remoteItem" in item:
|
||||
|
|
@ -47,4 +49,7 @@ def should_skip_file(item: dict) -> bool:
|
|||
if "package" in item:
|
||||
return True
|
||||
mime = item.get("file", {}).get("mimeType", "")
|
||||
return mime in SKIP_MIME_TYPES
|
||||
if mime in SKIP_MIME_TYPES:
|
||||
return True
|
||||
name = item.get("name", "")
|
||||
return classify_file(name) == FileCategory.UNSUPPORTED
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue