refactor: implement file type classification for supported extensions across Dropbox, Google Drive, and OneDrive connectors, enhancing file handling and error management

This commit is contained in:
Anish Sarkar 2026-04-06 22:03:47 +05:30
parent 47f4be08d9
commit dc7047f64d
14 changed files with 250 additions and 27 deletions

View file

@ -17,6 +17,7 @@ from .file_types import (
get_export_mime_type,
get_extension_from_mime,
is_google_workspace_file,
should_skip_by_extension,
should_skip_file,
)
@ -42,6 +43,9 @@ async def download_and_extract_content(
if should_skip_file(mime_type):
return None, {}, f"Skipping {mime_type}"
if should_skip_by_extension(file_name):
return None, {}, f"Skipping unsupported extension: {file_name}"
logger.info(f"Downloading file for content extraction: {file_name} ({mime_type})")
drive_metadata: dict[str, Any] = {
@ -148,10 +152,12 @@ async def download_and_process_file(
file_name = file.get("name", "Unknown")
mime_type = file.get("mimeType", "")
# Skip folders and shortcuts
if should_skip_file(mime_type):
return None, f"Skipping {mime_type}", None
if should_skip_by_extension(file_name):
return None, f"Skipping unsupported extension: {file_name}", None
logger.info(f"Downloading file: {file_name} ({mime_type})")
temp_file_path = None

View file

@ -1,5 +1,7 @@
"""File type handlers for Google Drive."""
from app.etl_pipeline.file_classifier import FileCategory, classify_file
GOOGLE_DOC = "application/vnd.google-apps.document"
GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet"
GOOGLE_SLIDE = "application/vnd.google-apps.presentation"
@ -46,6 +48,11 @@ def should_skip_file(mime_type: str) -> bool:
return mime_type in [GOOGLE_FOLDER, GOOGLE_SHORTCUT]
def should_skip_by_extension(filename: str) -> bool:
    """Tell whether a file should be skipped based on its name.

    The name is handed to ``classify_file``; the file is skipped exactly
    when the resulting category is ``FileCategory.UNSUPPORTED``.

    Args:
        filename: File name to classify.

    Returns:
        True when ``classify_file`` reports the file as unsupported,
        False otherwise.
    """
    # NOTE(review): how classify_file treats names without an extension is
    # not visible from here. Native Google Workspace items frequently have
    # no extension in their name -- confirm they are not classified as
    # UNSUPPORTED before relying on this check for every Drive file.
    category = classify_file(filename)
    return category == FileCategory.UNSUPPORTED
def get_export_mime_type(mime_type: str) -> str | None:
    """Look up the export MIME type for a Google Workspace file.

    Args:
        mime_type: MIME type of the Drive file.

    Returns:
        The value stored in ``EXPORT_FORMATS`` for ``mime_type``, or
        ``None`` when there is no entry for it.
    """
    export_type = EXPORT_FORMATS.get(mime_type)
    return export_type