refactor: implement file type classification for supported extensions across Dropbox, Google Drive, and OneDrive connectors, enhancing file handling and error management

This commit is contained in:
Anish Sarkar 2026-04-06 22:03:47 +05:30
parent 47f4be08d9
commit dc7047f64d
14 changed files with 250 additions and 27 deletions

View file

@ -1,6 +1,6 @@
from app.config import config as app_config
from app.etl_pipeline.etl_document import EtlRequest, EtlResult
from app.etl_pipeline.exceptions import EtlServiceUnavailableError
from app.etl_pipeline.exceptions import EtlServiceUnavailableError, EtlUnsupportedFileError
from app.etl_pipeline.file_classifier import FileCategory, classify_file
from app.etl_pipeline.parsers.audio import transcribe_audio
from app.etl_pipeline.parsers.direct_convert import convert_file_directly
@ -13,6 +13,11 @@ class EtlPipelineService:
async def extract(self, request: EtlRequest) -> EtlResult:
category = classify_file(request.filename)
if category == FileCategory.UNSUPPORTED:
raise EtlUnsupportedFileError(
f"File type not supported for parsing: {request.filename}"
)
if category == FileCategory.PLAINTEXT:
content = read_plaintext(request.file_path)
return EtlResult(

View file

@ -4,3 +4,7 @@ class EtlParseError(Exception):
class EtlServiceUnavailableError(Exception):
    """Signal that the configured ETL_SERVICE value is not a recognised one."""
class EtlUnsupportedFileError(Exception):
    """Signal that no ETL pipeline is able to parse the file's type."""

View file

@ -1,6 +1,7 @@
from enum import Enum
from pathlib import PurePosixPath
from app.utils.file_extensions import DOCUMENT_EXTENSIONS
PLAINTEXT_EXTENSIONS = frozenset(
{
@ -35,6 +36,7 @@ class FileCategory(Enum):
PLAINTEXT = "plaintext"
AUDIO = "audio"
DIRECT_CONVERT = "direct_convert"
UNSUPPORTED = "unsupported"
DOCUMENT = "document"
@ -46,4 +48,6 @@ def classify_file(filename: str) -> FileCategory:
return FileCategory.AUDIO
if suffix in DIRECT_CONVERT_EXTENSIONS:
return FileCategory.DIRECT_CONVERT
return FileCategory.DOCUMENT
if suffix in DOCUMENT_EXTENSIONS:
return FileCategory.DOCUMENT
return FileCategory.UNSUPPORTED