mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-15 18:25:18 +02:00
refactor: implement file type classification for supported extensions across Dropbox, Google Drive, and OneDrive connectors, enhancing file handling and error management
This commit is contained in:
parent
47f4be08d9
commit
dc7047f64d
14 changed files with 250 additions and 27 deletions
|
|
@ -1,6 +1,6 @@
|
|||
from app.config import config as app_config
|
||||
from app.etl_pipeline.etl_document import EtlRequest, EtlResult
|
||||
from app.etl_pipeline.exceptions import EtlServiceUnavailableError
|
||||
from app.etl_pipeline.exceptions import EtlServiceUnavailableError, EtlUnsupportedFileError
|
||||
from app.etl_pipeline.file_classifier import FileCategory, classify_file
|
||||
from app.etl_pipeline.parsers.audio import transcribe_audio
|
||||
from app.etl_pipeline.parsers.direct_convert import convert_file_directly
|
||||
|
|
@ -13,6 +13,11 @@ class EtlPipelineService:
|
|||
async def extract(self, request: EtlRequest) -> EtlResult:
|
||||
category = classify_file(request.filename)
|
||||
|
||||
if category == FileCategory.UNSUPPORTED:
|
||||
raise EtlUnsupportedFileError(
|
||||
f"File type not supported for parsing: {request.filename}"
|
||||
)
|
||||
|
||||
if category == FileCategory.PLAINTEXT:
|
||||
content = read_plaintext(request.file_path)
|
||||
return EtlResult(
|
||||
|
|
|
|||
|
|
@ -4,3 +4,7 @@ class EtlParseError(Exception):
|
|||
|
||||
class EtlServiceUnavailableError(Exception):
|
||||
"""Raised when the configured ETL_SERVICE is not recognised."""
|
||||
|
||||
|
||||
class EtlUnsupportedFileError(Exception):
|
||||
"""Raised when a file type cannot be parsed by any ETL pipeline."""
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
from enum import Enum
|
||||
from pathlib import PurePosixPath
|
||||
|
||||
from app.utils.file_extensions import DOCUMENT_EXTENSIONS
|
||||
|
||||
PLAINTEXT_EXTENSIONS = frozenset(
|
||||
{
|
||||
|
|
@ -35,6 +36,7 @@ class FileCategory(Enum):
|
|||
PLAINTEXT = "plaintext"
|
||||
AUDIO = "audio"
|
||||
DIRECT_CONVERT = "direct_convert"
|
||||
UNSUPPORTED = "unsupported"
|
||||
DOCUMENT = "document"
|
||||
|
||||
|
||||
|
|
@ -46,4 +48,6 @@ def classify_file(filename: str) -> FileCategory:
|
|||
return FileCategory.AUDIO
|
||||
if suffix in DIRECT_CONVERT_EXTENSIONS:
|
||||
return FileCategory.DIRECT_CONVERT
|
||||
return FileCategory.DOCUMENT
|
||||
if suffix in DOCUMENT_EXTENSIONS:
|
||||
return FileCategory.DOCUMENT
|
||||
return FileCategory.UNSUPPORTED
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue