mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-28 10:26:33 +02:00
refactor: unify file skipping logic across Dropbox, Google Drive, and OneDrive connectors by replacing classification checks with a centralized service-based approach, enhancing maintainability and consistency in file handling
This commit is contained in:
parent
f03bf05aaa
commit
e7beeb2a36
13 changed files with 388 additions and 67 deletions
|
|
@ -1,7 +1,7 @@
|
|||
from enum import Enum
|
||||
from pathlib import PurePosixPath
|
||||
|
||||
from app.utils.file_extensions import DOCUMENT_EXTENSIONS
|
||||
from app.utils.file_extensions import DOCUMENT_EXTENSIONS, get_document_extensions_for_service
|
||||
|
||||
PLAINTEXT_EXTENSIONS = frozenset(
|
||||
{
|
||||
|
|
@ -29,7 +29,7 @@ AUDIO_EXTENSIONS = frozenset(
|
|||
{".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"}
|
||||
)
|
||||
|
||||
DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm"})
|
||||
DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm", ".xhtml"})
|
||||
|
||||
|
||||
class FileCategory(Enum):
|
||||
|
|
@ -51,3 +51,18 @@ def classify_file(filename: str) -> FileCategory:
|
|||
if suffix in DOCUMENT_EXTENSIONS:
|
||||
return FileCategory.DOCUMENT
|
||||
return FileCategory.UNSUPPORTED
|
||||
|
||||
|
||||
def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
|
||||
"""Return True if *filename* cannot be processed by *etl_service*.
|
||||
|
||||
Plaintext, audio, and direct-convert files are parser-agnostic and never
|
||||
skipped. Document files are checked against the per-parser extension set.
|
||||
"""
|
||||
category = classify_file(filename)
|
||||
if category == FileCategory.UNSUPPORTED:
|
||||
return True
|
||||
if category == FileCategory.DOCUMENT:
|
||||
suffix = PurePosixPath(filename).suffix.lower()
|
||||
return suffix not in get_document_extensions_for_service(etl_service)
|
||||
return False
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue