mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-12 01:02:39 +02:00
refactor: unify file skipping logic across Dropbox, Google Drive, and OneDrive connectors by replacing classification checks with a centralized service-based approach, enhancing maintainability and consistency in file handling
This commit is contained in:
parent
f03bf05aaa
commit
e7beeb2a36
13 changed files with 388 additions and 67 deletions
|
|
@ -1,6 +1,6 @@
|
|||
"""File type handlers for Dropbox."""
|
||||
|
||||
from app.etl_pipeline.file_classifier import FileCategory, classify_file
|
||||
from app.etl_pipeline.file_classifier import should_skip_for_service
|
||||
|
||||
PAPER_EXTENSION = ".paper"
|
||||
|
||||
|
|
@ -53,5 +53,7 @@ def should_skip_file(item: dict) -> bool:
|
|||
return False
|
||||
if not item.get("is_downloadable", True):
|
||||
return True
|
||||
from app.config import config as app_config
|
||||
|
||||
name = item.get("name", "")
|
||||
return classify_file(name) == FileCategory.UNSUPPORTED
|
||||
return should_skip_for_service(name, app_config.ETL_SERVICE)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
"""File type handlers for Google Drive."""
|
||||
|
||||
from app.etl_pipeline.file_classifier import FileCategory, classify_file
|
||||
from app.etl_pipeline.file_classifier import should_skip_for_service
|
||||
|
||||
GOOGLE_DOC = "application/vnd.google-apps.document"
|
||||
GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet"
|
||||
|
|
@ -49,8 +49,10 @@ def should_skip_file(mime_type: str) -> bool:
|
|||
|
||||
|
||||
def should_skip_by_extension(filename: str) -> bool:
    """Return True if the file extension is not parseable by the configured ETL service.

    The decision is delegated to ``should_skip_for_service``, passing the
    ``ETL_SERVICE`` value read from the application config.
    """
    # Imported at call time (kept function-local, as in the original change).
    from app.config import config as app_config

    return should_skip_for_service(filename, app_config.ETL_SERVICE)
|
||||
|
||||
|
||||
def get_export_mime_type(mime_type: str) -> str | None:
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
"""File type handlers for Microsoft OneDrive."""
|
||||
|
||||
from app.etl_pipeline.file_classifier import FileCategory, classify_file
|
||||
from app.etl_pipeline.file_classifier import should_skip_for_service
|
||||
|
||||
ONEDRIVE_FOLDER_FACET = "folder"
|
||||
ONENOTE_MIME = "application/msonenote"
|
||||
|
|
@ -51,5 +51,7 @@ def should_skip_file(item: dict) -> bool:
|
|||
mime = item.get("file", {}).get("mimeType", "")
|
||||
if mime in SKIP_MIME_TYPES:
|
||||
return True
|
||||
from app.config import config as app_config
|
||||
|
||||
name = item.get("name", "")
|
||||
return classify_file(name) == FileCategory.UNSUPPORTED
|
||||
return should_skip_for_service(name, app_config.ETL_SERVICE)
|
||||
|
|
|
|||
|
|
@ -45,6 +45,10 @@ class EtlPipelineService:
|
|||
return await self._extract_document(request)
|
||||
|
||||
async def _extract_document(self, request: EtlRequest) -> EtlResult:
|
||||
from pathlib import PurePosixPath
|
||||
|
||||
from app.utils.file_extensions import get_document_extensions_for_service
|
||||
|
||||
etl_service = app_config.ETL_SERVICE
|
||||
if not etl_service:
|
||||
raise EtlServiceUnavailableError(
|
||||
|
|
@ -52,6 +56,13 @@ class EtlPipelineService:
|
|||
"Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env"
|
||||
)
|
||||
|
||||
ext = PurePosixPath(request.filename).suffix.lower()
|
||||
supported = get_document_extensions_for_service(etl_service)
|
||||
if ext not in supported:
|
||||
raise EtlUnsupportedFileError(
|
||||
f"File type {ext} is not supported by {etl_service}"
|
||||
)
|
||||
|
||||
if etl_service == "DOCLING":
|
||||
from app.etl_pipeline.parsers.docling import parse_with_docling
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
from enum import Enum
|
||||
from pathlib import PurePosixPath
|
||||
|
||||
from app.utils.file_extensions import DOCUMENT_EXTENSIONS
|
||||
from app.utils.file_extensions import DOCUMENT_EXTENSIONS, get_document_extensions_for_service
|
||||
|
||||
PLAINTEXT_EXTENSIONS = frozenset(
|
||||
{
|
||||
|
|
@ -29,7 +29,7 @@ AUDIO_EXTENSIONS = frozenset(
|
|||
{".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"}
|
||||
)
|
||||
|
||||
# Direct-convert formats; these are parser-agnostic and are never skipped
# on a per-ETL-service basis.
DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm", ".xhtml"})
|
||||
|
||||
|
||||
class FileCategory(Enum):
|
||||
|
|
@ -51,3 +51,18 @@ def classify_file(filename: str) -> FileCategory:
|
|||
if suffix in DOCUMENT_EXTENSIONS:
|
||||
return FileCategory.DOCUMENT
|
||||
return FileCategory.UNSUPPORTED
|
||||
|
||||
|
||||
def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
    """Decide whether *filename* must be skipped under *etl_service*.

    Files classified as unsupported are always skipped. Document files are
    skipped when their lower-cased suffix is absent from the extension set
    returned by ``get_document_extensions_for_service``. Every other
    category (plaintext, audio, direct-convert) is parser-agnostic and kept.
    """
    kind = classify_file(filename)
    if kind == FileCategory.DOCUMENT:
        # Only the document path depends on which parser is configured.
        ext = PurePosixPath(filename).suffix.lower()
        allowed = get_document_extensions_for_service(etl_service)
        return ext not in allowed
    return kind == FileCategory.UNSUPPORTED
|
||||
|
|
|
|||
|
|
@ -1,29 +1,69 @@
|
|||
"""Allowlist of document extensions the ETL parsers can handle.
|
||||
"""Per-parser document extension sets for the ETL pipeline.
|
||||
|
||||
Every consumer (file_classifier, connector-level skip checks) imports from
|
||||
here so there is a single source of truth. Extensions already covered by
|
||||
PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or DIRECT_CONVERT_EXTENSIONS in
|
||||
file_classifier are NOT repeated here -- this set is exclusively for the
|
||||
"document" ETL path (Docling / LlamaParse / Unstructured).
|
||||
Every consumer (file_classifier, connector-level skip checks, ETL pipeline
|
||||
validation) imports from here so there is a single source of truth.
|
||||
|
||||
Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or
|
||||
DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
|
||||
sets are exclusively for the "document" ETL path (Docling / LlamaParse /
|
||||
Unstructured).
|
||||
"""
|
||||
|
||||
from pathlib import PurePosixPath
|
||||
|
||||
# ---------------------------------------------------------------------------
# Per-parser document extension sets (from official documentation)
# ---------------------------------------------------------------------------

# Document extensions accepted when the ETL service is Docling. Entries that
# belonged to the former single allowlist (.doc, .xls, .ppt, .rtf, .epub,
# .odt, .ods, .odp, .hwpx) are intentionally absent from this set.
DOCLING_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
    # PDF
    ".pdf",
    # Microsoft Office
    ".docx", ".xlsx", ".pptx",
    # Images
    ".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp",
})
|
||||
|
||||
# Document extensions accepted when the ETL service is LlamaParse.
LLAMAPARSE_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
    (
        # PDF
        ".pdf",
        # Microsoft Office
        ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
        # Additional Microsoft Office variants
        ".docm", ".dot", ".dotm",
        ".pptm", ".pot", ".potx",
        ".xlsm", ".xlsb", ".xlw",
        # Rich text / e-book
        ".rtf", ".epub",
        # Images
        ".png", ".jpg", ".jpeg", ".gif", ".bmp",
        ".tiff", ".tif", ".webp", ".svg",
        # OpenDocument
        ".odt", ".ods", ".odp",
        # Other
        ".hwp", ".hwpx",
    )
)
|
||||
|
||||
# Document extensions accepted when the ETL service is Unstructured.
UNSTRUCTURED_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
    (
        # PDF
        ".pdf",
        # Microsoft Office
        ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
        # Images
        ".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif", ".heic",
        # Rich text / e-book / OpenDocument text
        ".rtf", ".epub", ".odt",
        # Other
        ".eml", ".msg", ".p7s",
    )
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Union (used by classify_file for routing) + service lookup
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Union of every per-parser set: any extension at least one parser accepts.
DOCUMENT_EXTENSIONS: frozenset[str] = frozenset().union(
    DOCLING_DOCUMENT_EXTENSIONS,
    LLAMAPARSE_DOCUMENT_EXTENSIONS,
    UNSTRUCTURED_DOCUMENT_EXTENSIONS,
)

# ETL service name -> document extension set for that parser.
_SERVICE_MAP: dict[str, frozenset[str]] = dict(
    DOCLING=DOCLING_DOCUMENT_EXTENSIONS,
    LLAMACLOUD=LLAMAPARSE_DOCUMENT_EXTENSIONS,
    UNSTRUCTURED=UNSTRUCTURED_DOCUMENT_EXTENSIONS,
)
|
||||
|
||||
|
||||
def get_document_extensions_for_service(etl_service: str | None) -> frozenset[str]:
    """Look up the document extensions that *etl_service* can handle.

    A ``None``, empty, or unrecognised service name yields the full union
    ``DOCUMENT_EXTENSIONS``.
    """
    # Falsy values (None, "") are looked up as "", which is never a map key.
    service_key = etl_service if etl_service else ""
    try:
        return _SERVICE_MAP[service_key]
    except KeyError:
        return DOCUMENT_EXTENSIONS
|
||||
|
||||
|
||||
def is_supported_document_extension(filename: str) -> bool:
|
||||
"""Return True if the file's extension is in the supported document set."""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue