mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-02 04:12:47 +02:00
refactor: unify file skipping logic across Dropbox, Google Drive, and OneDrive connectors by replacing classification checks with a centralized service-based approach, enhancing maintainability and consistency in file handling
This commit is contained in:
parent
f03bf05aaa
commit
e7beeb2a36
13 changed files with 388 additions and 67 deletions
|
|
@ -1,6 +1,6 @@
|
|||
"""File type handlers for Dropbox."""
|
||||
|
||||
from app.etl_pipeline.file_classifier import FileCategory, classify_file
|
||||
from app.etl_pipeline.file_classifier import should_skip_for_service
|
||||
|
||||
PAPER_EXTENSION = ".paper"
|
||||
|
||||
|
|
@ -53,5 +53,7 @@ def should_skip_file(item: dict) -> bool:
|
|||
return False
|
||||
if not item.get("is_downloadable", True):
|
||||
return True
|
||||
from app.config import config as app_config
|
||||
|
||||
name = item.get("name", "")
|
||||
return classify_file(name) == FileCategory.UNSUPPORTED
|
||||
return should_skip_for_service(name, app_config.ETL_SERVICE)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
"""File type handlers for Google Drive."""
|
||||
|
||||
from app.etl_pipeline.file_classifier import FileCategory, classify_file
|
||||
from app.etl_pipeline.file_classifier import should_skip_for_service
|
||||
|
||||
GOOGLE_DOC = "application/vnd.google-apps.document"
|
||||
GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet"
|
||||
|
|
@ -49,8 +49,10 @@ def should_skip_file(mime_type: str) -> bool:
|
|||
|
||||
|
||||
def should_skip_by_extension(filename: str) -> bool:
|
||||
"""Return True if the file extension is not parseable by any ETL pipeline."""
|
||||
return classify_file(filename) == FileCategory.UNSUPPORTED
|
||||
"""Return True if the file extension is not parseable by the configured ETL service."""
|
||||
from app.config import config as app_config
|
||||
|
||||
return should_skip_for_service(filename, app_config.ETL_SERVICE)
|
||||
|
||||
|
||||
def get_export_mime_type(mime_type: str) -> str | None:
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
"""File type handlers for Microsoft OneDrive."""
|
||||
|
||||
from app.etl_pipeline.file_classifier import FileCategory, classify_file
|
||||
from app.etl_pipeline.file_classifier import should_skip_for_service
|
||||
|
||||
ONEDRIVE_FOLDER_FACET = "folder"
|
||||
ONENOTE_MIME = "application/msonenote"
|
||||
|
|
@ -51,5 +51,7 @@ def should_skip_file(item: dict) -> bool:
|
|||
mime = item.get("file", {}).get("mimeType", "")
|
||||
if mime in SKIP_MIME_TYPES:
|
||||
return True
|
||||
from app.config import config as app_config
|
||||
|
||||
name = item.get("name", "")
|
||||
return classify_file(name) == FileCategory.UNSUPPORTED
|
||||
return should_skip_for_service(name, app_config.ETL_SERVICE)
|
||||
|
|
|
|||
|
|
@ -45,6 +45,10 @@ class EtlPipelineService:
|
|||
return await self._extract_document(request)
|
||||
|
||||
async def _extract_document(self, request: EtlRequest) -> EtlResult:
|
||||
from pathlib import PurePosixPath
|
||||
|
||||
from app.utils.file_extensions import get_document_extensions_for_service
|
||||
|
||||
etl_service = app_config.ETL_SERVICE
|
||||
if not etl_service:
|
||||
raise EtlServiceUnavailableError(
|
||||
|
|
@ -52,6 +56,13 @@ class EtlPipelineService:
|
|||
"Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env"
|
||||
)
|
||||
|
||||
ext = PurePosixPath(request.filename).suffix.lower()
|
||||
supported = get_document_extensions_for_service(etl_service)
|
||||
if ext not in supported:
|
||||
raise EtlUnsupportedFileError(
|
||||
f"File type {ext} is not supported by {etl_service}"
|
||||
)
|
||||
|
||||
if etl_service == "DOCLING":
|
||||
from app.etl_pipeline.parsers.docling import parse_with_docling
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
from enum import Enum
|
||||
from pathlib import PurePosixPath
|
||||
|
||||
from app.utils.file_extensions import DOCUMENT_EXTENSIONS
|
||||
from app.utils.file_extensions import DOCUMENT_EXTENSIONS, get_document_extensions_for_service
|
||||
|
||||
PLAINTEXT_EXTENSIONS = frozenset(
|
||||
{
|
||||
|
|
@ -29,7 +29,7 @@ AUDIO_EXTENSIONS = frozenset(
|
|||
{".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"}
|
||||
)
|
||||
|
||||
DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm"})
|
||||
DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm", ".xhtml"})
|
||||
|
||||
|
||||
class FileCategory(Enum):
|
||||
|
|
@ -51,3 +51,18 @@ def classify_file(filename: str) -> FileCategory:
|
|||
if suffix in DOCUMENT_EXTENSIONS:
|
||||
return FileCategory.DOCUMENT
|
||||
return FileCategory.UNSUPPORTED
|
||||
|
||||
|
||||
def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
    """Decide whether *filename* must be skipped for the given ETL service.

    The file is first routed through ``classify_file``:

    * ``UNSUPPORTED`` -- always skipped.
    * ``DOCUMENT`` -- skipped only when its lower-cased suffix is absent from
      the set returned by ``get_document_extensions_for_service(etl_service)``.
    * any other category (plaintext, audio, direct-convert) -- never skipped,
      because those paths do not depend on the configured parser.
    """
    kind = classify_file(filename)
    if kind == FileCategory.UNSUPPORTED:
        return True
    if kind != FileCategory.DOCUMENT:
        # Parser-agnostic categories pass through untouched.
        return False

    extension = PurePosixPath(filename).suffix.lower()
    allowed = get_document_extensions_for_service(etl_service)
    return extension not in allowed
|
||||
|
|
|
|||
|
|
@ -1,29 +1,69 @@
|
|||
"""Allowlist of document extensions the ETL parsers can handle.
|
||||
"""Per-parser document extension sets for the ETL pipeline.
|
||||
|
||||
Every consumer (file_classifier, connector-level skip checks) imports from
|
||||
here so there is a single source of truth. Extensions already covered by
|
||||
PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or DIRECT_CONVERT_EXTENSIONS in
|
||||
file_classifier are NOT repeated here -- this set is exclusively for the
|
||||
"document" ETL path (Docling / LlamaParse / Unstructured).
|
||||
Every consumer (file_classifier, connector-level skip checks, ETL pipeline
|
||||
validation) imports from here so there is a single source of truth.
|
||||
|
||||
Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or
|
||||
DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
|
||||
sets are exclusively for the "document" ETL path (Docling / LlamaParse /
|
||||
Unstructured).
|
||||
"""
|
||||
|
||||
from pathlib import PurePosixPath
|
||||
|
||||
DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
|
||||
# PDF
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-parser document extension sets (from official documentation)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
DOCLING_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
|
||||
".pdf",
|
||||
# Microsoft Office
|
||||
".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
|
||||
# Images (raster: OCR / vision parsing)
|
||||
".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif",
|
||||
# Rich text / e-book
|
||||
".rtf", ".epub",
|
||||
# OpenDocument
|
||||
".odt", ".ods", ".odp",
|
||||
# Other (LlamaParse / Unstructured specific)
|
||||
".hwpx",
|
||||
".docx", ".xlsx", ".pptx",
|
||||
".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp",
|
||||
})
|
||||
|
||||
# Document extensions routed to LlamaParse (service key "LLAMACLOUD").
LLAMAPARSE_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
    (
        # PDF
        ".pdf",
        # Microsoft Office
        ".doc", ".docx", ".docm", ".dot", ".dotm",
        ".xls", ".xlsx", ".xlsm", ".xlsb", ".xlw",
        ".ppt", ".pptx", ".pptm", ".pot", ".potx",
        # Rich text / e-book
        ".rtf", ".epub",
        # Images
        ".png", ".jpg", ".jpeg", ".gif", ".bmp",
        ".tiff", ".tif", ".webp", ".svg",
        # OpenDocument
        ".odt", ".ods", ".odp",
        # Other
        ".hwp", ".hwpx",
    )
)

# Document extensions routed to Unstructured (service key "UNSTRUCTURED").
UNSTRUCTURED_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
    (
        # PDF
        ".pdf",
        # Microsoft Office
        ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
        # Images
        ".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif", ".heic",
        # Rich text / e-book / OpenDocument
        ".rtf", ".epub", ".odt",
        # Other
        ".eml", ".msg", ".p7s",
    )
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Union (used by classify_file for routing) + service lookup
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Every extension at least one parser accepts.
DOCUMENT_EXTENSIONS: frozenset[str] = frozenset().union(
    DOCLING_DOCUMENT_EXTENSIONS,
    LLAMAPARSE_DOCUMENT_EXTENSIONS,
    UNSTRUCTURED_DOCUMENT_EXTENSIONS,
)

# ETL service name -> extension set accepted by that parser.
_SERVICE_MAP: dict[str, frozenset[str]] = dict(
    DOCLING=DOCLING_DOCUMENT_EXTENSIONS,
    LLAMACLOUD=LLAMAPARSE_DOCUMENT_EXTENSIONS,
    UNSTRUCTURED=UNSTRUCTURED_DOCUMENT_EXTENSIONS,
)
|
||||
|
||||
|
||||
def get_document_extensions_for_service(etl_service: str | None) -> frozenset[str]:
    """Look up the document extension set for *etl_service*.

    A falsy service (``None`` or an empty string) and any name missing from
    ``_SERVICE_MAP`` both resolve to ``DOCUMENT_EXTENSIONS``.
    """
    service_key = etl_service if etl_service else ""
    try:
        return _SERVICE_MAP[service_key]
    except KeyError:
        return DOCUMENT_EXTENSIONS
|
||||
|
||||
|
||||
def is_supported_document_extension(filename: str) -> bool:
|
||||
"""Return True if the file's extension is in the supported document set."""
|
||||
|
|
|
|||
|
|
@ -261,6 +261,8 @@ def full_scan_mocks(mock_dropbox_client, monkeypatch):
|
|||
|
||||
skip_results: dict[str, tuple[bool, str | None]] = {}
|
||||
|
||||
monkeypatch.setattr("app.config.config.ETL_SERVICE", "LLAMACLOUD")
|
||||
|
||||
async def _fake_skip(session, file, search_space_id):
|
||||
from app.connectors.dropbox.file_types import should_skip_file as _skip
|
||||
if _skip(file):
|
||||
|
|
|
|||
|
|
@ -7,6 +7,11 @@ from app.connectors.dropbox.file_types import should_skip_file
|
|||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Structural skips (independent of ETL service)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_folder_item_is_skipped():
|
||||
item = {".tag": "folder", "name": "My Folder"}
|
||||
assert should_skip_file(item) is True
|
||||
|
|
@ -22,13 +27,18 @@ def test_non_downloadable_item_is_skipped():
|
|||
assert should_skip_file(item) is True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Extension-based skips (require ETL service context)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
[
|
||||
"archive.zip", "backup.tar", "data.gz", "stuff.rar", "pack.7z",
|
||||
"program.exe", "lib.dll", "module.so", "image.dmg", "disk.iso",
|
||||
"movie.mov", "clip.avi", "video.mkv", "film.wmv", "stream.flv",
|
||||
"icon.svg", "anim.gif", "photo.webp", "shot.heic", "favicon.ico",
|
||||
"favicon.ico",
|
||||
"raw.cr2", "photo.nef", "image.arw", "pic.dng",
|
||||
"design.psd", "vector.ai", "mockup.sketch", "proto.fig",
|
||||
"font.ttf", "font.otf", "font.woff", "font.woff2",
|
||||
|
|
@ -36,7 +46,8 @@ def test_non_downloadable_item_is_skipped():
|
|||
"local.db", "data.sqlite", "access.mdb",
|
||||
],
|
||||
)
|
||||
def test_non_parseable_extensions_are_skipped(filename):
|
||||
def test_non_parseable_extensions_are_skipped(filename, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is True, f"{filename} should be skipped"
|
||||
|
||||
|
|
@ -45,29 +56,61 @@ def test_non_parseable_extensions_are_skipped(filename):
|
|||
"filename",
|
||||
[
|
||||
"report.pdf", "document.docx", "sheet.xlsx", "slides.pptx",
|
||||
"old.doc", "legacy.xls", "deck.ppt",
|
||||
"readme.txt", "data.csv", "page.html", "notes.md",
|
||||
"config.json", "feed.xml",
|
||||
],
|
||||
)
|
||||
def test_parseable_documents_are_not_skipped(filename):
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
|
||||
def test_parseable_documents_are_not_skipped(filename, mocker):
|
||||
"""Files in plaintext/direct_convert/universal document sets are never skipped."""
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
["photo.jpg", "image.jpeg", "screenshot.png", "scan.bmp", "page.tiff", "doc.tif"],
|
||||
)
|
||||
def test_universal_images_are_not_skipped(filename):
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
|
||||
def test_universal_images_are_not_skipped(filename, mocker):
|
||||
"""Images supported by all parsers are never skipped."""
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
["icon.svg", "anim.gif", "photo.webp", "live.heic"],
|
||||
)
|
||||
def test_non_universal_images_are_skipped(filename):
|
||||
@pytest.mark.parametrize("filename,service,expected_skip", [
|
||||
("old.doc", "DOCLING", True),
|
||||
("old.doc", "LLAMACLOUD", False),
|
||||
("old.doc", "UNSTRUCTURED", False),
|
||||
("legacy.xls", "DOCLING", True),
|
||||
("legacy.xls", "LLAMACLOUD", False),
|
||||
("legacy.xls", "UNSTRUCTURED", False),
|
||||
("deck.ppt", "DOCLING", True),
|
||||
("deck.ppt", "LLAMACLOUD", False),
|
||||
("deck.ppt", "UNSTRUCTURED", False),
|
||||
("icon.svg", "DOCLING", True),
|
||||
("icon.svg", "LLAMACLOUD", False),
|
||||
("anim.gif", "DOCLING", True),
|
||||
("anim.gif", "LLAMACLOUD", False),
|
||||
("photo.webp", "DOCLING", False),
|
||||
("photo.webp", "LLAMACLOUD", False),
|
||||
("photo.webp", "UNSTRUCTURED", True),
|
||||
("live.heic", "DOCLING", True),
|
||||
("live.heic", "UNSTRUCTURED", False),
|
||||
("macro.docm", "DOCLING", True),
|
||||
("macro.docm", "LLAMACLOUD", False),
|
||||
("mail.eml", "DOCLING", True),
|
||||
("mail.eml", "UNSTRUCTURED", False),
|
||||
])
|
||||
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is True, f"{filename} should be skipped"
|
||||
assert should_skip_file(item) is expected_skip, (
|
||||
f"{filename} with {service}: expected skip={expected_skip}"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -10,13 +10,38 @@ pytestmark = pytest.mark.unit
|
|||
@pytest.mark.parametrize("filename", [
|
||||
"malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
|
||||
])
|
||||
def test_unsupported_extensions_are_skipped(filename):
|
||||
assert should_skip_by_extension(filename) is True
|
||||
def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mocker):
|
||||
"""Truly unsupported files are skipped no matter which ETL service is configured."""
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
assert should_skip_by_extension(filename) is True
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [
|
||||
"report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
|
||||
"readme.txt", "data.csv", "photo.png", "notes.md",
|
||||
])
|
||||
def test_parseable_extensions_are_not_skipped(filename):
|
||||
assert should_skip_by_extension(filename) is False
|
||||
def test_universal_extensions_are_not_skipped(filename, mocker):
|
||||
"""Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped."""
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
assert should_skip_by_extension(filename) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("filename", "service", "expected_skip"),
    [
        ("macro.docm", "DOCLING", True),
        ("macro.docm", "LLAMACLOUD", False),
        ("mail.eml", "DOCLING", True),
        ("mail.eml", "UNSTRUCTURED", False),
        ("photo.gif", "DOCLING", True),
        ("photo.gif", "LLAMACLOUD", False),
        ("photo.heic", "UNSTRUCTURED", False),
        ("photo.heic", "DOCLING", True),
    ],
)
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
    """The skip decision for parser-specific extensions follows the patched ETL_SERVICE."""
    mocker.patch("app.config.config.ETL_SERVICE", service)

    outcome = should_skip_by_extension(filename)

    assert outcome is expected_skip, (
        f"{filename} with {service}: expected skip={expected_skip}"
    )
|
||||
|
|
|
|||
|
|
@ -7,6 +7,11 @@ from app.connectors.onedrive.file_types import should_skip_file
|
|||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Structural skips (independent of ETL service)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_folder_is_skipped():
|
||||
item = {"folder": {}, "name": "My Folder"}
|
||||
assert should_skip_file(item) is True
|
||||
|
|
@ -27,10 +32,16 @@ def test_onenote_is_skipped():
|
|||
assert should_skip_file(item) is True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Extension-based skips (require ETL service context)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [
|
||||
"malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
|
||||
])
|
||||
def test_unsupported_extensions_are_skipped(filename):
|
||||
def test_unsupported_extensions_are_skipped(filename, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
|
||||
assert should_skip_file(item) is True, f"{filename} should be skipped"
|
||||
|
||||
|
|
@ -39,6 +50,26 @@ def test_unsupported_extensions_are_skipped(filename):
|
|||
"report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
|
||||
"readme.txt", "data.csv", "photo.png", "notes.md",
|
||||
])
|
||||
def test_parseable_files_are_not_skipped(filename):
|
||||
def test_universal_files_are_not_skipped(filename, mocker):
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
|
||||
assert should_skip_file(item) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename,service,expected_skip", [
|
||||
("macro.docm", "DOCLING", True),
|
||||
("macro.docm", "LLAMACLOUD", False),
|
||||
("mail.eml", "DOCLING", True),
|
||||
("mail.eml", "UNSTRUCTURED", False),
|
||||
("photo.heic", "UNSTRUCTURED", False),
|
||||
("photo.heic", "DOCLING", True),
|
||||
])
|
||||
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
|
||||
assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
|
||||
assert should_skip_file(item) is expected_skip, (
|
||||
f"{filename} with {service}: expected skip={expected_skip}"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -377,3 +377,72 @@ async def test_extract_zip_raises_unsupported_error(tmp_path):
|
|||
await EtlPipelineService().extract(
|
||||
EtlRequest(file_path=str(zip_file), filename="archive.zip")
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Slice 14 – should_skip_for_service (per-parser document filtering)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("filename", "etl_service", "expected_skip"),
    [
        ("file.eml", "DOCLING", True),
        ("file.eml", "UNSTRUCTURED", False),
        ("file.docm", "LLAMACLOUD", False),
        ("file.docm", "DOCLING", True),
        ("file.txt", "DOCLING", False),
        ("file.csv", "LLAMACLOUD", False),
        ("file.mp3", "UNSTRUCTURED", False),
        ("file.exe", "LLAMACLOUD", True),
        ("file.pdf", "DOCLING", False),
        ("file.webp", "DOCLING", False),
        ("file.webp", "UNSTRUCTURED", True),
        ("file.gif", "LLAMACLOUD", False),
        ("file.gif", "DOCLING", True),
        ("file.heic", "UNSTRUCTURED", False),
        ("file.heic", "DOCLING", True),
        ("file.svg", "LLAMACLOUD", False),
        ("file.svg", "DOCLING", True),
        ("file.p7s", "UNSTRUCTURED", False),
        ("file.p7s", "LLAMACLOUD", True),
    ],
)
def test_should_skip_for_service(filename, etl_service, expected_skip):
    """``should_skip_for_service`` returns the expected flag for each file/service pair."""
    from app.etl_pipeline.file_classifier import should_skip_for_service

    outcome = should_skip_for_service(filename, etl_service)

    assert outcome is expected_skip, (
        f"{filename} with {etl_service}: expected skip={expected_skip}"
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Slice 14b – ETL pipeline rejects per-parser incompatible documents
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def test_extract_docm_with_docling_raises_unsupported(tmp_path, mocker):
    """With ETL_SERVICE patched to DOCLING, extracting a ``.docm`` raises EtlUnsupportedFileError."""
    from app.etl_pipeline.exceptions import EtlUnsupportedFileError

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    # Ten NUL bytes are enough: the test expects rejection based on the name.
    source = tmp_path / "macro.docm"
    source.write_bytes(bytes(10))

    with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"):
        pipeline = EtlPipelineService()
        request = EtlRequest(file_path=str(source), filename="macro.docm")
        await pipeline.extract(request)
|
||||
|
||||
|
||||
async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker):
    """With ETL_SERVICE patched to DOCLING, extracting an ``.eml`` raises EtlUnsupportedFileError."""
    from app.etl_pipeline.exceptions import EtlUnsupportedFileError

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    source = tmp_path / "mail.eml"
    source.write_bytes(b"From: test@example.com")

    with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"):
        pipeline = EtlPipelineService()
        request = EtlRequest(file_path=str(source), filename="mail.eml")
        await pipeline.extract(request)
|
||||
|
|
|
|||
|
|
@ -21,10 +21,17 @@ def test_exe_is_not_supported_document():
|
|||
"report.pdf", "doc.docx", "old.doc",
|
||||
"sheet.xlsx", "legacy.xls",
|
||||
"slides.pptx", "deck.ppt",
|
||||
"macro.docm", "macro.xlsm", "macro.pptm",
|
||||
"photo.png", "photo.jpg", "photo.jpeg", "scan.bmp", "scan.tiff", "scan.tif",
|
||||
"photo.webp", "anim.gif", "iphone.heic",
|
||||
"manual.rtf", "book.epub",
|
||||
"letter.odt", "data.ods", "presentation.odp",
|
||||
"korean.hwpx",
|
||||
"inbox.eml", "outlook.msg",
|
||||
"korean.hwpx", "korean.hwp",
|
||||
"template.dot", "template.dotm",
|
||||
"template.pot", "template.potx",
|
||||
"binary.xlsb", "workspace.xlw",
|
||||
"vector.svg", "signature.p7s",
|
||||
])
|
||||
def test_document_extensions_are_supported(filename):
|
||||
from app.utils.file_extensions import is_supported_document_extension
|
||||
|
|
@ -40,3 +47,70 @@ def test_non_document_extensions_are_not_supported(filename):
|
|||
from app.utils.file_extensions import is_supported_document_extension
|
||||
|
||||
assert is_supported_document_extension(filename) is False, f"{filename} should NOT be supported"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-parser extension sets
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_union_equals_all_three_sets():
    """``DOCUMENT_EXTENSIONS`` equals the union of the three per-parser sets."""
    from app.utils import file_extensions as fx

    combined = frozenset().union(
        fx.DOCLING_DOCUMENT_EXTENSIONS,
        fx.LLAMAPARSE_DOCUMENT_EXTENSIONS,
        fx.UNSTRUCTURED_DOCUMENT_EXTENSIONS,
    )

    assert fx.DOCUMENT_EXTENSIONS == combined
|
||||
|
||||
|
||||
def test_get_extensions_for_docling():
    """The DOCLING set contains pdf/webp/docx and excludes eml/docm/gif/heic."""
    from app.utils.file_extensions import get_document_extensions_for_service

    supported = get_document_extensions_for_service("DOCLING")

    for present in (".pdf", ".webp", ".docx"):
        assert present in supported
    for absent in (".eml", ".docm", ".gif", ".heic"):
        assert absent not in supported
|
||||
|
||||
|
||||
def test_get_extensions_for_llamacloud():
    """The LLAMACLOUD set contains docm/gif/svg/hwp and excludes eml/heic."""
    from app.utils.file_extensions import get_document_extensions_for_service

    supported = get_document_extensions_for_service("LLAMACLOUD")

    for present in (".docm", ".gif", ".svg", ".hwp"):
        assert present in supported
    for absent in (".eml", ".heic"):
        assert absent not in supported
|
||||
|
||||
|
||||
def test_get_extensions_for_unstructured():
    """The UNSTRUCTURED set contains eml/heic/p7s and excludes docm/gif/svg."""
    from app.utils.file_extensions import get_document_extensions_for_service

    supported = get_document_extensions_for_service("UNSTRUCTURED")

    for present in (".eml", ".heic", ".p7s"):
        assert present in supported
    for absent in (".docm", ".gif", ".svg"):
        assert absent not in supported
|
||||
|
||||
|
||||
def test_get_extensions_for_none_returns_union():
    """Passing ``None`` as the service yields the full ``DOCUMENT_EXTENSIONS`` set."""
    from app.utils import file_extensions as fx

    fallback = fx.get_document_extensions_for_service(None)

    assert fallback == fx.DOCUMENT_EXTENSIONS
|
||||
|
|
|
|||
|
|
@ -85,7 +85,6 @@ const FILE_TYPE_CONFIG: Record<string, Record<string, string[]>> = {
|
|||
"application/rtf": [".rtf"],
|
||||
"application/xml": [".xml"],
|
||||
"application/epub+zip": [".epub"],
|
||||
"text/html": [".html", ".htm", ".web"],
|
||||
"image/gif": [".gif"],
|
||||
"image/svg+xml": [".svg"],
|
||||
...audioFileTypes,
|
||||
|
|
@ -472,12 +471,13 @@ export function DocumentUploadTab({
|
|||
</button>
|
||||
))
|
||||
) : (
|
||||
<div
|
||||
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer"
|
||||
onClick={() => {
|
||||
if (!isElectron) fileInputRef.current?.click();
|
||||
}}
|
||||
>
|
||||
<button
|
||||
type="button"
|
||||
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent border-none"
|
||||
onClick={() => {
|
||||
if (!isElectron) fileInputRef.current?.click();
|
||||
}}
|
||||
>
|
||||
<Upload className="h-10 w-10 text-muted-foreground" />
|
||||
<div className="text-center space-y-1.5">
|
||||
<p className="text-base font-medium">
|
||||
|
|
@ -485,10 +485,11 @@ export function DocumentUploadTab({
|
|||
</p>
|
||||
<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
|
||||
</div>
|
||||
<div className="w-full mt-1" onClick={(e) => e.stopPropagation()}>
|
||||
{renderBrowseButton({ fullWidth: true })}
|
||||
</div>
|
||||
{/* biome-ignore lint/a11y/useSemanticElements: wrapper to stop click propagation to parent button */}
|
||||
<div className="w-full mt-1" onClick={(e) => e.stopPropagation()} onKeyDown={(e) => e.stopPropagation()} role="group">
|
||||
{renderBrowseButton({ fullWidth: true })}
|
||||
</div>
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
|
||||
|
|
@ -683,13 +684,17 @@ export function DocumentUploadTab({
|
|||
</span>
|
||||
</AccordionTrigger>
|
||||
<AccordionContent className="px-3 pb-3">
|
||||
<div className="flex flex-wrap gap-1">
|
||||
{supportedExtensions.map((ext) => (
|
||||
<Badge key={ext} variant="outline" className="text-[10px] px-1.5 py-0">
|
||||
{ext}
|
||||
</Badge>
|
||||
))}
|
||||
</div>
|
||||
<div className="flex flex-wrap gap-1.5">
|
||||
{supportedExtensions.map((ext) => (
|
||||
<Badge
|
||||
key={ext}
|
||||
variant="secondary"
|
||||
className="rounded border-0 bg-neutral-200/80 dark:bg-neutral-700/60 text-muted-foreground text-[10px] px-2 py-0.5 font-normal"
|
||||
>
|
||||
{ext}
|
||||
</Badge>
|
||||
))}
|
||||
</div>
|
||||
</AccordionContent>
|
||||
</AccordionItem>
|
||||
</Accordion>
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue