mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 08:46:22 +02:00
refactor: unify file skipping logic across Dropbox, Google Drive, and OneDrive connectors by replacing classification checks with a centralized service-based approach, enhancing maintainability and consistency in file handling
This commit is contained in:
parent
f03bf05aaa
commit
e7beeb2a36
13 changed files with 388 additions and 67 deletions
|
|
@ -261,6 +261,8 @@ def full_scan_mocks(mock_dropbox_client, monkeypatch):
|
|||
|
||||
skip_results: dict[str, tuple[bool, str | None]] = {}
|
||||
|
||||
monkeypatch.setattr("app.config.config.ETL_SERVICE", "LLAMACLOUD")
|
||||
|
||||
async def _fake_skip(session, file, search_space_id):
|
||||
from app.connectors.dropbox.file_types import should_skip_file as _skip
|
||||
if _skip(file):
|
||||
|
|
|
|||
|
|
@ -7,6 +7,11 @@ from app.connectors.dropbox.file_types import should_skip_file
|
|||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Structural skips (independent of ETL service)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_folder_item_is_skipped():
|
||||
item = {".tag": "folder", "name": "My Folder"}
|
||||
assert should_skip_file(item) is True
|
||||
|
|
@ -22,13 +27,18 @@ def test_non_downloadable_item_is_skipped():
|
|||
assert should_skip_file(item) is True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Extension-based skips (require ETL service context)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
[
|
||||
"archive.zip", "backup.tar", "data.gz", "stuff.rar", "pack.7z",
|
||||
"program.exe", "lib.dll", "module.so", "image.dmg", "disk.iso",
|
||||
"movie.mov", "clip.avi", "video.mkv", "film.wmv", "stream.flv",
|
||||
"icon.svg", "anim.gif", "photo.webp", "shot.heic", "favicon.ico",
|
||||
"favicon.ico",
|
||||
"raw.cr2", "photo.nef", "image.arw", "pic.dng",
|
||||
"design.psd", "vector.ai", "mockup.sketch", "proto.fig",
|
||||
"font.ttf", "font.otf", "font.woff", "font.woff2",
|
||||
|
|
@ -36,7 +46,8 @@ def test_non_downloadable_item_is_skipped():
|
|||
"local.db", "data.sqlite", "access.mdb",
|
||||
],
|
||||
)
|
||||
def test_non_parseable_extensions_are_skipped(filename):
|
||||
def test_non_parseable_extensions_are_skipped(filename, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is True, f"{filename} should be skipped"
|
||||
|
||||
|
|
@ -45,29 +56,61 @@ def test_non_parseable_extensions_are_skipped(filename):
|
|||
"filename",
|
||||
[
|
||||
"report.pdf", "document.docx", "sheet.xlsx", "slides.pptx",
|
||||
"old.doc", "legacy.xls", "deck.ppt",
|
||||
"readme.txt", "data.csv", "page.html", "notes.md",
|
||||
"config.json", "feed.xml",
|
||||
],
|
||||
)
|
||||
def test_parseable_documents_are_not_skipped(filename):
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
|
||||
def test_parseable_documents_are_not_skipped(filename, mocker):
|
||||
"""Files in plaintext/direct_convert/universal document sets are never skipped."""
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
["photo.jpg", "image.jpeg", "screenshot.png", "scan.bmp", "page.tiff", "doc.tif"],
|
||||
)
|
||||
def test_universal_images_are_not_skipped(filename):
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
|
||||
def test_universal_images_are_not_skipped(filename, mocker):
|
||||
"""Images supported by all parsers are never skipped."""
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
["icon.svg", "anim.gif", "photo.webp", "live.heic"],
|
||||
)
|
||||
def test_non_universal_images_are_skipped(filename):
|
||||
@pytest.mark.parametrize("filename,service,expected_skip", [
|
||||
("old.doc", "DOCLING", True),
|
||||
("old.doc", "LLAMACLOUD", False),
|
||||
("old.doc", "UNSTRUCTURED", False),
|
||||
("legacy.xls", "DOCLING", True),
|
||||
("legacy.xls", "LLAMACLOUD", False),
|
||||
("legacy.xls", "UNSTRUCTURED", False),
|
||||
("deck.ppt", "DOCLING", True),
|
||||
("deck.ppt", "LLAMACLOUD", False),
|
||||
("deck.ppt", "UNSTRUCTURED", False),
|
||||
("icon.svg", "DOCLING", True),
|
||||
("icon.svg", "LLAMACLOUD", False),
|
||||
("anim.gif", "DOCLING", True),
|
||||
("anim.gif", "LLAMACLOUD", False),
|
||||
("photo.webp", "DOCLING", False),
|
||||
("photo.webp", "LLAMACLOUD", False),
|
||||
("photo.webp", "UNSTRUCTURED", True),
|
||||
("live.heic", "DOCLING", True),
|
||||
("live.heic", "UNSTRUCTURED", False),
|
||||
("macro.docm", "DOCLING", True),
|
||||
("macro.docm", "LLAMACLOUD", False),
|
||||
("mail.eml", "DOCLING", True),
|
||||
("mail.eml", "UNSTRUCTURED", False),
|
||||
])
|
||||
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is True, f"{filename} should be skipped"
|
||||
assert should_skip_file(item) is expected_skip, (
|
||||
f"{filename} with {service}: expected skip={expected_skip}"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -10,13 +10,38 @@ pytestmark = pytest.mark.unit
|
|||
@pytest.mark.parametrize("filename", [
|
||||
"malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
|
||||
])
|
||||
def test_unsupported_extensions_are_skipped(filename):
|
||||
assert should_skip_by_extension(filename) is True
|
||||
def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mocker):
|
||||
"""Truly unsupported files are skipped no matter which ETL service is configured."""
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
assert should_skip_by_extension(filename) is True
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [
|
||||
"report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
|
||||
"readme.txt", "data.csv", "photo.png", "notes.md",
|
||||
])
|
||||
def test_parseable_extensions_are_not_skipped(filename):
|
||||
assert should_skip_by_extension(filename) is False
|
||||
def test_universal_extensions_are_not_skipped(filename, mocker):
|
||||
"""Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped."""
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
assert should_skip_by_extension(filename) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename,service,expected_skip", [
|
||||
("macro.docm", "DOCLING", True),
|
||||
("macro.docm", "LLAMACLOUD", False),
|
||||
("mail.eml", "DOCLING", True),
|
||||
("mail.eml", "UNSTRUCTURED", False),
|
||||
("photo.gif", "DOCLING", True),
|
||||
("photo.gif", "LLAMACLOUD", False),
|
||||
("photo.heic", "UNSTRUCTURED", False),
|
||||
("photo.heic", "DOCLING", True),
|
||||
])
|
||||
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
assert should_skip_by_extension(filename) is expected_skip, (
|
||||
f"{filename} with {service}: expected skip={expected_skip}"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -7,6 +7,11 @@ from app.connectors.onedrive.file_types import should_skip_file
|
|||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Structural skips (independent of ETL service)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_folder_is_skipped():
|
||||
item = {"folder": {}, "name": "My Folder"}
|
||||
assert should_skip_file(item) is True
|
||||
|
|
@ -27,10 +32,16 @@ def test_onenote_is_skipped():
|
|||
assert should_skip_file(item) is True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Extension-based skips (require ETL service context)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [
|
||||
"malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
|
||||
])
|
||||
def test_unsupported_extensions_are_skipped(filename):
|
||||
def test_unsupported_extensions_are_skipped(filename, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
|
||||
assert should_skip_file(item) is True, f"{filename} should be skipped"
|
||||
|
||||
|
|
@ -39,6 +50,26 @@ def test_unsupported_extensions_are_skipped(filename):
|
|||
"report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
|
||||
"readme.txt", "data.csv", "photo.png", "notes.md",
|
||||
])
|
||||
def test_parseable_files_are_not_skipped(filename):
|
||||
def test_universal_files_are_not_skipped(filename, mocker):
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
|
||||
assert should_skip_file(item) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename,service,expected_skip", [
|
||||
("macro.docm", "DOCLING", True),
|
||||
("macro.docm", "LLAMACLOUD", False),
|
||||
("mail.eml", "DOCLING", True),
|
||||
("mail.eml", "UNSTRUCTURED", False),
|
||||
("photo.heic", "UNSTRUCTURED", False),
|
||||
("photo.heic", "DOCLING", True),
|
||||
])
|
||||
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
|
||||
assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
|
||||
assert should_skip_file(item) is expected_skip, (
|
||||
f"{filename} with {service}: expected skip={expected_skip}"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -377,3 +377,72 @@ async def test_extract_zip_raises_unsupported_error(tmp_path):
|
|||
await EtlPipelineService().extract(
|
||||
EtlRequest(file_path=str(zip_file), filename="archive.zip")
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Slice 14 – should_skip_for_service (per-parser document filtering)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename,etl_service,expected_skip", [
|
||||
("file.eml", "DOCLING", True),
|
||||
("file.eml", "UNSTRUCTURED", False),
|
||||
("file.docm", "LLAMACLOUD", False),
|
||||
("file.docm", "DOCLING", True),
|
||||
("file.txt", "DOCLING", False),
|
||||
("file.csv", "LLAMACLOUD", False),
|
||||
("file.mp3", "UNSTRUCTURED", False),
|
||||
("file.exe", "LLAMACLOUD", True),
|
||||
("file.pdf", "DOCLING", False),
|
||||
("file.webp", "DOCLING", False),
|
||||
("file.webp", "UNSTRUCTURED", True),
|
||||
("file.gif", "LLAMACLOUD", False),
|
||||
("file.gif", "DOCLING", True),
|
||||
("file.heic", "UNSTRUCTURED", False),
|
||||
("file.heic", "DOCLING", True),
|
||||
("file.svg", "LLAMACLOUD", False),
|
||||
("file.svg", "DOCLING", True),
|
||||
("file.p7s", "UNSTRUCTURED", False),
|
||||
("file.p7s", "LLAMACLOUD", True),
|
||||
])
|
||||
def test_should_skip_for_service(filename, etl_service, expected_skip):
|
||||
from app.etl_pipeline.file_classifier import should_skip_for_service
|
||||
|
||||
assert should_skip_for_service(filename, etl_service) is expected_skip, (
|
||||
f"{filename} with {etl_service}: expected skip={expected_skip}"
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Slice 14b – ETL pipeline rejects per-parser incompatible documents
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def test_extract_docm_with_docling_raises_unsupported(tmp_path, mocker):
|
||||
"""Docling cannot parse .docm -- pipeline should reject before dispatching."""
|
||||
from app.etl_pipeline.exceptions import EtlUnsupportedFileError
|
||||
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
|
||||
docm_file = tmp_path / "macro.docm"
|
||||
docm_file.write_bytes(b"\x00" * 10)
|
||||
|
||||
with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"):
|
||||
await EtlPipelineService().extract(
|
||||
EtlRequest(file_path=str(docm_file), filename="macro.docm")
|
||||
)
|
||||
|
||||
|
||||
async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker):
|
||||
"""Docling cannot parse .eml -- pipeline should reject before dispatching."""
|
||||
from app.etl_pipeline.exceptions import EtlUnsupportedFileError
|
||||
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
|
||||
eml_file = tmp_path / "mail.eml"
|
||||
eml_file.write_bytes(b"From: test@example.com")
|
||||
|
||||
with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"):
|
||||
await EtlPipelineService().extract(
|
||||
EtlRequest(file_path=str(eml_file), filename="mail.eml")
|
||||
)
|
||||
|
|
|
|||
|
|
@ -21,10 +21,17 @@ def test_exe_is_not_supported_document():
|
|||
"report.pdf", "doc.docx", "old.doc",
|
||||
"sheet.xlsx", "legacy.xls",
|
||||
"slides.pptx", "deck.ppt",
|
||||
"macro.docm", "macro.xlsm", "macro.pptm",
|
||||
"photo.png", "photo.jpg", "photo.jpeg", "scan.bmp", "scan.tiff", "scan.tif",
|
||||
"photo.webp", "anim.gif", "iphone.heic",
|
||||
"manual.rtf", "book.epub",
|
||||
"letter.odt", "data.ods", "presentation.odp",
|
||||
"korean.hwpx",
|
||||
"inbox.eml", "outlook.msg",
|
||||
"korean.hwpx", "korean.hwp",
|
||||
"template.dot", "template.dotm",
|
||||
"template.pot", "template.potx",
|
||||
"binary.xlsb", "workspace.xlw",
|
||||
"vector.svg", "signature.p7s",
|
||||
])
|
||||
def test_document_extensions_are_supported(filename):
|
||||
from app.utils.file_extensions import is_supported_document_extension
|
||||
|
|
@ -40,3 +47,70 @@ def test_non_document_extensions_are_not_supported(filename):
|
|||
from app.utils.file_extensions import is_supported_document_extension
|
||||
|
||||
assert is_supported_document_extension(filename) is False, f"{filename} should NOT be supported"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-parser extension sets
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_union_equals_all_three_sets():
|
||||
from app.utils.file_extensions import (
|
||||
DOCLING_DOCUMENT_EXTENSIONS,
|
||||
DOCUMENT_EXTENSIONS,
|
||||
LLAMAPARSE_DOCUMENT_EXTENSIONS,
|
||||
UNSTRUCTURED_DOCUMENT_EXTENSIONS,
|
||||
)
|
||||
|
||||
expected = (
|
||||
DOCLING_DOCUMENT_EXTENSIONS
|
||||
| LLAMAPARSE_DOCUMENT_EXTENSIONS
|
||||
| UNSTRUCTURED_DOCUMENT_EXTENSIONS
|
||||
)
|
||||
assert DOCUMENT_EXTENSIONS == expected
|
||||
|
||||
|
||||
def test_get_extensions_for_docling():
|
||||
from app.utils.file_extensions import get_document_extensions_for_service
|
||||
|
||||
exts = get_document_extensions_for_service("DOCLING")
|
||||
assert ".pdf" in exts
|
||||
assert ".webp" in exts
|
||||
assert ".docx" in exts
|
||||
assert ".eml" not in exts
|
||||
assert ".docm" not in exts
|
||||
assert ".gif" not in exts
|
||||
assert ".heic" not in exts
|
||||
|
||||
|
||||
def test_get_extensions_for_llamacloud():
|
||||
from app.utils.file_extensions import get_document_extensions_for_service
|
||||
|
||||
exts = get_document_extensions_for_service("LLAMACLOUD")
|
||||
assert ".docm" in exts
|
||||
assert ".gif" in exts
|
||||
assert ".svg" in exts
|
||||
assert ".hwp" in exts
|
||||
assert ".eml" not in exts
|
||||
assert ".heic" not in exts
|
||||
|
||||
|
||||
def test_get_extensions_for_unstructured():
|
||||
from app.utils.file_extensions import get_document_extensions_for_service
|
||||
|
||||
exts = get_document_extensions_for_service("UNSTRUCTURED")
|
||||
assert ".eml" in exts
|
||||
assert ".heic" in exts
|
||||
assert ".p7s" in exts
|
||||
assert ".docm" not in exts
|
||||
assert ".gif" not in exts
|
||||
assert ".svg" not in exts
|
||||
|
||||
|
||||
def test_get_extensions_for_none_returns_union():
|
||||
from app.utils.file_extensions import (
|
||||
DOCUMENT_EXTENSIONS,
|
||||
get_document_extensions_for_service,
|
||||
)
|
||||
|
||||
assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue