mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-27 09:46:25 +02:00
refactor: unify file-skipping logic across the Dropbox, Google Drive, and OneDrive connectors by replacing per-connector classification checks with a centralized check based on the configured ETL service, improving maintainability and consistency in file handling
This commit is contained in:
parent
f03bf05aaa
commit
e7beeb2a36
13 changed files with 388 additions and 67 deletions
|
|
@ -7,6 +7,11 @@ from app.connectors.dropbox.file_types import should_skip_file
|
|||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Structural skips (independent of ETL service)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_folder_item_is_skipped():
|
||||
item = {".tag": "folder", "name": "My Folder"}
|
||||
assert should_skip_file(item) is True
|
||||
|
|
@ -22,13 +27,18 @@ def test_non_downloadable_item_is_skipped():
|
|||
assert should_skip_file(item) is True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Extension-based skips (require ETL service context)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
[
|
||||
"archive.zip", "backup.tar", "data.gz", "stuff.rar", "pack.7z",
|
||||
"program.exe", "lib.dll", "module.so", "image.dmg", "disk.iso",
|
||||
"movie.mov", "clip.avi", "video.mkv", "film.wmv", "stream.flv",
|
||||
"icon.svg", "anim.gif", "photo.webp", "shot.heic", "favicon.ico",
|
||||
"favicon.ico",
|
||||
"raw.cr2", "photo.nef", "image.arw", "pic.dng",
|
||||
"design.psd", "vector.ai", "mockup.sketch", "proto.fig",
|
||||
"font.ttf", "font.otf", "font.woff", "font.woff2",
|
||||
|
|
@ -36,7 +46,8 @@ def test_non_downloadable_item_is_skipped():
|
|||
"local.db", "data.sqlite", "access.mdb",
|
||||
],
|
||||
)
|
||||
def test_non_parseable_extensions_are_skipped(filename):
|
||||
def test_non_parseable_extensions_are_skipped(filename, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is True, f"{filename} should be skipped"
|
||||
|
||||
|
|
@ -45,29 +56,61 @@ def test_non_parseable_extensions_are_skipped(filename):
|
|||
"filename",
|
||||
[
|
||||
"report.pdf", "document.docx", "sheet.xlsx", "slides.pptx",
|
||||
"old.doc", "legacy.xls", "deck.ppt",
|
||||
"readme.txt", "data.csv", "page.html", "notes.md",
|
||||
"config.json", "feed.xml",
|
||||
],
|
||||
)
|
||||
def test_parseable_documents_are_not_skipped(filename):
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
|
||||
def test_parseable_documents_are_not_skipped(filename, mocker):
|
||||
"""Files in plaintext/direct_convert/universal document sets are never skipped."""
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
["photo.jpg", "image.jpeg", "screenshot.png", "scan.bmp", "page.tiff", "doc.tif"],
|
||||
)
|
||||
def test_universal_images_are_not_skipped(filename):
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
|
||||
def test_universal_images_are_not_skipped(filename, mocker):
|
||||
"""Images supported by all parsers are never skipped."""
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
["icon.svg", "anim.gif", "photo.webp", "live.heic"],
|
||||
)
|
||||
def test_non_universal_images_are_skipped(filename):
|
||||
@pytest.mark.parametrize("filename,service,expected_skip", [
|
||||
("old.doc", "DOCLING", True),
|
||||
("old.doc", "LLAMACLOUD", False),
|
||||
("old.doc", "UNSTRUCTURED", False),
|
||||
("legacy.xls", "DOCLING", True),
|
||||
("legacy.xls", "LLAMACLOUD", False),
|
||||
("legacy.xls", "UNSTRUCTURED", False),
|
||||
("deck.ppt", "DOCLING", True),
|
||||
("deck.ppt", "LLAMACLOUD", False),
|
||||
("deck.ppt", "UNSTRUCTURED", False),
|
||||
("icon.svg", "DOCLING", True),
|
||||
("icon.svg", "LLAMACLOUD", False),
|
||||
("anim.gif", "DOCLING", True),
|
||||
("anim.gif", "LLAMACLOUD", False),
|
||||
("photo.webp", "DOCLING", False),
|
||||
("photo.webp", "LLAMACLOUD", False),
|
||||
("photo.webp", "UNSTRUCTURED", True),
|
||||
("live.heic", "DOCLING", True),
|
||||
("live.heic", "UNSTRUCTURED", False),
|
||||
("macro.docm", "DOCLING", True),
|
||||
("macro.docm", "LLAMACLOUD", False),
|
||||
("mail.eml", "DOCLING", True),
|
||||
("mail.eml", "UNSTRUCTURED", False),
|
||||
])
|
||||
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {".tag": "file", "name": filename}
|
||||
assert should_skip_file(item) is True, f"{filename} should be skipped"
|
||||
assert should_skip_file(item) is expected_skip, (
|
||||
f"{filename} with {service}: expected skip={expected_skip}"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -10,13 +10,38 @@ pytestmark = pytest.mark.unit
|
|||
@pytest.mark.parametrize("filename", [
|
||||
"malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
|
||||
])
|
||||
def test_unsupported_extensions_are_skipped(filename):
|
||||
assert should_skip_by_extension(filename) is True
|
||||
def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mocker):
|
||||
"""Truly unsupported files are skipped no matter which ETL service is configured."""
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
assert should_skip_by_extension(filename) is True
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [
|
||||
"report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
|
||||
"readme.txt", "data.csv", "photo.png", "notes.md",
|
||||
])
|
||||
def test_parseable_extensions_are_not_skipped(filename):
|
||||
assert should_skip_by_extension(filename) is False
|
||||
def test_universal_extensions_are_not_skipped(filename, mocker):
|
||||
"""Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped."""
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
assert should_skip_by_extension(filename) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename,service,expected_skip", [
|
||||
("macro.docm", "DOCLING", True),
|
||||
("macro.docm", "LLAMACLOUD", False),
|
||||
("mail.eml", "DOCLING", True),
|
||||
("mail.eml", "UNSTRUCTURED", False),
|
||||
("photo.gif", "DOCLING", True),
|
||||
("photo.gif", "LLAMACLOUD", False),
|
||||
("photo.heic", "UNSTRUCTURED", False),
|
||||
("photo.heic", "DOCLING", True),
|
||||
])
|
||||
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
assert should_skip_by_extension(filename) is expected_skip, (
|
||||
f"{filename} with {service}: expected skip={expected_skip}"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -7,6 +7,11 @@ from app.connectors.onedrive.file_types import should_skip_file
|
|||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Structural skips (independent of ETL service)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_folder_is_skipped():
|
||||
item = {"folder": {}, "name": "My Folder"}
|
||||
assert should_skip_file(item) is True
|
||||
|
|
@ -27,10 +32,16 @@ def test_onenote_is_skipped():
|
|||
assert should_skip_file(item) is True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Extension-based skips (require ETL service context)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [
|
||||
"malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
|
||||
])
|
||||
def test_unsupported_extensions_are_skipped(filename):
|
||||
def test_unsupported_extensions_are_skipped(filename, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
|
||||
assert should_skip_file(item) is True, f"{filename} should be skipped"
|
||||
|
||||
|
|
@ -39,6 +50,26 @@ def test_unsupported_extensions_are_skipped(filename):
|
|||
"report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
|
||||
"readme.txt", "data.csv", "photo.png", "notes.md",
|
||||
])
|
||||
def test_parseable_files_are_not_skipped(filename):
|
||||
def test_universal_files_are_not_skipped(filename, mocker):
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
|
||||
assert should_skip_file(item) is False, (
|
||||
f"{filename} should NOT be skipped with {service}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename,service,expected_skip", [
|
||||
("macro.docm", "DOCLING", True),
|
||||
("macro.docm", "LLAMACLOUD", False),
|
||||
("mail.eml", "DOCLING", True),
|
||||
("mail.eml", "UNSTRUCTURED", False),
|
||||
("photo.heic", "UNSTRUCTURED", False),
|
||||
("photo.heic", "DOCLING", True),
|
||||
])
|
||||
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
|
||||
assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
|
||||
assert should_skip_file(item) is expected_skip, (
|
||||
f"{filename} with {service}: expected skip={expected_skip}"
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue