mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 08:46:22 +02:00
refactor: unify file skipping logic across Dropbox, Google Drive, and OneDrive connectors by replacing classification checks with a centralized service-based approach, enhancing maintainability and consistency in file handling
This commit is contained in:
parent
f03bf05aaa
commit
e7beeb2a36
13 changed files with 388 additions and 67 deletions
|
|
@ -21,10 +21,17 @@ def test_exe_is_not_supported_document():
|
|||
"report.pdf", "doc.docx", "old.doc",
|
||||
"sheet.xlsx", "legacy.xls",
|
||||
"slides.pptx", "deck.ppt",
|
||||
"macro.docm", "macro.xlsm", "macro.pptm",
|
||||
"photo.png", "photo.jpg", "photo.jpeg", "scan.bmp", "scan.tiff", "scan.tif",
|
||||
"photo.webp", "anim.gif", "iphone.heic",
|
||||
"manual.rtf", "book.epub",
|
||||
"letter.odt", "data.ods", "presentation.odp",
|
||||
"korean.hwpx",
|
||||
"inbox.eml", "outlook.msg",
|
||||
"korean.hwpx", "korean.hwp",
|
||||
"template.dot", "template.dotm",
|
||||
"template.pot", "template.potx",
|
||||
"binary.xlsb", "workspace.xlw",
|
||||
"vector.svg", "signature.p7s",
|
||||
])
|
||||
def test_document_extensions_are_supported(filename):
|
||||
from app.utils.file_extensions import is_supported_document_extension
|
||||
|
|
@ -40,3 +47,70 @@ def test_non_document_extensions_are_not_supported(filename):
|
|||
from app.utils.file_extensions import is_supported_document_extension
|
||||
|
||||
assert is_supported_document_extension(filename) is False, f"{filename} should NOT be supported"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-parser extension sets
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_union_equals_all_three_sets():
    """DOCUMENT_EXTENSIONS must equal the union of the three per-parser sets."""
    from app.utils.file_extensions import (
        DOCLING_DOCUMENT_EXTENSIONS,
        DOCUMENT_EXTENSIONS,
        LLAMAPARSE_DOCUMENT_EXTENSIONS,
        UNSTRUCTURED_DOCUMENT_EXTENSIONS,
    )

    # Build the union step by step, in the same left-to-right order.
    combined = DOCLING_DOCUMENT_EXTENSIONS | LLAMAPARSE_DOCUMENT_EXTENSIONS
    combined = combined | UNSTRUCTURED_DOCUMENT_EXTENSIONS

    assert DOCUMENT_EXTENSIONS == combined
|
||||
|
||||
|
||||
def test_get_extensions_for_docling():
    """Check which extensions the "DOCLING" service lookup includes/excludes."""
    from app.utils.file_extensions import get_document_extensions_for_service

    docling_exts = get_document_extensions_for_service("DOCLING")

    # Extensions that must be reported for this service.
    for included in (".pdf", ".webp", ".docx"):
        assert included in docling_exts

    # Extensions that must be absent for this service.
    for excluded in (".eml", ".docm", ".gif", ".heic"):
        assert excluded not in docling_exts
|
||||
|
||||
|
||||
def test_get_extensions_for_llamacloud():
    """Check which extensions the "LLAMACLOUD" service lookup includes/excludes."""
    from app.utils.file_extensions import get_document_extensions_for_service

    llamacloud_exts = get_document_extensions_for_service("LLAMACLOUD")

    # Extensions that must be reported for this service.
    for included in (".docm", ".gif", ".svg", ".hwp"):
        assert included in llamacloud_exts

    # Extensions that must be absent for this service.
    for excluded in (".eml", ".heic"):
        assert excluded not in llamacloud_exts
|
||||
|
||||
|
||||
def test_get_extensions_for_unstructured():
    """Check which extensions the "UNSTRUCTURED" service lookup includes/excludes."""
    from app.utils.file_extensions import get_document_extensions_for_service

    unstructured_exts = get_document_extensions_for_service("UNSTRUCTURED")

    # Extensions that must be reported for this service.
    for included in (".eml", ".heic", ".p7s"):
        assert included in unstructured_exts

    # Extensions that must be absent for this service.
    for excluded in (".docm", ".gif", ".svg"):
        assert excluded not in unstructured_exts
|
||||
|
||||
|
||||
def test_get_extensions_for_none_returns_union():
    """Passing None as the service must yield the full DOCUMENT_EXTENSIONS value."""
    from app.utils.file_extensions import (
        DOCUMENT_EXTENSIONS,
        get_document_extensions_for_service,
    )

    exts_without_service = get_document_extensions_for_service(None)

    assert exts_without_service == DOCUMENT_EXTENSIONS
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue