refactor: implement file type classification for supported extensions across Dropbox, Google Drive, and OneDrive connectors, enhancing file handling and error management

2026-04-25 08:46:22 +02:00 · 2026-04-06 22:03:47 +05:30 · 2026-04-06 22:03:47 +05:30 · dc7047f64d
commit dc7047f64d
parent 47f4be08d9
14 changed files with 250 additions and 27 deletions
--- a/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py
+++ b/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py
@@ -0,0 +1,22 @@
+"""Tests for Google Drive file type filtering."""
+
+import pytest
+
+from app.connectors.google_drive.file_types import should_skip_by_extension
+
+pytestmark = pytest.mark.unit
+
+
+@pytest.mark.parametrize("filename", [
+    "malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
+])
+def test_unsupported_extensions_are_skipped(filename):
+    assert should_skip_by_extension(filename) is True
+
+
+@pytest.mark.parametrize("filename", [
+    "report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
+    "readme.txt", "data.csv", "photo.png", "notes.md",
+])
+def test_parseable_extensions_are_not_skipped(filename):
+    assert should_skip_by_extension(filename) is False
--- a/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py
+++ b/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py
@@ -0,0 +1,44 @@
+"""Tests for OneDrive file type filtering."""
+
+import pytest
+
+from app.connectors.onedrive.file_types import should_skip_file
+
+pytestmark = pytest.mark.unit
+
+
+def test_folder_is_skipped():
+    item = {"folder": {}, "name": "My Folder"}
+    assert should_skip_file(item) is True
+
+
+def test_remote_item_is_skipped():
+    item = {"remoteItem": {}, "name": "shared.docx"}
+    assert should_skip_file(item) is True
+
+
+def test_package_is_skipped():
+    item = {"package": {}, "name": "notebook"}
+    assert should_skip_file(item) is True
+
+
+def test_onenote_is_skipped():
+    item = {"name": "notes", "file": {"mimeType": "application/msonenote"}}
+    assert should_skip_file(item) is True
+
+
+@pytest.mark.parametrize("filename", [
+    "malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
+])
+def test_unsupported_extensions_are_skipped(filename):
+    item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
+    assert should_skip_file(item) is True, f"{filename} should be skipped"
+
+
+@pytest.mark.parametrize("filename", [
+    "report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
+    "readme.txt", "data.csv", "photo.png", "notes.md",
+])
+def test_parseable_files_are_not_skipped(filename):
+    item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
+    assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
--- a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
@@ -257,7 +257,7 @@ async def test_extract_pdf_with_llamacloud(tmp_path, mocker):


 async def test_unknown_extension_uses_document_etl(tmp_path, mocker):
-    """An unknown extension (e.g. .docx) falls through to the document ETL path."""
+    """An allowlisted document extension (.docx) routes to the document ETL path."""
    docx_file = tmp_path / "doc.docx"
    docx_file.write_bytes(b"PK fake docx")

@@ -307,3 +307,73 @@ async def test_unknown_etl_service_raises(tmp_path, mocker):
        await EtlPipelineService().extract(
            EtlRequest(file_path=str(pdf_file), filename="report.pdf")
        )
+
+
+# ---------------------------------------------------------------------------
+# Slice 13 – unsupported file types are rejected before reaching any parser
+# ---------------------------------------------------------------------------
+
+
+def test_unknown_extension_classified_as_unsupported():
+    """An unknown extension defaults to UNSUPPORTED (allowlist behaviour)."""
+    from app.etl_pipeline.file_classifier import FileCategory, classify_file
+
+    assert classify_file("random.xyz") == FileCategory.UNSUPPORTED
+
+
+@pytest.mark.parametrize("filename", [
+    "malware.exe", "archive.zip", "video.mov", "font.woff2",
+    "model.blend", "data.parquet", "package.deb", "firmware.bin",
+])
+def test_unsupported_extensions_classified_correctly(filename):
+    """Extensions not in any allowlist are classified as UNSUPPORTED."""
+    from app.etl_pipeline.file_classifier import FileCategory, classify_file
+
+    assert classify_file(filename) == FileCategory.UNSUPPORTED
+
+
+@pytest.mark.parametrize("filename,expected", [
+    ("report.pdf", "document"),
+    ("doc.docx", "document"),
+    ("slides.pptx", "document"),
+    ("sheet.xlsx", "document"),
+    ("photo.png", "document"),
+    ("photo.jpg", "document"),
+    ("book.epub", "document"),
+    ("letter.odt", "document"),
+    ("readme.md", "plaintext"),
+    ("data.csv", "direct_convert"),
+])
+def test_parseable_extensions_classified_correctly(filename, expected):
+    """Parseable files are classified into their correct category."""
+    from app.etl_pipeline.file_classifier import FileCategory, classify_file
+
+    result = classify_file(filename)
+    assert result != FileCategory.UNSUPPORTED
+    assert result.value == expected
+
+
+async def test_extract_unsupported_file_raises_error(tmp_path):
+    """EtlPipelineService.extract() raises EtlUnsupportedFileError for .exe files."""
+    from app.etl_pipeline.exceptions import EtlUnsupportedFileError
+
+    exe_file = tmp_path / "program.exe"
+    exe_file.write_bytes(b"\x00" * 10)
+
+    with pytest.raises(EtlUnsupportedFileError, match="not supported"):
+        await EtlPipelineService().extract(
+            EtlRequest(file_path=str(exe_file), filename="program.exe")
+        )
+
+
+async def test_extract_zip_raises_unsupported_error(tmp_path):
+    """EtlPipelineService.extract() raises EtlUnsupportedFileError for .zip archives."""
+    from app.etl_pipeline.exceptions import EtlUnsupportedFileError
+
+    zip_file = tmp_path / "archive.zip"
+    zip_file.write_bytes(b"PK\x03\x04")
+
+    with pytest.raises(EtlUnsupportedFileError, match="not supported"):
+        await EtlPipelineService().extract(
+            EtlRequest(file_path=str(zip_file), filename="archive.zip")
+        )
--- a/surfsense_backend/tests/unit/utils/__init__.py
+++ b/surfsense_backend/tests/unit/utils/__init__.py
--- a/surfsense_backend/tests/unit/utils/test_file_extensions.py
+++ b/surfsense_backend/tests/unit/utils/test_file_extensions.py
@@ -0,0 +1,42 @@
+"""Tests for the DOCUMENT_EXTENSIONS allowlist module."""
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+def test_pdf_is_supported_document():
+    from app.utils.file_extensions import is_supported_document_extension
+
+    assert is_supported_document_extension("report.pdf") is True
+
+
+def test_exe_is_not_supported_document():
+    from app.utils.file_extensions import is_supported_document_extension
+
+    assert is_supported_document_extension("malware.exe") is False
+
+
+@pytest.mark.parametrize("filename", [
+    "report.pdf", "doc.docx", "old.doc",
+    "sheet.xlsx", "legacy.xls",
+    "slides.pptx", "deck.ppt",
+    "photo.png", "photo.jpg", "photo.jpeg", "scan.bmp", "scan.tiff", "scan.tif",
+    "manual.rtf", "book.epub",
+    "letter.odt", "data.ods", "presentation.odp",
+    "korean.hwpx",
+])
+def test_document_extensions_are_supported(filename):
+    from app.utils.file_extensions import is_supported_document_extension
+
+    assert is_supported_document_extension(filename) is True, f"{filename} should be supported"
+
+
+@pytest.mark.parametrize("filename", [
+    "malware.exe", "archive.zip", "video.mov", "font.woff2",
+    "model.blend", "random.xyz", "data.parquet", "package.deb",
+])
+def test_non_document_extensions_are_not_supported(filename):
+    from app.utils.file_extensions import is_supported_document_extension
+
+    assert is_supported_document_extension(filename) is False, f"{filename} should NOT be supported"