mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-27 09:46:25 +02:00
chore: ran linting
This commit is contained in:
parent
5803fe79da
commit
0a26a6c5bb
54 changed files with 1015 additions and 672 deletions
|
|
@ -6,7 +6,6 @@ real so we know the full path from "cloud gives us bytes" to "we get markdown
|
|||
back" actually works.
|
||||
"""
|
||||
|
||||
import os
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
|
@ -21,6 +20,7 @@ _CSV_CONTENT = "name,age\nAlice,30\nBob,25\n"
|
|||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def _write_file(dest_path: str, content: str) -> None:
|
||||
"""Simulate a cloud client writing downloaded bytes to disk."""
|
||||
with open(dest_path, "w", encoding="utf-8") as f:
|
||||
|
|
@ -43,8 +43,8 @@ def _make_download_side_effect(content: str):
|
|||
# Google Drive
|
||||
# ===================================================================
|
||||
|
||||
class TestGoogleDriveContentExtraction:
|
||||
|
||||
class TestGoogleDriveContentExtraction:
|
||||
async def test_txt_file_returns_markdown(self):
|
||||
from app.connectors.google_drive.content_extractor import (
|
||||
download_and_extract_content,
|
||||
|
|
@ -76,7 +76,7 @@ class TestGoogleDriveContentExtraction:
|
|||
|
||||
file = {"id": "f2", "name": "data.csv", "mimeType": "text/csv"}
|
||||
|
||||
markdown, metadata, error = await download_and_extract_content(client, file)
|
||||
markdown, _metadata, error = await download_and_extract_content(client, file)
|
||||
|
||||
assert error is None
|
||||
assert "Alice" in markdown
|
||||
|
|
@ -93,7 +93,7 @@ class TestGoogleDriveContentExtraction:
|
|||
|
||||
file = {"id": "f3", "name": "doc.txt", "mimeType": "text/plain"}
|
||||
|
||||
markdown, metadata, error = await download_and_extract_content(client, file)
|
||||
markdown, _metadata, error = await download_and_extract_content(client, file)
|
||||
|
||||
assert markdown is None
|
||||
assert error == "Network timeout"
|
||||
|
|
@ -103,8 +103,8 @@ class TestGoogleDriveContentExtraction:
|
|||
# OneDrive
|
||||
# ===================================================================
|
||||
|
||||
class TestOneDriveContentExtraction:
|
||||
|
||||
class TestOneDriveContentExtraction:
|
||||
async def test_txt_file_returns_markdown(self):
|
||||
from app.connectors.onedrive.content_extractor import (
|
||||
download_and_extract_content,
|
||||
|
|
@ -144,7 +144,7 @@ class TestOneDriveContentExtraction:
|
|||
"file": {"mimeType": "text/csv"},
|
||||
}
|
||||
|
||||
markdown, metadata, error = await download_and_extract_content(client, file)
|
||||
markdown, _metadata, error = await download_and_extract_content(client, file)
|
||||
|
||||
assert error is None
|
||||
assert "Alice" in markdown
|
||||
|
|
@ -164,7 +164,7 @@ class TestOneDriveContentExtraction:
|
|||
"file": {"mimeType": "text/plain"},
|
||||
}
|
||||
|
||||
markdown, metadata, error = await download_and_extract_content(client, file)
|
||||
markdown, _metadata, error = await download_and_extract_content(client, file)
|
||||
|
||||
assert markdown is None
|
||||
assert error == "403 Forbidden"
|
||||
|
|
@ -174,8 +174,8 @@ class TestOneDriveContentExtraction:
|
|||
# Dropbox
|
||||
# ===================================================================
|
||||
|
||||
class TestDropboxContentExtraction:
|
||||
|
||||
class TestDropboxContentExtraction:
|
||||
async def test_txt_file_returns_markdown(self):
|
||||
from app.connectors.dropbox.content_extractor import (
|
||||
download_and_extract_content,
|
||||
|
|
@ -217,7 +217,7 @@ class TestDropboxContentExtraction:
|
|||
"path_lower": "/data.csv",
|
||||
}
|
||||
|
||||
markdown, metadata, error = await download_and_extract_content(client, file)
|
||||
markdown, _metadata, error = await download_and_extract_content(client, file)
|
||||
|
||||
assert error is None
|
||||
assert "Alice" in markdown
|
||||
|
|
@ -238,7 +238,7 @@ class TestDropboxContentExtraction:
|
|||
"path_lower": "/big.txt",
|
||||
}
|
||||
|
||||
markdown, metadata, error = await download_and_extract_content(client, file)
|
||||
markdown, _metadata, error = await download_and_extract_content(client, file)
|
||||
|
||||
assert markdown is None
|
||||
assert error == "Rate limited"
|
||||
|
|
|
|||
|
|
@ -265,6 +265,7 @@ def full_scan_mocks(mock_dropbox_client, monkeypatch):
|
|||
|
||||
async def _fake_skip(session, file, search_space_id):
|
||||
from app.connectors.dropbox.file_types import should_skip_file as _skip
|
||||
|
||||
item_skip, unsup_ext = _skip(file)
|
||||
if item_skip:
|
||||
if unsup_ext:
|
||||
|
|
@ -468,7 +469,11 @@ async def test_selected_files_fetch_failure_isolation(selected_files_mocks):
|
|||
|
||||
indexed, skipped, _unsupported, errors = await _run_selected(
|
||||
selected_files_mocks,
|
||||
[("/first.txt", "first.txt"), ("/mid.txt", "mid.txt"), ("/third.txt", "third.txt")],
|
||||
[
|
||||
("/first.txt", "first.txt"),
|
||||
("/mid.txt", "mid.txt"),
|
||||
("/third.txt", "third.txt"),
|
||||
],
|
||||
)
|
||||
|
||||
assert indexed == 2
|
||||
|
|
@ -526,8 +531,18 @@ async def test_delta_sync_deletions_call_remove_document(monkeypatch):
|
|||
import app.tasks.connector_indexers.dropbox_indexer as _mod
|
||||
|
||||
entries = [
|
||||
{".tag": "deleted", "name": "gone.txt", "path_lower": "/gone.txt", "id": "id:del1"},
|
||||
{".tag": "deleted", "name": "also_gone.pdf", "path_lower": "/also_gone.pdf", "id": "id:del2"},
|
||||
{
|
||||
".tag": "deleted",
|
||||
"name": "gone.txt",
|
||||
"path_lower": "/gone.txt",
|
||||
"id": "id:del1",
|
||||
},
|
||||
{
|
||||
".tag": "deleted",
|
||||
"name": "also_gone.pdf",
|
||||
"path_lower": "/also_gone.pdf",
|
||||
"id": "id:del2",
|
||||
},
|
||||
]
|
||||
|
||||
mock_client = MagicMock()
|
||||
|
|
@ -544,7 +559,7 @@ async def test_delta_sync_deletions_call_remove_document(monkeypatch):
|
|||
mock_task_logger = MagicMock()
|
||||
mock_task_logger.log_task_progress = AsyncMock()
|
||||
|
||||
indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
|
||||
_indexed, _skipped, _unsupported, cursor = await _index_with_delta_sync(
|
||||
mock_client,
|
||||
AsyncMock(),
|
||||
_CONNECTOR_ID,
|
||||
|
|
@ -573,7 +588,9 @@ async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch):
|
|||
mock_client = MagicMock()
|
||||
mock_client.get_changes = AsyncMock(return_value=(entries, "cursor-v2", None))
|
||||
|
||||
monkeypatch.setattr(_mod, "_should_skip_file", AsyncMock(return_value=(False, None)))
|
||||
monkeypatch.setattr(
|
||||
_mod, "_should_skip_file", AsyncMock(return_value=(False, None))
|
||||
)
|
||||
|
||||
download_mock = AsyncMock(return_value=(2, 0))
|
||||
monkeypatch.setattr(_mod, "_download_and_index", download_mock)
|
||||
|
|
@ -581,7 +598,7 @@ async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch):
|
|||
mock_task_logger = MagicMock()
|
||||
mock_task_logger.log_task_progress = AsyncMock()
|
||||
|
||||
indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
|
||||
indexed, skipped, _unsupported, cursor = await _index_with_delta_sync(
|
||||
mock_client,
|
||||
AsyncMock(),
|
||||
_CONNECTOR_ID,
|
||||
|
|
@ -608,8 +625,18 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch):
|
|||
import app.tasks.connector_indexers.dropbox_indexer as _mod
|
||||
|
||||
entries = [
|
||||
{".tag": "deleted", "name": "removed.txt", "path_lower": "/removed.txt", "id": "id:del1"},
|
||||
{".tag": "deleted", "name": "trashed.pdf", "path_lower": "/trashed.pdf", "id": "id:del2"},
|
||||
{
|
||||
".tag": "deleted",
|
||||
"name": "removed.txt",
|
||||
"path_lower": "/removed.txt",
|
||||
"id": "id:del1",
|
||||
},
|
||||
{
|
||||
".tag": "deleted",
|
||||
"name": "trashed.pdf",
|
||||
"path_lower": "/trashed.pdf",
|
||||
"id": "id:del2",
|
||||
},
|
||||
_make_file_dict("mod1", "updated.txt"),
|
||||
_make_file_dict("new1", "brandnew.docx"),
|
||||
]
|
||||
|
|
@ -623,7 +650,9 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch):
|
|||
remove_calls.append(file_id)
|
||||
|
||||
monkeypatch.setattr(_mod, "_remove_document", _fake_remove)
|
||||
monkeypatch.setattr(_mod, "_should_skip_file", AsyncMock(return_value=(False, None)))
|
||||
monkeypatch.setattr(
|
||||
_mod, "_should_skip_file", AsyncMock(return_value=(False, None))
|
||||
)
|
||||
|
||||
download_mock = AsyncMock(return_value=(2, 0))
|
||||
monkeypatch.setattr(_mod, "_download_and_index", download_mock)
|
||||
|
|
@ -631,7 +660,7 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch):
|
|||
mock_task_logger = MagicMock()
|
||||
mock_task_logger.log_task_progress = AsyncMock()
|
||||
|
||||
indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
|
||||
indexed, skipped, _unsupported, cursor = await _index_with_delta_sync(
|
||||
mock_client,
|
||||
AsyncMock(),
|
||||
_CONNECTOR_ID,
|
||||
|
|
@ -665,7 +694,7 @@ async def test_delta_sync_returns_new_cursor(monkeypatch):
|
|||
mock_task_logger = MagicMock()
|
||||
mock_task_logger.log_task_progress = AsyncMock()
|
||||
|
||||
indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
|
||||
indexed, skipped, _unsupported, cursor = await _index_with_delta_sync(
|
||||
mock_client,
|
||||
AsyncMock(),
|
||||
_CONNECTOR_ID,
|
||||
|
|
@ -723,9 +752,7 @@ def orchestrator_mocks(monkeypatch):
|
|||
|
||||
mock_client = MagicMock()
|
||||
mock_client.get_latest_cursor = AsyncMock(return_value=("latest-cursor-abc", None))
|
||||
monkeypatch.setattr(
|
||||
_mod, "DropboxClient", MagicMock(return_value=mock_client)
|
||||
)
|
||||
monkeypatch.setattr(_mod, "DropboxClient", MagicMock(return_value=mock_client))
|
||||
|
||||
return {
|
||||
"connector": mock_connector,
|
||||
|
|
@ -751,7 +778,7 @@ async def test_orchestrator_uses_delta_sync_when_cursor_and_last_indexed(
|
|||
mock_session = AsyncMock()
|
||||
mock_session.commit = AsyncMock()
|
||||
|
||||
indexed, skipped, error, _unsupported = await index_dropbox_files(
|
||||
_indexed, _skipped, error, _unsupported = await index_dropbox_files(
|
||||
mock_session,
|
||||
_CONNECTOR_ID,
|
||||
_SEARCH_SPACE_ID,
|
||||
|
|
@ -779,7 +806,7 @@ async def test_orchestrator_falls_back_to_full_scan_without_cursor(
|
|||
mock_session = AsyncMock()
|
||||
mock_session.commit = AsyncMock()
|
||||
|
||||
indexed, skipped, error, _unsupported = await index_dropbox_files(
|
||||
_indexed, _skipped, error, _unsupported = await index_dropbox_files(
|
||||
mock_session,
|
||||
_CONNECTOR_ID,
|
||||
_SEARCH_SPACE_ID,
|
||||
|
|
|
|||
|
|
@ -366,7 +366,7 @@ async def test_full_scan_three_phase_counts(full_scan_mocks, monkeypatch):
|
|||
full_scan_mocks["download_mock"].return_value = (mock_docs, 0)
|
||||
full_scan_mocks["batch_mock"].return_value = ([], 2, 0)
|
||||
|
||||
indexed, skipped, unsupported = await _run_full_scan(full_scan_mocks)
|
||||
indexed, skipped, _unsupported = await _run_full_scan(full_scan_mocks)
|
||||
|
||||
assert indexed == 3 # 1 renamed + 2 from batch
|
||||
assert skipped == 1 # 1 unchanged
|
||||
|
|
@ -497,7 +497,7 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch):
|
|||
mock_task_logger = MagicMock()
|
||||
mock_task_logger.log_task_progress = AsyncMock()
|
||||
|
||||
indexed, skipped, unsupported = await _index_with_delta_sync(
|
||||
indexed, skipped, _unsupported = await _index_with_delta_sync(
|
||||
MagicMock(),
|
||||
mock_session,
|
||||
MagicMock(),
|
||||
|
|
@ -589,7 +589,7 @@ async def test_selected_files_single_file_indexed(selected_files_mocks):
|
|||
)
|
||||
selected_files_mocks["download_and_index_mock"].return_value = (1, 0)
|
||||
|
||||
indexed, skipped, unsup, errors = await _run_selected(
|
||||
indexed, skipped, _unsup, errors = await _run_selected(
|
||||
selected_files_mocks,
|
||||
[("f1", "report.pdf")],
|
||||
)
|
||||
|
|
@ -613,7 +613,7 @@ async def test_selected_files_fetch_failure_isolation(selected_files_mocks):
|
|||
)
|
||||
selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
|
||||
|
||||
indexed, skipped, unsup, errors = await _run_selected(
|
||||
indexed, skipped, _unsup, errors = await _run_selected(
|
||||
selected_files_mocks,
|
||||
[("f1", "first.txt"), ("f2", "mid.txt"), ("f3", "third.txt")],
|
||||
)
|
||||
|
|
@ -647,7 +647,7 @@ async def test_selected_files_skip_rename_counting(selected_files_mocks):
|
|||
|
||||
selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
|
||||
|
||||
indexed, skipped, unsup, errors = await _run_selected(
|
||||
indexed, skipped, _unsup, errors = await _run_selected(
|
||||
selected_files_mocks,
|
||||
[
|
||||
("s1", "unchanged.txt"),
|
||||
|
|
|
|||
|
|
@ -219,7 +219,9 @@ async def test_gdrive_files_exceeding_quota_rejected(gdrive_selected_mocks):
|
|||
None,
|
||||
)
|
||||
|
||||
indexed, _skipped, _unsup, errors = await _run_gdrive_selected(m, [("big", "huge.pdf")])
|
||||
indexed, _skipped, _unsup, errors = await _run_gdrive_selected(
|
||||
m, [("big", "huge.pdf")]
|
||||
)
|
||||
|
||||
assert indexed == 0
|
||||
assert len(errors) == 1
|
||||
|
|
@ -552,7 +554,9 @@ async def test_onedrive_over_quota_rejected(onedrive_selected_mocks):
|
|||
None,
|
||||
)
|
||||
|
||||
indexed, _skipped, _unsup, errors = await _run_onedrive_selected(m, [("big", "huge.pdf")])
|
||||
indexed, _skipped, _unsup, errors = await _run_onedrive_selected(
|
||||
m, [("big", "huge.pdf")]
|
||||
)
|
||||
|
||||
assert indexed == 0
|
||||
assert len(errors) == 1
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ def _make_client() -> DropboxClient:
|
|||
|
||||
# ---------- C1: get_latest_cursor ----------
|
||||
|
||||
|
||||
async def test_get_latest_cursor_returns_cursor_string(monkeypatch):
|
||||
client = _make_client()
|
||||
|
||||
|
|
@ -34,12 +35,17 @@ async def test_get_latest_cursor_returns_cursor_string(monkeypatch):
|
|||
assert error is None
|
||||
client._request.assert_called_once_with(
|
||||
"/2/files/list_folder/get_latest_cursor",
|
||||
{"path": "/my-folder", "recursive": False, "include_non_downloadable_files": True},
|
||||
{
|
||||
"path": "/my-folder",
|
||||
"recursive": False,
|
||||
"include_non_downloadable_files": True,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
# ---------- C2: get_changes returns entries and new cursor ----------
|
||||
|
||||
|
||||
async def test_get_changes_returns_entries_and_cursor(monkeypatch):
|
||||
client = _make_client()
|
||||
|
||||
|
|
@ -66,6 +72,7 @@ async def test_get_changes_returns_entries_and_cursor(monkeypatch):
|
|||
|
||||
# ---------- C3: get_changes handles pagination ----------
|
||||
|
||||
|
||||
async def test_get_changes_handles_pagination(monkeypatch):
|
||||
client = _make_client()
|
||||
|
||||
|
|
@ -98,6 +105,7 @@ async def test_get_changes_handles_pagination(monkeypatch):
|
|||
|
||||
# ---------- C4: get_changes raises on 401 ----------
|
||||
|
||||
|
||||
async def test_get_changes_returns_error_on_401(monkeypatch):
|
||||
client = _make_client()
|
||||
|
||||
|
|
|
|||
|
|
@ -41,15 +41,40 @@ def test_non_downloadable_item_is_skipped():
|
|||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
[
|
||||
"archive.zip", "backup.tar", "data.gz", "stuff.rar", "pack.7z",
|
||||
"program.exe", "lib.dll", "module.so", "image.dmg", "disk.iso",
|
||||
"movie.mov", "clip.avi", "video.mkv", "film.wmv", "stream.flv",
|
||||
"archive.zip",
|
||||
"backup.tar",
|
||||
"data.gz",
|
||||
"stuff.rar",
|
||||
"pack.7z",
|
||||
"program.exe",
|
||||
"lib.dll",
|
||||
"module.so",
|
||||
"image.dmg",
|
||||
"disk.iso",
|
||||
"movie.mov",
|
||||
"clip.avi",
|
||||
"video.mkv",
|
||||
"film.wmv",
|
||||
"stream.flv",
|
||||
"favicon.ico",
|
||||
"raw.cr2", "photo.nef", "image.arw", "pic.dng",
|
||||
"design.psd", "vector.ai", "mockup.sketch", "proto.fig",
|
||||
"font.ttf", "font.otf", "font.woff", "font.woff2",
|
||||
"model.stl", "scene.fbx", "mesh.blend",
|
||||
"local.db", "data.sqlite", "access.mdb",
|
||||
"raw.cr2",
|
||||
"photo.nef",
|
||||
"image.arw",
|
||||
"pic.dng",
|
||||
"design.psd",
|
||||
"vector.ai",
|
||||
"mockup.sketch",
|
||||
"proto.fig",
|
||||
"font.ttf",
|
||||
"font.otf",
|
||||
"font.woff",
|
||||
"font.woff2",
|
||||
"model.stl",
|
||||
"scene.fbx",
|
||||
"mesh.blend",
|
||||
"local.db",
|
||||
"data.sqlite",
|
||||
"access.mdb",
|
||||
],
|
||||
)
|
||||
def test_non_parseable_extensions_are_skipped(filename, mocker):
|
||||
|
|
@ -63,9 +88,16 @@ def test_non_parseable_extensions_are_skipped(filename, mocker):
|
|||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
[
|
||||
"report.pdf", "document.docx", "sheet.xlsx", "slides.pptx",
|
||||
"readme.txt", "data.csv", "page.html", "notes.md",
|
||||
"config.json", "feed.xml",
|
||||
"report.pdf",
|
||||
"document.docx",
|
||||
"sheet.xlsx",
|
||||
"slides.pptx",
|
||||
"readme.txt",
|
||||
"data.csv",
|
||||
"page.html",
|
||||
"notes.md",
|
||||
"config.json",
|
||||
"feed.xml",
|
||||
],
|
||||
)
|
||||
def test_parseable_documents_are_not_skipped(filename, mocker):
|
||||
|
|
@ -92,30 +124,33 @@ def test_universal_images_are_not_skipped(filename, mocker):
|
|||
assert ext is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename,service,expected_skip", [
|
||||
("old.doc", "DOCLING", True),
|
||||
("old.doc", "LLAMACLOUD", False),
|
||||
("old.doc", "UNSTRUCTURED", False),
|
||||
("legacy.xls", "DOCLING", True),
|
||||
("legacy.xls", "LLAMACLOUD", False),
|
||||
("legacy.xls", "UNSTRUCTURED", False),
|
||||
("deck.ppt", "DOCLING", True),
|
||||
("deck.ppt", "LLAMACLOUD", False),
|
||||
("deck.ppt", "UNSTRUCTURED", False),
|
||||
("icon.svg", "DOCLING", True),
|
||||
("icon.svg", "LLAMACLOUD", False),
|
||||
("anim.gif", "DOCLING", True),
|
||||
("anim.gif", "LLAMACLOUD", False),
|
||||
("photo.webp", "DOCLING", False),
|
||||
("photo.webp", "LLAMACLOUD", False),
|
||||
("photo.webp", "UNSTRUCTURED", True),
|
||||
("live.heic", "DOCLING", True),
|
||||
("live.heic", "UNSTRUCTURED", False),
|
||||
("macro.docm", "DOCLING", True),
|
||||
("macro.docm", "LLAMACLOUD", False),
|
||||
("mail.eml", "DOCLING", True),
|
||||
("mail.eml", "UNSTRUCTURED", False),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"filename,service,expected_skip",
|
||||
[
|
||||
("old.doc", "DOCLING", True),
|
||||
("old.doc", "LLAMACLOUD", False),
|
||||
("old.doc", "UNSTRUCTURED", False),
|
||||
("legacy.xls", "DOCLING", True),
|
||||
("legacy.xls", "LLAMACLOUD", False),
|
||||
("legacy.xls", "UNSTRUCTURED", False),
|
||||
("deck.ppt", "DOCLING", True),
|
||||
("deck.ppt", "LLAMACLOUD", False),
|
||||
("deck.ppt", "UNSTRUCTURED", False),
|
||||
("icon.svg", "DOCLING", True),
|
||||
("icon.svg", "LLAMACLOUD", False),
|
||||
("anim.gif", "DOCLING", True),
|
||||
("anim.gif", "LLAMACLOUD", False),
|
||||
("photo.webp", "DOCLING", False),
|
||||
("photo.webp", "LLAMACLOUD", False),
|
||||
("photo.webp", "UNSTRUCTURED", True),
|
||||
("live.heic", "DOCLING", True),
|
||||
("live.heic", "UNSTRUCTURED", False),
|
||||
("macro.docm", "DOCLING", True),
|
||||
("macro.docm", "LLAMACLOUD", False),
|
||||
("mail.eml", "DOCLING", True),
|
||||
("mail.eml", "UNSTRUCTURED", False),
|
||||
],
|
||||
)
|
||||
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {".tag": "file", "name": filename}
|
||||
|
|
|
|||
|
|
@ -7,21 +7,37 @@ from app.connectors.google_drive.file_types import should_skip_by_extension
|
|||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [
|
||||
"malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
[
|
||||
"malware.exe",
|
||||
"archive.zip",
|
||||
"video.mov",
|
||||
"font.woff2",
|
||||
"model.blend",
|
||||
],
|
||||
)
|
||||
def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mocker):
|
||||
"""Truly unsupported files are skipped no matter which ETL service is configured."""
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
skip, ext = should_skip_by_extension(filename)
|
||||
skip, _ext = should_skip_by_extension(filename)
|
||||
assert skip is True
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [
|
||||
"report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
|
||||
"readme.txt", "data.csv", "photo.png", "notes.md",
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
[
|
||||
"report.pdf",
|
||||
"doc.docx",
|
||||
"sheet.xlsx",
|
||||
"slides.pptx",
|
||||
"readme.txt",
|
||||
"data.csv",
|
||||
"photo.png",
|
||||
"notes.md",
|
||||
],
|
||||
)
|
||||
def test_universal_extensions_are_not_skipped(filename, mocker):
|
||||
"""Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped."""
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
|
|
@ -31,16 +47,19 @@ def test_universal_extensions_are_not_skipped(filename, mocker):
|
|||
assert ext is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename,service,expected_skip", [
|
||||
("macro.docm", "DOCLING", True),
|
||||
("macro.docm", "LLAMACLOUD", False),
|
||||
("mail.eml", "DOCLING", True),
|
||||
("mail.eml", "UNSTRUCTURED", False),
|
||||
("photo.gif", "DOCLING", True),
|
||||
("photo.gif", "LLAMACLOUD", False),
|
||||
("photo.heic", "UNSTRUCTURED", False),
|
||||
("photo.heic", "DOCLING", True),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"filename,service,expected_skip",
|
||||
[
|
||||
("macro.docm", "DOCLING", True),
|
||||
("macro.docm", "LLAMACLOUD", False),
|
||||
("mail.eml", "DOCLING", True),
|
||||
("mail.eml", "UNSTRUCTURED", False),
|
||||
("photo.gif", "DOCLING", True),
|
||||
("photo.gif", "LLAMACLOUD", False),
|
||||
("photo.heic", "UNSTRUCTURED", False),
|
||||
("photo.heic", "DOCLING", True),
|
||||
],
|
||||
)
|
||||
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
skip, ext = should_skip_by_extension(filename)
|
||||
|
|
|
|||
|
|
@ -45,9 +45,16 @@ def test_onenote_is_skipped():
|
|||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [
|
||||
"malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
[
|
||||
"malware.exe",
|
||||
"archive.zip",
|
||||
"video.mov",
|
||||
"font.woff2",
|
||||
"model.blend",
|
||||
],
|
||||
)
|
||||
def test_unsupported_extensions_are_skipped(filename, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
|
||||
|
|
@ -56,10 +63,19 @@ def test_unsupported_extensions_are_skipped(filename, mocker):
|
|||
assert ext is not None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [
|
||||
"report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
|
||||
"readme.txt", "data.csv", "photo.png", "notes.md",
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
[
|
||||
"report.pdf",
|
||||
"doc.docx",
|
||||
"sheet.xlsx",
|
||||
"slides.pptx",
|
||||
"readme.txt",
|
||||
"data.csv",
|
||||
"photo.png",
|
||||
"notes.md",
|
||||
],
|
||||
)
|
||||
def test_universal_files_are_not_skipped(filename, mocker):
|
||||
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
|
|
@ -69,14 +85,17 @@ def test_universal_files_are_not_skipped(filename, mocker):
|
|||
assert ext is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename,service,expected_skip", [
|
||||
("macro.docm", "DOCLING", True),
|
||||
("macro.docm", "LLAMACLOUD", False),
|
||||
("mail.eml", "DOCLING", True),
|
||||
("mail.eml", "UNSTRUCTURED", False),
|
||||
("photo.heic", "UNSTRUCTURED", False),
|
||||
("photo.heic", "DOCLING", True),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"filename,service,expected_skip",
|
||||
[
|
||||
("macro.docm", "DOCLING", True),
|
||||
("macro.docm", "LLAMACLOUD", False),
|
||||
("mail.eml", "DOCLING", True),
|
||||
("mail.eml", "UNSTRUCTURED", False),
|
||||
("photo.heic", "UNSTRUCTURED", False),
|
||||
("photo.heic", "DOCLING", True),
|
||||
],
|
||||
)
|
||||
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
|
||||
mocker.patch("app.config.config.ETL_SERVICE", service)
|
||||
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
|
||||
|
|
|
|||
|
|
@ -24,6 +24,4 @@ def _stub_package(dotted: str, fs_dir: Path) -> None:
|
|||
|
||||
_stub_package("app", _BACKEND / "app")
|
||||
_stub_package("app.etl_pipeline", _BACKEND / "app" / "etl_pipeline")
|
||||
_stub_package(
|
||||
"app.etl_pipeline.parsers", _BACKEND / "app" / "etl_pipeline" / "parsers"
|
||||
)
|
||||
_stub_package("app.etl_pipeline.parsers", _BACKEND / "app" / "etl_pipeline" / "parsers")
|
||||
|
|
|
|||
|
|
@ -144,7 +144,7 @@ async def test_extract_mp3_returns_transcription(tmp_path, mocker):
|
|||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Slice 7 – DOCLING document parsing
|
||||
# Slice 7 - DOCLING document parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
|
@ -172,7 +172,7 @@ async def test_extract_pdf_with_docling(tmp_path, mocker):
|
|||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Slice 8 – UNSTRUCTURED document parsing
|
||||
# Slice 8 - UNSTRUCTURED document parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
|
@ -208,7 +208,7 @@ async def test_extract_pdf_with_unstructured(tmp_path, mocker):
|
|||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Slice 9 – LLAMACLOUD document parsing
|
||||
# Slice 9 - LLAMACLOUD document parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
|
@ -241,9 +241,7 @@ async def test_extract_pdf_with_llamacloud(tmp_path, mocker):
|
|||
)
|
||||
|
||||
result = await EtlPipelineService().extract(
|
||||
EtlRequest(
|
||||
file_path=str(pdf_file), filename="report.pdf", estimated_pages=5
|
||||
)
|
||||
EtlRequest(file_path=str(pdf_file), filename="report.pdf", estimated_pages=5)
|
||||
)
|
||||
|
||||
assert result.markdown_content == "# LlamaCloud parsed"
|
||||
|
|
@ -252,7 +250,7 @@ async def test_extract_pdf_with_llamacloud(tmp_path, mocker):
|
|||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Slice 10 – unknown extension falls through to document ETL
|
||||
# Slice 10 - unknown extension falls through to document ETL
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
|
@ -279,18 +277,18 @@ async def test_unknown_extension_uses_document_etl(tmp_path, mocker):
|
|||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Slice 11 – EtlRequest validation
|
||||
# Slice 11 - EtlRequest validation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_etl_request_requires_filename():
|
||||
"""EtlRequest rejects missing filename."""
|
||||
with pytest.raises(Exception):
|
||||
with pytest.raises(ValueError, match="filename must not be empty"):
|
||||
EtlRequest(file_path="/tmp/some.txt", filename="")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Slice 12 – unknown ETL_SERVICE raises EtlServiceUnavailableError
|
||||
# Slice 12 - unknown ETL_SERVICE raises EtlServiceUnavailableError
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
|
@ -310,7 +308,7 @@ async def test_unknown_etl_service_raises(tmp_path, mocker):
|
|||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Slice 13 – unsupported file types are rejected before reaching any parser
|
||||
# Slice 13 - unsupported file types are rejected before reaching any parser
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
|
@ -321,10 +319,19 @@ def test_unknown_extension_classified_as_unsupported():
|
|||
assert classify_file("random.xyz") == FileCategory.UNSUPPORTED
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [
|
||||
"malware.exe", "archive.zip", "video.mov", "font.woff2",
|
||||
"model.blend", "data.parquet", "package.deb", "firmware.bin",
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
[
|
||||
"malware.exe",
|
||||
"archive.zip",
|
||||
"video.mov",
|
||||
"font.woff2",
|
||||
"model.blend",
|
||||
"data.parquet",
|
||||
"package.deb",
|
||||
"firmware.bin",
|
||||
],
|
||||
)
|
||||
def test_unsupported_extensions_classified_correctly(filename):
|
||||
"""Extensions not in any allowlist are classified as UNSUPPORTED."""
|
||||
from app.etl_pipeline.file_classifier import FileCategory, classify_file
|
||||
|
|
@ -332,18 +339,21 @@ def test_unsupported_extensions_classified_correctly(filename):
|
|||
assert classify_file(filename) == FileCategory.UNSUPPORTED
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename,expected", [
|
||||
("report.pdf", "document"),
|
||||
("doc.docx", "document"),
|
||||
("slides.pptx", "document"),
|
||||
("sheet.xlsx", "document"),
|
||||
("photo.png", "document"),
|
||||
("photo.jpg", "document"),
|
||||
("book.epub", "document"),
|
||||
("letter.odt", "document"),
|
||||
("readme.md", "plaintext"),
|
||||
("data.csv", "direct_convert"),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"filename,expected",
|
||||
[
|
||||
("report.pdf", "document"),
|
||||
("doc.docx", "document"),
|
||||
("slides.pptx", "document"),
|
||||
("sheet.xlsx", "document"),
|
||||
("photo.png", "document"),
|
||||
("photo.jpg", "document"),
|
||||
("book.epub", "document"),
|
||||
("letter.odt", "document"),
|
||||
("readme.md", "plaintext"),
|
||||
("data.csv", "direct_convert"),
|
||||
],
|
||||
)
|
||||
def test_parseable_extensions_classified_correctly(filename, expected):
|
||||
"""Parseable files are classified into their correct category."""
|
||||
from app.etl_pipeline.file_classifier import FileCategory, classify_file
|
||||
|
|
@ -380,31 +390,34 @@ async def test_extract_zip_raises_unsupported_error(tmp_path):
|
|||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Slice 14 – should_skip_for_service (per-parser document filtering)
|
||||
# Slice 14 - should_skip_for_service (per-parser document filtering)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename,etl_service,expected_skip", [
|
||||
("file.eml", "DOCLING", True),
|
||||
("file.eml", "UNSTRUCTURED", False),
|
||||
("file.docm", "LLAMACLOUD", False),
|
||||
("file.docm", "DOCLING", True),
|
||||
("file.txt", "DOCLING", False),
|
||||
("file.csv", "LLAMACLOUD", False),
|
||||
("file.mp3", "UNSTRUCTURED", False),
|
||||
("file.exe", "LLAMACLOUD", True),
|
||||
("file.pdf", "DOCLING", False),
|
||||
("file.webp", "DOCLING", False),
|
||||
("file.webp", "UNSTRUCTURED", True),
|
||||
("file.gif", "LLAMACLOUD", False),
|
||||
("file.gif", "DOCLING", True),
|
||||
("file.heic", "UNSTRUCTURED", False),
|
||||
("file.heic", "DOCLING", True),
|
||||
("file.svg", "LLAMACLOUD", False),
|
||||
("file.svg", "DOCLING", True),
|
||||
("file.p7s", "UNSTRUCTURED", False),
|
||||
("file.p7s", "LLAMACLOUD", True),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"filename,etl_service,expected_skip",
|
||||
[
|
||||
("file.eml", "DOCLING", True),
|
||||
("file.eml", "UNSTRUCTURED", False),
|
||||
("file.docm", "LLAMACLOUD", False),
|
||||
("file.docm", "DOCLING", True),
|
||||
("file.txt", "DOCLING", False),
|
||||
("file.csv", "LLAMACLOUD", False),
|
||||
("file.mp3", "UNSTRUCTURED", False),
|
||||
("file.exe", "LLAMACLOUD", True),
|
||||
("file.pdf", "DOCLING", False),
|
||||
("file.webp", "DOCLING", False),
|
||||
("file.webp", "UNSTRUCTURED", True),
|
||||
("file.gif", "LLAMACLOUD", False),
|
||||
("file.gif", "DOCLING", True),
|
||||
("file.heic", "UNSTRUCTURED", False),
|
||||
("file.heic", "DOCLING", True),
|
||||
("file.svg", "LLAMACLOUD", False),
|
||||
("file.svg", "DOCLING", True),
|
||||
("file.p7s", "UNSTRUCTURED", False),
|
||||
("file.p7s", "LLAMACLOUD", True),
|
||||
],
|
||||
)
|
||||
def test_should_skip_for_service(filename, etl_service, expected_skip):
|
||||
from app.etl_pipeline.file_classifier import should_skip_for_service
|
||||
|
||||
|
|
@ -414,7 +427,7 @@ def test_should_skip_for_service(filename, etl_service, expected_skip):
|
|||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Slice 14b – ETL pipeline rejects per-parser incompatible documents
|
||||
# Slice 14b - ETL pipeline rejects per-parser incompatible documents
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -30,26 +30,29 @@ def test_docling_service_does_not_restrict_allowed_formats():
|
|||
|
||||
fake_pdf_format_option_cls = MagicMock()
|
||||
|
||||
with patch.dict("sys.modules", {
|
||||
"docling": MagicMock(),
|
||||
"docling.backend": MagicMock(),
|
||||
"docling.backend.pypdfium2_backend": MagicMock(
|
||||
PyPdfiumDocumentBackend=mock_backend
|
||||
),
|
||||
"docling.datamodel": MagicMock(),
|
||||
"docling.datamodel.base_models": MagicMock(
|
||||
InputFormat=_FakeInputFormat
|
||||
),
|
||||
"docling.datamodel.pipeline_options": MagicMock(
|
||||
PdfPipelineOptions=fake_pipeline_options_cls
|
||||
),
|
||||
"docling.document_converter": MagicMock(
|
||||
DocumentConverter=mock_converter_cls,
|
||||
PdfFormatOption=fake_pdf_format_option_cls,
|
||||
),
|
||||
}):
|
||||
import app.services.docling_service as mod
|
||||
with patch.dict(
|
||||
"sys.modules",
|
||||
{
|
||||
"docling": MagicMock(),
|
||||
"docling.backend": MagicMock(),
|
||||
"docling.backend.pypdfium2_backend": MagicMock(
|
||||
PyPdfiumDocumentBackend=mock_backend
|
||||
),
|
||||
"docling.datamodel": MagicMock(),
|
||||
"docling.datamodel.base_models": MagicMock(InputFormat=_FakeInputFormat),
|
||||
"docling.datamodel.pipeline_options": MagicMock(
|
||||
PdfPipelineOptions=fake_pipeline_options_cls
|
||||
),
|
||||
"docling.document_converter": MagicMock(
|
||||
DocumentConverter=mock_converter_cls,
|
||||
PdfFormatOption=fake_pdf_format_option_cls,
|
||||
),
|
||||
},
|
||||
):
|
||||
from importlib import reload
|
||||
|
||||
import app.services.docling_service as mod
|
||||
|
||||
reload(mod)
|
||||
|
||||
mod.DoclingService()
|
||||
|
|
|
|||
|
|
@ -17,36 +17,74 @@ def test_exe_is_not_supported_document():
|
|||
assert is_supported_document_extension("malware.exe") is False
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [
|
||||
"report.pdf", "doc.docx", "old.doc",
|
||||
"sheet.xlsx", "legacy.xls",
|
||||
"slides.pptx", "deck.ppt",
|
||||
"macro.docm", "macro.xlsm", "macro.pptm",
|
||||
"photo.png", "photo.jpg", "photo.jpeg", "scan.bmp", "scan.tiff", "scan.tif",
|
||||
"photo.webp", "anim.gif", "iphone.heic",
|
||||
"manual.rtf", "book.epub",
|
||||
"letter.odt", "data.ods", "presentation.odp",
|
||||
"inbox.eml", "outlook.msg",
|
||||
"korean.hwpx", "korean.hwp",
|
||||
"template.dot", "template.dotm",
|
||||
"template.pot", "template.potx",
|
||||
"binary.xlsb", "workspace.xlw",
|
||||
"vector.svg", "signature.p7s",
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
[
|
||||
"report.pdf",
|
||||
"doc.docx",
|
||||
"old.doc",
|
||||
"sheet.xlsx",
|
||||
"legacy.xls",
|
||||
"slides.pptx",
|
||||
"deck.ppt",
|
||||
"macro.docm",
|
||||
"macro.xlsm",
|
||||
"macro.pptm",
|
||||
"photo.png",
|
||||
"photo.jpg",
|
||||
"photo.jpeg",
|
||||
"scan.bmp",
|
||||
"scan.tiff",
|
||||
"scan.tif",
|
||||
"photo.webp",
|
||||
"anim.gif",
|
||||
"iphone.heic",
|
||||
"manual.rtf",
|
||||
"book.epub",
|
||||
"letter.odt",
|
||||
"data.ods",
|
||||
"presentation.odp",
|
||||
"inbox.eml",
|
||||
"outlook.msg",
|
||||
"korean.hwpx",
|
||||
"korean.hwp",
|
||||
"template.dot",
|
||||
"template.dotm",
|
||||
"template.pot",
|
||||
"template.potx",
|
||||
"binary.xlsb",
|
||||
"workspace.xlw",
|
||||
"vector.svg",
|
||||
"signature.p7s",
|
||||
],
|
||||
)
|
||||
def test_document_extensions_are_supported(filename):
|
||||
from app.utils.file_extensions import is_supported_document_extension
|
||||
|
||||
assert is_supported_document_extension(filename) is True, f"{filename} should be supported"
|
||||
assert is_supported_document_extension(filename) is True, (
|
||||
f"{filename} should be supported"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [
|
||||
"malware.exe", "archive.zip", "video.mov", "font.woff2",
|
||||
"model.blend", "random.xyz", "data.parquet", "package.deb",
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
[
|
||||
"malware.exe",
|
||||
"archive.zip",
|
||||
"video.mov",
|
||||
"font.woff2",
|
||||
"model.blend",
|
||||
"random.xyz",
|
||||
"data.parquet",
|
||||
"package.deb",
|
||||
],
|
||||
)
|
||||
def test_non_document_extensions_are_not_supported(filename):
|
||||
from app.utils.file_extensions import is_supported_document_extension
|
||||
|
||||
assert is_supported_document_extension(filename) is False, f"{filename} should NOT be supported"
|
||||
assert is_supported_document_extension(filename) is False, (
|
||||
f"{filename} should NOT be supported"
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -67,7 +105,7 @@ def test_union_equals_all_three_sets():
|
|||
| LLAMAPARSE_DOCUMENT_EXTENSIONS
|
||||
| UNSTRUCTURED_DOCUMENT_EXTENSIONS
|
||||
)
|
||||
assert DOCUMENT_EXTENSIONS == expected
|
||||
assert expected == DOCUMENT_EXTENSIONS
|
||||
|
||||
|
||||
def test_get_extensions_for_docling():
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue