chore: ran linting

This commit is contained in:
Anish Sarkar 2026-04-07 05:55:39 +05:30
parent 5803fe79da
commit 0a26a6c5bb
54 changed files with 1015 additions and 672 deletions

View file

@ -6,7 +6,6 @@ real so we know the full path from "cloud gives us bytes" to "we get markdown
back" actually works.
"""
import os
from unittest.mock import AsyncMock, MagicMock
import pytest
@ -21,6 +20,7 @@ _CSV_CONTENT = "name,age\nAlice,30\nBob,25\n"
# Helpers
# ---------------------------------------------------------------------------
async def _write_file(dest_path: str, content: str) -> None:
"""Simulate a cloud client writing downloaded bytes to disk."""
with open(dest_path, "w", encoding="utf-8") as f:
@ -43,8 +43,8 @@ def _make_download_side_effect(content: str):
# Google Drive
# ===================================================================
class TestGoogleDriveContentExtraction:
class TestGoogleDriveContentExtraction:
async def test_txt_file_returns_markdown(self):
from app.connectors.google_drive.content_extractor import (
download_and_extract_content,
@ -76,7 +76,7 @@ class TestGoogleDriveContentExtraction:
file = {"id": "f2", "name": "data.csv", "mimeType": "text/csv"}
markdown, metadata, error = await download_and_extract_content(client, file)
markdown, _metadata, error = await download_and_extract_content(client, file)
assert error is None
assert "Alice" in markdown
@ -93,7 +93,7 @@ class TestGoogleDriveContentExtraction:
file = {"id": "f3", "name": "doc.txt", "mimeType": "text/plain"}
markdown, metadata, error = await download_and_extract_content(client, file)
markdown, _metadata, error = await download_and_extract_content(client, file)
assert markdown is None
assert error == "Network timeout"
@ -103,8 +103,8 @@ class TestGoogleDriveContentExtraction:
# OneDrive
# ===================================================================
class TestOneDriveContentExtraction:
class TestOneDriveContentExtraction:
async def test_txt_file_returns_markdown(self):
from app.connectors.onedrive.content_extractor import (
download_and_extract_content,
@ -144,7 +144,7 @@ class TestOneDriveContentExtraction:
"file": {"mimeType": "text/csv"},
}
markdown, metadata, error = await download_and_extract_content(client, file)
markdown, _metadata, error = await download_and_extract_content(client, file)
assert error is None
assert "Alice" in markdown
@ -164,7 +164,7 @@ class TestOneDriveContentExtraction:
"file": {"mimeType": "text/plain"},
}
markdown, metadata, error = await download_and_extract_content(client, file)
markdown, _metadata, error = await download_and_extract_content(client, file)
assert markdown is None
assert error == "403 Forbidden"
@ -174,8 +174,8 @@ class TestOneDriveContentExtraction:
# Dropbox
# ===================================================================
class TestDropboxContentExtraction:
class TestDropboxContentExtraction:
async def test_txt_file_returns_markdown(self):
from app.connectors.dropbox.content_extractor import (
download_and_extract_content,
@ -217,7 +217,7 @@ class TestDropboxContentExtraction:
"path_lower": "/data.csv",
}
markdown, metadata, error = await download_and_extract_content(client, file)
markdown, _metadata, error = await download_and_extract_content(client, file)
assert error is None
assert "Alice" in markdown
@ -238,7 +238,7 @@ class TestDropboxContentExtraction:
"path_lower": "/big.txt",
}
markdown, metadata, error = await download_and_extract_content(client, file)
markdown, _metadata, error = await download_and_extract_content(client, file)
assert markdown is None
assert error == "Rate limited"

View file

@ -265,6 +265,7 @@ def full_scan_mocks(mock_dropbox_client, monkeypatch):
async def _fake_skip(session, file, search_space_id):
from app.connectors.dropbox.file_types import should_skip_file as _skip
item_skip, unsup_ext = _skip(file)
if item_skip:
if unsup_ext:
@ -468,7 +469,11 @@ async def test_selected_files_fetch_failure_isolation(selected_files_mocks):
indexed, skipped, _unsupported, errors = await _run_selected(
selected_files_mocks,
[("/first.txt", "first.txt"), ("/mid.txt", "mid.txt"), ("/third.txt", "third.txt")],
[
("/first.txt", "first.txt"),
("/mid.txt", "mid.txt"),
("/third.txt", "third.txt"),
],
)
assert indexed == 2
@ -526,8 +531,18 @@ async def test_delta_sync_deletions_call_remove_document(monkeypatch):
import app.tasks.connector_indexers.dropbox_indexer as _mod
entries = [
{".tag": "deleted", "name": "gone.txt", "path_lower": "/gone.txt", "id": "id:del1"},
{".tag": "deleted", "name": "also_gone.pdf", "path_lower": "/also_gone.pdf", "id": "id:del2"},
{
".tag": "deleted",
"name": "gone.txt",
"path_lower": "/gone.txt",
"id": "id:del1",
},
{
".tag": "deleted",
"name": "also_gone.pdf",
"path_lower": "/also_gone.pdf",
"id": "id:del2",
},
]
mock_client = MagicMock()
@ -544,7 +559,7 @@ async def test_delta_sync_deletions_call_remove_document(monkeypatch):
mock_task_logger = MagicMock()
mock_task_logger.log_task_progress = AsyncMock()
indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
_indexed, _skipped, _unsupported, cursor = await _index_with_delta_sync(
mock_client,
AsyncMock(),
_CONNECTOR_ID,
@ -573,7 +588,9 @@ async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch):
mock_client = MagicMock()
mock_client.get_changes = AsyncMock(return_value=(entries, "cursor-v2", None))
monkeypatch.setattr(_mod, "_should_skip_file", AsyncMock(return_value=(False, None)))
monkeypatch.setattr(
_mod, "_should_skip_file", AsyncMock(return_value=(False, None))
)
download_mock = AsyncMock(return_value=(2, 0))
monkeypatch.setattr(_mod, "_download_and_index", download_mock)
@ -581,7 +598,7 @@ async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch):
mock_task_logger = MagicMock()
mock_task_logger.log_task_progress = AsyncMock()
indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
indexed, skipped, _unsupported, cursor = await _index_with_delta_sync(
mock_client,
AsyncMock(),
_CONNECTOR_ID,
@ -608,8 +625,18 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch):
import app.tasks.connector_indexers.dropbox_indexer as _mod
entries = [
{".tag": "deleted", "name": "removed.txt", "path_lower": "/removed.txt", "id": "id:del1"},
{".tag": "deleted", "name": "trashed.pdf", "path_lower": "/trashed.pdf", "id": "id:del2"},
{
".tag": "deleted",
"name": "removed.txt",
"path_lower": "/removed.txt",
"id": "id:del1",
},
{
".tag": "deleted",
"name": "trashed.pdf",
"path_lower": "/trashed.pdf",
"id": "id:del2",
},
_make_file_dict("mod1", "updated.txt"),
_make_file_dict("new1", "brandnew.docx"),
]
@ -623,7 +650,9 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch):
remove_calls.append(file_id)
monkeypatch.setattr(_mod, "_remove_document", _fake_remove)
monkeypatch.setattr(_mod, "_should_skip_file", AsyncMock(return_value=(False, None)))
monkeypatch.setattr(
_mod, "_should_skip_file", AsyncMock(return_value=(False, None))
)
download_mock = AsyncMock(return_value=(2, 0))
monkeypatch.setattr(_mod, "_download_and_index", download_mock)
@ -631,7 +660,7 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch):
mock_task_logger = MagicMock()
mock_task_logger.log_task_progress = AsyncMock()
indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
indexed, skipped, _unsupported, cursor = await _index_with_delta_sync(
mock_client,
AsyncMock(),
_CONNECTOR_ID,
@ -665,7 +694,7 @@ async def test_delta_sync_returns_new_cursor(monkeypatch):
mock_task_logger = MagicMock()
mock_task_logger.log_task_progress = AsyncMock()
indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
indexed, skipped, _unsupported, cursor = await _index_with_delta_sync(
mock_client,
AsyncMock(),
_CONNECTOR_ID,
@ -723,9 +752,7 @@ def orchestrator_mocks(monkeypatch):
mock_client = MagicMock()
mock_client.get_latest_cursor = AsyncMock(return_value=("latest-cursor-abc", None))
monkeypatch.setattr(
_mod, "DropboxClient", MagicMock(return_value=mock_client)
)
monkeypatch.setattr(_mod, "DropboxClient", MagicMock(return_value=mock_client))
return {
"connector": mock_connector,
@ -751,7 +778,7 @@ async def test_orchestrator_uses_delta_sync_when_cursor_and_last_indexed(
mock_session = AsyncMock()
mock_session.commit = AsyncMock()
indexed, skipped, error, _unsupported = await index_dropbox_files(
_indexed, _skipped, error, _unsupported = await index_dropbox_files(
mock_session,
_CONNECTOR_ID,
_SEARCH_SPACE_ID,
@ -779,7 +806,7 @@ async def test_orchestrator_falls_back_to_full_scan_without_cursor(
mock_session = AsyncMock()
mock_session.commit = AsyncMock()
indexed, skipped, error, _unsupported = await index_dropbox_files(
_indexed, _skipped, error, _unsupported = await index_dropbox_files(
mock_session,
_CONNECTOR_ID,
_SEARCH_SPACE_ID,

View file

@ -366,7 +366,7 @@ async def test_full_scan_three_phase_counts(full_scan_mocks, monkeypatch):
full_scan_mocks["download_mock"].return_value = (mock_docs, 0)
full_scan_mocks["batch_mock"].return_value = ([], 2, 0)
indexed, skipped, unsupported = await _run_full_scan(full_scan_mocks)
indexed, skipped, _unsupported = await _run_full_scan(full_scan_mocks)
assert indexed == 3 # 1 renamed + 2 from batch
assert skipped == 1 # 1 unchanged
@ -497,7 +497,7 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch):
mock_task_logger = MagicMock()
mock_task_logger.log_task_progress = AsyncMock()
indexed, skipped, unsupported = await _index_with_delta_sync(
indexed, skipped, _unsupported = await _index_with_delta_sync(
MagicMock(),
mock_session,
MagicMock(),
@ -589,7 +589,7 @@ async def test_selected_files_single_file_indexed(selected_files_mocks):
)
selected_files_mocks["download_and_index_mock"].return_value = (1, 0)
indexed, skipped, unsup, errors = await _run_selected(
indexed, skipped, _unsup, errors = await _run_selected(
selected_files_mocks,
[("f1", "report.pdf")],
)
@ -613,7 +613,7 @@ async def test_selected_files_fetch_failure_isolation(selected_files_mocks):
)
selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
indexed, skipped, unsup, errors = await _run_selected(
indexed, skipped, _unsup, errors = await _run_selected(
selected_files_mocks,
[("f1", "first.txt"), ("f2", "mid.txt"), ("f3", "third.txt")],
)
@ -647,7 +647,7 @@ async def test_selected_files_skip_rename_counting(selected_files_mocks):
selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
indexed, skipped, unsup, errors = await _run_selected(
indexed, skipped, _unsup, errors = await _run_selected(
selected_files_mocks,
[
("s1", "unchanged.txt"),

View file

@ -219,7 +219,9 @@ async def test_gdrive_files_exceeding_quota_rejected(gdrive_selected_mocks):
None,
)
indexed, _skipped, _unsup, errors = await _run_gdrive_selected(m, [("big", "huge.pdf")])
indexed, _skipped, _unsup, errors = await _run_gdrive_selected(
m, [("big", "huge.pdf")]
)
assert indexed == 0
assert len(errors) == 1
@ -552,7 +554,9 @@ async def test_onedrive_over_quota_rejected(onedrive_selected_mocks):
None,
)
indexed, _skipped, _unsup, errors = await _run_onedrive_selected(m, [("big", "huge.pdf")])
indexed, _skipped, _unsup, errors = await _run_onedrive_selected(
m, [("big", "huge.pdf")]
)
assert indexed == 0
assert len(errors) == 1

View file

@ -19,6 +19,7 @@ def _make_client() -> DropboxClient:
# ---------- C1: get_latest_cursor ----------
async def test_get_latest_cursor_returns_cursor_string(monkeypatch):
client = _make_client()
@ -34,12 +35,17 @@ async def test_get_latest_cursor_returns_cursor_string(monkeypatch):
assert error is None
client._request.assert_called_once_with(
"/2/files/list_folder/get_latest_cursor",
{"path": "/my-folder", "recursive": False, "include_non_downloadable_files": True},
{
"path": "/my-folder",
"recursive": False,
"include_non_downloadable_files": True,
},
)
# ---------- C2: get_changes returns entries and new cursor ----------
async def test_get_changes_returns_entries_and_cursor(monkeypatch):
client = _make_client()
@ -66,6 +72,7 @@ async def test_get_changes_returns_entries_and_cursor(monkeypatch):
# ---------- C3: get_changes handles pagination ----------
async def test_get_changes_handles_pagination(monkeypatch):
client = _make_client()
@ -98,6 +105,7 @@ async def test_get_changes_handles_pagination(monkeypatch):
# ---------- C4: get_changes raises on 401 ----------
async def test_get_changes_returns_error_on_401(monkeypatch):
client = _make_client()

View file

@ -41,15 +41,40 @@ def test_non_downloadable_item_is_skipped():
@pytest.mark.parametrize(
"filename",
[
"archive.zip", "backup.tar", "data.gz", "stuff.rar", "pack.7z",
"program.exe", "lib.dll", "module.so", "image.dmg", "disk.iso",
"movie.mov", "clip.avi", "video.mkv", "film.wmv", "stream.flv",
"archive.zip",
"backup.tar",
"data.gz",
"stuff.rar",
"pack.7z",
"program.exe",
"lib.dll",
"module.so",
"image.dmg",
"disk.iso",
"movie.mov",
"clip.avi",
"video.mkv",
"film.wmv",
"stream.flv",
"favicon.ico",
"raw.cr2", "photo.nef", "image.arw", "pic.dng",
"design.psd", "vector.ai", "mockup.sketch", "proto.fig",
"font.ttf", "font.otf", "font.woff", "font.woff2",
"model.stl", "scene.fbx", "mesh.blend",
"local.db", "data.sqlite", "access.mdb",
"raw.cr2",
"photo.nef",
"image.arw",
"pic.dng",
"design.psd",
"vector.ai",
"mockup.sketch",
"proto.fig",
"font.ttf",
"font.otf",
"font.woff",
"font.woff2",
"model.stl",
"scene.fbx",
"mesh.blend",
"local.db",
"data.sqlite",
"access.mdb",
],
)
def test_non_parseable_extensions_are_skipped(filename, mocker):
@ -63,9 +88,16 @@ def test_non_parseable_extensions_are_skipped(filename, mocker):
@pytest.mark.parametrize(
"filename",
[
"report.pdf", "document.docx", "sheet.xlsx", "slides.pptx",
"readme.txt", "data.csv", "page.html", "notes.md",
"config.json", "feed.xml",
"report.pdf",
"document.docx",
"sheet.xlsx",
"slides.pptx",
"readme.txt",
"data.csv",
"page.html",
"notes.md",
"config.json",
"feed.xml",
],
)
def test_parseable_documents_are_not_skipped(filename, mocker):
@ -92,30 +124,33 @@ def test_universal_images_are_not_skipped(filename, mocker):
assert ext is None
@pytest.mark.parametrize("filename,service,expected_skip", [
("old.doc", "DOCLING", True),
("old.doc", "LLAMACLOUD", False),
("old.doc", "UNSTRUCTURED", False),
("legacy.xls", "DOCLING", True),
("legacy.xls", "LLAMACLOUD", False),
("legacy.xls", "UNSTRUCTURED", False),
("deck.ppt", "DOCLING", True),
("deck.ppt", "LLAMACLOUD", False),
("deck.ppt", "UNSTRUCTURED", False),
("icon.svg", "DOCLING", True),
("icon.svg", "LLAMACLOUD", False),
("anim.gif", "DOCLING", True),
("anim.gif", "LLAMACLOUD", False),
("photo.webp", "DOCLING", False),
("photo.webp", "LLAMACLOUD", False),
("photo.webp", "UNSTRUCTURED", True),
("live.heic", "DOCLING", True),
("live.heic", "UNSTRUCTURED", False),
("macro.docm", "DOCLING", True),
("macro.docm", "LLAMACLOUD", False),
("mail.eml", "DOCLING", True),
("mail.eml", "UNSTRUCTURED", False),
])
@pytest.mark.parametrize(
"filename,service,expected_skip",
[
("old.doc", "DOCLING", True),
("old.doc", "LLAMACLOUD", False),
("old.doc", "UNSTRUCTURED", False),
("legacy.xls", "DOCLING", True),
("legacy.xls", "LLAMACLOUD", False),
("legacy.xls", "UNSTRUCTURED", False),
("deck.ppt", "DOCLING", True),
("deck.ppt", "LLAMACLOUD", False),
("deck.ppt", "UNSTRUCTURED", False),
("icon.svg", "DOCLING", True),
("icon.svg", "LLAMACLOUD", False),
("anim.gif", "DOCLING", True),
("anim.gif", "LLAMACLOUD", False),
("photo.webp", "DOCLING", False),
("photo.webp", "LLAMACLOUD", False),
("photo.webp", "UNSTRUCTURED", True),
("live.heic", "DOCLING", True),
("live.heic", "UNSTRUCTURED", False),
("macro.docm", "DOCLING", True),
("macro.docm", "LLAMACLOUD", False),
("mail.eml", "DOCLING", True),
("mail.eml", "UNSTRUCTURED", False),
],
)
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {".tag": "file", "name": filename}

View file

@ -7,21 +7,37 @@ from app.connectors.google_drive.file_types import should_skip_by_extension
pytestmark = pytest.mark.unit
@pytest.mark.parametrize("filename", [
"malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
])
@pytest.mark.parametrize(
"filename",
[
"malware.exe",
"archive.zip",
"video.mov",
"font.woff2",
"model.blend",
],
)
def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mocker):
"""Truly unsupported files are skipped no matter which ETL service is configured."""
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
skip, ext = should_skip_by_extension(filename)
skip, _ext = should_skip_by_extension(filename)
assert skip is True
@pytest.mark.parametrize("filename", [
"report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
"readme.txt", "data.csv", "photo.png", "notes.md",
])
@pytest.mark.parametrize(
"filename",
[
"report.pdf",
"doc.docx",
"sheet.xlsx",
"slides.pptx",
"readme.txt",
"data.csv",
"photo.png",
"notes.md",
],
)
def test_universal_extensions_are_not_skipped(filename, mocker):
"""Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped."""
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
@ -31,16 +47,19 @@ def test_universal_extensions_are_not_skipped(filename, mocker):
assert ext is None
@pytest.mark.parametrize("filename,service,expected_skip", [
("macro.docm", "DOCLING", True),
("macro.docm", "LLAMACLOUD", False),
("mail.eml", "DOCLING", True),
("mail.eml", "UNSTRUCTURED", False),
("photo.gif", "DOCLING", True),
("photo.gif", "LLAMACLOUD", False),
("photo.heic", "UNSTRUCTURED", False),
("photo.heic", "DOCLING", True),
])
@pytest.mark.parametrize(
"filename,service,expected_skip",
[
("macro.docm", "DOCLING", True),
("macro.docm", "LLAMACLOUD", False),
("mail.eml", "DOCLING", True),
("mail.eml", "UNSTRUCTURED", False),
("photo.gif", "DOCLING", True),
("photo.gif", "LLAMACLOUD", False),
("photo.heic", "UNSTRUCTURED", False),
("photo.heic", "DOCLING", True),
],
)
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
mocker.patch("app.config.config.ETL_SERVICE", service)
skip, ext = should_skip_by_extension(filename)

View file

@ -45,9 +45,16 @@ def test_onenote_is_skipped():
# ---------------------------------------------------------------------------
@pytest.mark.parametrize("filename", [
"malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
])
@pytest.mark.parametrize(
"filename",
[
"malware.exe",
"archive.zip",
"video.mov",
"font.woff2",
"model.blend",
],
)
def test_unsupported_extensions_are_skipped(filename, mocker):
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
@ -56,10 +63,19 @@ def test_unsupported_extensions_are_skipped(filename, mocker):
assert ext is not None
@pytest.mark.parametrize("filename", [
"report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
"readme.txt", "data.csv", "photo.png", "notes.md",
])
@pytest.mark.parametrize(
"filename",
[
"report.pdf",
"doc.docx",
"sheet.xlsx",
"slides.pptx",
"readme.txt",
"data.csv",
"photo.png",
"notes.md",
],
)
def test_universal_files_are_not_skipped(filename, mocker):
for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
mocker.patch("app.config.config.ETL_SERVICE", service)
@ -69,14 +85,17 @@ def test_universal_files_are_not_skipped(filename, mocker):
assert ext is None
@pytest.mark.parametrize("filename,service,expected_skip", [
("macro.docm", "DOCLING", True),
("macro.docm", "LLAMACLOUD", False),
("mail.eml", "DOCLING", True),
("mail.eml", "UNSTRUCTURED", False),
("photo.heic", "UNSTRUCTURED", False),
("photo.heic", "DOCLING", True),
])
@pytest.mark.parametrize(
"filename,service,expected_skip",
[
("macro.docm", "DOCLING", True),
("macro.docm", "LLAMACLOUD", False),
("mail.eml", "DOCLING", True),
("mail.eml", "UNSTRUCTURED", False),
("photo.heic", "UNSTRUCTURED", False),
("photo.heic", "DOCLING", True),
],
)
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
mocker.patch("app.config.config.ETL_SERVICE", service)
item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}

View file

@ -24,6 +24,4 @@ def _stub_package(dotted: str, fs_dir: Path) -> None:
_stub_package("app", _BACKEND / "app")
_stub_package("app.etl_pipeline", _BACKEND / "app" / "etl_pipeline")
_stub_package(
"app.etl_pipeline.parsers", _BACKEND / "app" / "etl_pipeline" / "parsers"
)
_stub_package("app.etl_pipeline.parsers", _BACKEND / "app" / "etl_pipeline" / "parsers")

View file

@ -144,7 +144,7 @@ async def test_extract_mp3_returns_transcription(tmp_path, mocker):
# ---------------------------------------------------------------------------
# Slice 7 DOCLING document parsing
# Slice 7 - DOCLING document parsing
# ---------------------------------------------------------------------------
@ -172,7 +172,7 @@ async def test_extract_pdf_with_docling(tmp_path, mocker):
# ---------------------------------------------------------------------------
# Slice 8 UNSTRUCTURED document parsing
# Slice 8 - UNSTRUCTURED document parsing
# ---------------------------------------------------------------------------
@ -208,7 +208,7 @@ async def test_extract_pdf_with_unstructured(tmp_path, mocker):
# ---------------------------------------------------------------------------
# Slice 9 LLAMACLOUD document parsing
# Slice 9 - LLAMACLOUD document parsing
# ---------------------------------------------------------------------------
@ -241,9 +241,7 @@ async def test_extract_pdf_with_llamacloud(tmp_path, mocker):
)
result = await EtlPipelineService().extract(
EtlRequest(
file_path=str(pdf_file), filename="report.pdf", estimated_pages=5
)
EtlRequest(file_path=str(pdf_file), filename="report.pdf", estimated_pages=5)
)
assert result.markdown_content == "# LlamaCloud parsed"
@ -252,7 +250,7 @@ async def test_extract_pdf_with_llamacloud(tmp_path, mocker):
# ---------------------------------------------------------------------------
# Slice 10 unknown extension falls through to document ETL
# Slice 10 - unknown extension falls through to document ETL
# ---------------------------------------------------------------------------
@ -279,18 +277,18 @@ async def test_unknown_extension_uses_document_etl(tmp_path, mocker):
# ---------------------------------------------------------------------------
# Slice 11 EtlRequest validation
# Slice 11 - EtlRequest validation
# ---------------------------------------------------------------------------
def test_etl_request_requires_filename():
"""EtlRequest rejects missing filename."""
with pytest.raises(Exception):
with pytest.raises(ValueError, match="filename must not be empty"):
EtlRequest(file_path="/tmp/some.txt", filename="")
# ---------------------------------------------------------------------------
# Slice 12 unknown ETL_SERVICE raises EtlServiceUnavailableError
# Slice 12 - unknown ETL_SERVICE raises EtlServiceUnavailableError
# ---------------------------------------------------------------------------
@ -310,7 +308,7 @@ async def test_unknown_etl_service_raises(tmp_path, mocker):
# ---------------------------------------------------------------------------
# Slice 13 unsupported file types are rejected before reaching any parser
# Slice 13 - unsupported file types are rejected before reaching any parser
# ---------------------------------------------------------------------------
@ -321,10 +319,19 @@ def test_unknown_extension_classified_as_unsupported():
assert classify_file("random.xyz") == FileCategory.UNSUPPORTED
@pytest.mark.parametrize("filename", [
"malware.exe", "archive.zip", "video.mov", "font.woff2",
"model.blend", "data.parquet", "package.deb", "firmware.bin",
])
@pytest.mark.parametrize(
"filename",
[
"malware.exe",
"archive.zip",
"video.mov",
"font.woff2",
"model.blend",
"data.parquet",
"package.deb",
"firmware.bin",
],
)
def test_unsupported_extensions_classified_correctly(filename):
"""Extensions not in any allowlist are classified as UNSUPPORTED."""
from app.etl_pipeline.file_classifier import FileCategory, classify_file
@ -332,18 +339,21 @@ def test_unsupported_extensions_classified_correctly(filename):
assert classify_file(filename) == FileCategory.UNSUPPORTED
@pytest.mark.parametrize("filename,expected", [
("report.pdf", "document"),
("doc.docx", "document"),
("slides.pptx", "document"),
("sheet.xlsx", "document"),
("photo.png", "document"),
("photo.jpg", "document"),
("book.epub", "document"),
("letter.odt", "document"),
("readme.md", "plaintext"),
("data.csv", "direct_convert"),
])
@pytest.mark.parametrize(
"filename,expected",
[
("report.pdf", "document"),
("doc.docx", "document"),
("slides.pptx", "document"),
("sheet.xlsx", "document"),
("photo.png", "document"),
("photo.jpg", "document"),
("book.epub", "document"),
("letter.odt", "document"),
("readme.md", "plaintext"),
("data.csv", "direct_convert"),
],
)
def test_parseable_extensions_classified_correctly(filename, expected):
"""Parseable files are classified into their correct category."""
from app.etl_pipeline.file_classifier import FileCategory, classify_file
@ -380,31 +390,34 @@ async def test_extract_zip_raises_unsupported_error(tmp_path):
# ---------------------------------------------------------------------------
# Slice 14 should_skip_for_service (per-parser document filtering)
# Slice 14 - should_skip_for_service (per-parser document filtering)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize("filename,etl_service,expected_skip", [
("file.eml", "DOCLING", True),
("file.eml", "UNSTRUCTURED", False),
("file.docm", "LLAMACLOUD", False),
("file.docm", "DOCLING", True),
("file.txt", "DOCLING", False),
("file.csv", "LLAMACLOUD", False),
("file.mp3", "UNSTRUCTURED", False),
("file.exe", "LLAMACLOUD", True),
("file.pdf", "DOCLING", False),
("file.webp", "DOCLING", False),
("file.webp", "UNSTRUCTURED", True),
("file.gif", "LLAMACLOUD", False),
("file.gif", "DOCLING", True),
("file.heic", "UNSTRUCTURED", False),
("file.heic", "DOCLING", True),
("file.svg", "LLAMACLOUD", False),
("file.svg", "DOCLING", True),
("file.p7s", "UNSTRUCTURED", False),
("file.p7s", "LLAMACLOUD", True),
])
@pytest.mark.parametrize(
"filename,etl_service,expected_skip",
[
("file.eml", "DOCLING", True),
("file.eml", "UNSTRUCTURED", False),
("file.docm", "LLAMACLOUD", False),
("file.docm", "DOCLING", True),
("file.txt", "DOCLING", False),
("file.csv", "LLAMACLOUD", False),
("file.mp3", "UNSTRUCTURED", False),
("file.exe", "LLAMACLOUD", True),
("file.pdf", "DOCLING", False),
("file.webp", "DOCLING", False),
("file.webp", "UNSTRUCTURED", True),
("file.gif", "LLAMACLOUD", False),
("file.gif", "DOCLING", True),
("file.heic", "UNSTRUCTURED", False),
("file.heic", "DOCLING", True),
("file.svg", "LLAMACLOUD", False),
("file.svg", "DOCLING", True),
("file.p7s", "UNSTRUCTURED", False),
("file.p7s", "LLAMACLOUD", True),
],
)
def test_should_skip_for_service(filename, etl_service, expected_skip):
from app.etl_pipeline.file_classifier import should_skip_for_service
@ -414,7 +427,7 @@ def test_should_skip_for_service(filename, etl_service, expected_skip):
# ---------------------------------------------------------------------------
# Slice 14b ETL pipeline rejects per-parser incompatible documents
# Slice 14b - ETL pipeline rejects per-parser incompatible documents
# ---------------------------------------------------------------------------

View file

@ -30,26 +30,29 @@ def test_docling_service_does_not_restrict_allowed_formats():
fake_pdf_format_option_cls = MagicMock()
with patch.dict("sys.modules", {
"docling": MagicMock(),
"docling.backend": MagicMock(),
"docling.backend.pypdfium2_backend": MagicMock(
PyPdfiumDocumentBackend=mock_backend
),
"docling.datamodel": MagicMock(),
"docling.datamodel.base_models": MagicMock(
InputFormat=_FakeInputFormat
),
"docling.datamodel.pipeline_options": MagicMock(
PdfPipelineOptions=fake_pipeline_options_cls
),
"docling.document_converter": MagicMock(
DocumentConverter=mock_converter_cls,
PdfFormatOption=fake_pdf_format_option_cls,
),
}):
import app.services.docling_service as mod
with patch.dict(
"sys.modules",
{
"docling": MagicMock(),
"docling.backend": MagicMock(),
"docling.backend.pypdfium2_backend": MagicMock(
PyPdfiumDocumentBackend=mock_backend
),
"docling.datamodel": MagicMock(),
"docling.datamodel.base_models": MagicMock(InputFormat=_FakeInputFormat),
"docling.datamodel.pipeline_options": MagicMock(
PdfPipelineOptions=fake_pipeline_options_cls
),
"docling.document_converter": MagicMock(
DocumentConverter=mock_converter_cls,
PdfFormatOption=fake_pdf_format_option_cls,
),
},
):
from importlib import reload
import app.services.docling_service as mod
reload(mod)
mod.DoclingService()

View file

@ -17,36 +17,74 @@ def test_exe_is_not_supported_document():
assert is_supported_document_extension("malware.exe") is False
@pytest.mark.parametrize("filename", [
"report.pdf", "doc.docx", "old.doc",
"sheet.xlsx", "legacy.xls",
"slides.pptx", "deck.ppt",
"macro.docm", "macro.xlsm", "macro.pptm",
"photo.png", "photo.jpg", "photo.jpeg", "scan.bmp", "scan.tiff", "scan.tif",
"photo.webp", "anim.gif", "iphone.heic",
"manual.rtf", "book.epub",
"letter.odt", "data.ods", "presentation.odp",
"inbox.eml", "outlook.msg",
"korean.hwpx", "korean.hwp",
"template.dot", "template.dotm",
"template.pot", "template.potx",
"binary.xlsb", "workspace.xlw",
"vector.svg", "signature.p7s",
])
@pytest.mark.parametrize(
"filename",
[
"report.pdf",
"doc.docx",
"old.doc",
"sheet.xlsx",
"legacy.xls",
"slides.pptx",
"deck.ppt",
"macro.docm",
"macro.xlsm",
"macro.pptm",
"photo.png",
"photo.jpg",
"photo.jpeg",
"scan.bmp",
"scan.tiff",
"scan.tif",
"photo.webp",
"anim.gif",
"iphone.heic",
"manual.rtf",
"book.epub",
"letter.odt",
"data.ods",
"presentation.odp",
"inbox.eml",
"outlook.msg",
"korean.hwpx",
"korean.hwp",
"template.dot",
"template.dotm",
"template.pot",
"template.potx",
"binary.xlsb",
"workspace.xlw",
"vector.svg",
"signature.p7s",
],
)
def test_document_extensions_are_supported(filename):
from app.utils.file_extensions import is_supported_document_extension
assert is_supported_document_extension(filename) is True, f"{filename} should be supported"
assert is_supported_document_extension(filename) is True, (
f"{filename} should be supported"
)
@pytest.mark.parametrize("filename", [
"malware.exe", "archive.zip", "video.mov", "font.woff2",
"model.blend", "random.xyz", "data.parquet", "package.deb",
])
@pytest.mark.parametrize(
"filename",
[
"malware.exe",
"archive.zip",
"video.mov",
"font.woff2",
"model.blend",
"random.xyz",
"data.parquet",
"package.deb",
],
)
def test_non_document_extensions_are_not_supported(filename):
from app.utils.file_extensions import is_supported_document_extension
assert is_supported_document_extension(filename) is False, f"{filename} should NOT be supported"
assert is_supported_document_extension(filename) is False, (
f"{filename} should NOT be supported"
)
# ---------------------------------------------------------------------------
@ -67,7 +105,7 @@ def test_union_equals_all_three_sets():
| LLAMAPARSE_DOCUMENT_EXTENSIONS
| UNSTRUCTURED_DOCUMENT_EXTENSIONS
)
assert DOCUMENT_EXTENSIONS == expected
assert expected == DOCUMENT_EXTENSIONS
def test_get_extensions_for_docling():