diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
index 5e9e0f62f..9c53092f5 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
@@ -25,7 +25,10 @@ from app.connectors.google_drive import (
     get_files_in_folder,
     get_start_page_token,
 )
-from app.connectors.google_drive.file_types import should_skip_file as skip_mime
+from app.connectors.google_drive.file_types import (
+    should_skip_by_extension,
+    should_skip_file as skip_mime,
+)
 from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
 from app.indexing_pipeline.connector_document import ConnectorDocument
 from app.indexing_pipeline.document_hashing import compute_identifier_hash
@@ -78,6 +81,8 @@ async def _should_skip_file(
     if skip_mime(mime_type):
         return True, "folder/shortcut"
 
+    if should_skip_by_extension(file_name):
+        return True, "unsupported extension"
     if not file_id:
         return True, "missing file_id"
 
diff --git a/surfsense_backend/tests/integration/document_upload/conftest.py b/surfsense_backend/tests/integration/document_upload/conftest.py
index 41c379e58..62f4f6b47 100644
--- a/surfsense_backend/tests/integration/document_upload/conftest.py
+++ b/surfsense_backend/tests/integration/document_upload/conftest.py
@@ -319,31 +319,23 @@ def _mock_etl_parsing(monkeypatch):
 
     # -- LlamaParse mock (external API) --------------------------------
 
-    class _FakeMarkdownDoc:
-        def __init__(self, text: str):
-            self.text = text
-
-    class _FakeLlamaParseResult:
-        async def aget_markdown_documents(self, *, split_by_page=False):
-            return [_FakeMarkdownDoc(_MOCK_ETL_MARKDOWN)]
-
-    async def _fake_llamacloud_parse(**kwargs):
-        _reject_empty(kwargs["file_path"])
-        return _FakeLlamaParseResult()
+    async def _fake_llamacloud_parse(file_path: str, estimated_pages: int) -> str:
+        _reject_empty(file_path)
+        return _MOCK_ETL_MARKDOWN
 
     monkeypatch.setattr(
-        "app.tasks.document_processors.file_processors.parse_with_llamacloud_retry",
+        "app.etl_pipeline.parsers.llamacloud.parse_with_llamacloud",
         _fake_llamacloud_parse,
     )
 
     # -- Docling mock (heavy library boundary) -------------------------
 
-    async def _fake_docling_parse(file_path: str, filename: str):
+    async def _fake_docling_parse(file_path: str, filename: str) -> str:
         _reject_empty(file_path)
         return _MOCK_ETL_MARKDOWN
 
     monkeypatch.setattr(
-        "app.tasks.document_processors.file_processors.parse_with_docling",
+        "app.etl_pipeline.parsers.docling.parse_with_docling",
         _fake_docling_parse,
     )
 
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
index 737e2c850..7a828b9c4 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
@@ -272,6 +272,23 @@ def full_scan_mocks(mock_dropbox_client, monkeypatch):
 
     download_and_index_mock = AsyncMock(return_value=(0, 0))
     monkeypatch.setattr(_mod, "_download_and_index", download_and_index_mock)
+    from app.services.page_limit_service import PageLimitService as _RealPLS
+
+    mock_page_limit_instance = MagicMock()
+    mock_page_limit_instance.get_page_usage = AsyncMock(return_value=(0, 999_999))
+    mock_page_limit_instance.update_page_usage = AsyncMock()
+
+    class _MockPageLimitService:
+        estimate_pages_from_metadata = staticmethod(
+            _RealPLS.estimate_pages_from_metadata
+        )
+
+        def __init__(self, session):
+            self.get_page_usage = mock_page_limit_instance.get_page_usage
+            self.update_page_usage = mock_page_limit_instance.update_page_usage
+
+    monkeypatch.setattr(_mod, "PageLimitService", _MockPageLimitService)
+
 
     return {
         "dropbox_client": mock_dropbox_client,
         "session": mock_session,
@@ -377,6 +394,23 @@ def selected_files_mocks(mock_dropbox_client, monkeypatch):
 
     download_and_index_mock = AsyncMock(return_value=(0, 0))
     monkeypatch.setattr(_mod, "_download_and_index", download_and_index_mock)
+    from app.services.page_limit_service import PageLimitService as _RealPLS
+
+    mock_page_limit_instance = MagicMock()
+    mock_page_limit_instance.get_page_usage = AsyncMock(return_value=(0, 999_999))
+    mock_page_limit_instance.update_page_usage = AsyncMock()
+
+    class _MockPageLimitService:
+        estimate_pages_from_metadata = staticmethod(
+            _RealPLS.estimate_pages_from_metadata
+        )
+
+        def __init__(self, session):
+            self.get_page_usage = mock_page_limit_instance.get_page_usage
+            self.update_page_usage = mock_page_limit_instance.update_page_usage
+
+    monkeypatch.setattr(_mod, "PageLimitService", _MockPageLimitService)
+
 
     return {
         "dropbox_client": mock_dropbox_client,
         "session": mock_session,