diff --git a/surfsense_backend/app/services/page_limit_service.py b/surfsense_backend/app/services/page_limit_service.py
index ea22067be..47fe07fc6 100644
--- a/surfsense_backend/app/services/page_limit_service.py
+++ b/surfsense_backend/app/services/page_limit_service.py
@@ -257,28 +257,83 @@ class PageLimitService:
             return max(1, file_size // (100 * 1024))
 
         if file_ext in {
-            ".doc", ".docx", ".docm", ".dot", ".dotm",
-            ".odt", ".ott", ".sxw", ".stw", ".uot",
-            ".rtf", ".pages", ".wpd", ".wps",
-            ".abw", ".zabw", ".cwk", ".hwp", ".lwp",
-            ".mcw", ".mw", ".sdw", ".vor",
+            ".doc",
+            ".docx",
+            ".docm",
+            ".dot",
+            ".dotm",
+            ".odt",
+            ".ott",
+            ".sxw",
+            ".stw",
+            ".uot",
+            ".rtf",
+            ".pages",
+            ".wpd",
+            ".wps",
+            ".abw",
+            ".zabw",
+            ".cwk",
+            ".hwp",
+            ".lwp",
+            ".mcw",
+            ".mw",
+            ".sdw",
+            ".vor",
         }:
             return max(1, file_size // (50 * 1024))
 
         if file_ext in {
-            ".ppt", ".pptx", ".pptm", ".pot", ".potx",
-            ".odp", ".otp", ".sxi", ".sti", ".uop",
-            ".key", ".sda", ".sdd", ".sdp",
+            ".ppt",
+            ".pptx",
+            ".pptm",
+            ".pot",
+            ".potx",
+            ".odp",
+            ".otp",
+            ".sxi",
+            ".sti",
+            ".uop",
+            ".key",
+            ".sda",
+            ".sdd",
+            ".sdp",
         }:
             return max(1, file_size // (200 * 1024))
 
         if file_ext in {
-            ".xls", ".xlsx", ".xlsm", ".xlsb", ".xlw", ".xlr",
-            ".ods", ".ots", ".fods", ".numbers",
-            ".123", ".wk1", ".wk2", ".wk3", ".wk4", ".wks",
-            ".wb1", ".wb2", ".wb3", ".wq1", ".wq2",
-            ".csv", ".tsv", ".slk", ".sylk", ".dif", ".dbf",
-            ".prn", ".qpw", ".602", ".et", ".eth",
+            ".xls",
+            ".xlsx",
+            ".xlsm",
+            ".xlsb",
+            ".xlw",
+            ".xlr",
+            ".ods",
+            ".ots",
+            ".fods",
+            ".numbers",
+            ".123",
+            ".wk1",
+            ".wk2",
+            ".wk3",
+            ".wk4",
+            ".wks",
+            ".wb1",
+            ".wb2",
+            ".wb3",
+            ".wq1",
+            ".wq2",
+            ".csv",
+            ".tsv",
+            ".slk",
+            ".sylk",
+            ".dif",
+            ".dbf",
+            ".prn",
+            ".qpw",
+            ".602",
+            ".et",
+            ".eth",
         }:
             return max(1, file_size // (100 * 1024))
 
@@ -289,8 +344,17 @@ class PageLimitService:
             return max(1, file_size // 3000)
 
         if file_ext in {
-            ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff",
-            ".webp", ".svg", ".cgm", ".odg", ".pbd",
+            ".jpg",
+            ".jpeg",
+            ".png",
+            ".gif",
+            ".bmp",
+            ".tiff",
+            ".webp",
+            ".svg",
+            ".cgm",
+            ".odg",
+            ".pbd",
         }:
             return 1
 
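(Editorial note, not part of the patch.) The two hunks above only reflow the extension sets to one entry per line; the size heuristic itself is unchanged. For orientation, a minimal standalone sketch of that heuristic follows, using only the per-page divisors visible in these hunks plus the 80 KiB fallback that the unit tests later in this diff pin down. The function name and the trimmed extension sets are illustrative, not the actual PageLimitService code:

```python
# Illustrative sketch only; mirrors the branches visible in the hunks above,
# not the real PageLimitService implementation.
def estimate_pages_by_size_sketch(file_ext: str, file_size: int) -> int:
    word_processing = {".doc", ".docx", ".odt", ".rtf", ".pages"}  # 50 KiB per page
    presentations = {".ppt", ".pptx", ".odp", ".key"}              # 200 KiB per page
    spreadsheets = {".xls", ".xlsx", ".ods", ".csv", ".tsv"}       # 100 KiB per page
    images = {".jpg", ".jpeg", ".png", ".gif", ".svg"}             # always one page

    if file_ext in word_processing:
        return max(1, file_size // (50 * 1024))
    if file_ext in presentations:
        return max(1, file_size // (200 * 1024))
    if file_ext in spreadsheets:
        return max(1, file_size // (100 * 1024))
    if file_ext in images:
        return 1
    # Fallback taken from test_unknown_ext_uses_80kb_per_page further down.
    return max(1, file_size // (80 * 1024))


# Example: a 500 KiB spreadsheet estimates to 5 pages; a tiny one still counts as 1.
assert estimate_pages_by_size_sketch(".xlsx", 500 * 1024) == 5
assert estimate_pages_by_size_sketch(".xlsx", 10 * 1024) == 1
```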
diff --git a/surfsense_backend/app/tasks/connector_indexers/base.py b/surfsense_backend/app/tasks/connector_indexers/base.py
index 6b4bed4b5..ffc8ab72e 100644
--- a/surfsense_backend/app/tasks/connector_indexers/base.py
+++ b/surfsense_backend/app/tasks/connector_indexers/base.py
@@ -4,6 +4,7 @@ Base functionality and shared imports for connector indexers.
 
 import logging
 from datetime import UTC, datetime, timedelta
+
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
 
diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
index acfbce0bf..fa50e86d3 100644
--- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
@@ -205,6 +205,7 @@ def _compute_final_pages(
     actual = page_limit_service.estimate_pages_from_content_length(content_length)
     return max(estimated_pages, actual)
 
+
 DEFAULT_EXCLUDE_PATTERNS = [
     ".git",
     "node_modules",
diff --git a/surfsense_backend/tests/integration/document_upload/conftest.py b/surfsense_backend/tests/integration/document_upload/conftest.py
index 6640fefdf..41c379e58 100644
--- a/surfsense_backend/tests/integration/document_upload/conftest.py
+++ b/surfsense_backend/tests/integration/document_upload/conftest.py
@@ -302,6 +302,9 @@ def _mock_redis_heartbeat(monkeypatch):
     )
 
 
+_MOCK_ETL_MARKDOWN = "# Mocked Document\n\nThis is mocked ETL content."
+
+
 @pytest.fixture(autouse=True)
 def _mock_etl_parsing(monkeypatch):
     """Mock ETL parsing services — LlamaParse and Docling are external boundaries.
@@ -310,8 +313,6 @@ def _mock_etl_parsing(monkeypatch):
     the actual services would, so tests covering failure paths keep working.
     """
 
-    _MOCK_MARKDOWN = "# Mocked Document\n\nThis is mocked ETL content."
-
     def _reject_empty(file_path: str) -> None:
         if os.path.getsize(file_path) == 0:
             raise RuntimeError(f"Cannot parse empty file: {file_path}")
@@ -324,7 +325,7 @@ def _mock_etl_parsing(monkeypatch):
 
     class _FakeLlamaParseResult:
         async def aget_markdown_documents(self, *, split_by_page=False):
-            return [_FakeMarkdownDoc(_MOCK_MARKDOWN)]
+            return [_FakeMarkdownDoc(_MOCK_ETL_MARKDOWN)]
 
     async def _fake_llamacloud_parse(**kwargs):
         _reject_empty(kwargs["file_path"])
@@ -339,7 +340,7 @@ def _mock_etl_parsing(monkeypatch):
 
     async def _fake_docling_parse(file_path: str, filename: str):
         _reject_empty(file_path)
-        return _MOCK_MARKDOWN
+        return _MOCK_ETL_MARKDOWN
 
     monkeypatch.setattr(
         "app.tasks.document_processors.file_processors.parse_with_docling",
@@ -347,10 +348,12 @@ def _mock_etl_parsing(monkeypatch):
 
     class _FakeDoclingResult:
-        class document:
+        class Document:
             @staticmethod
             def export_to_markdown():
-                return _MOCK_MARKDOWN
+                return _MOCK_ETL_MARKDOWN
+
+        document = Document()
 
     class _FakeDocumentConverter:
         def convert(self, file_path):
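(Editorial note, not part of the patch.) In the `_FakeDoclingResult` hunk above, the nested fake is renamed from the lowercase `class document:` to CapWords `class Document:`, and `document = Document()` keeps a `document` attribute on the result, so the mocked markdown string (now the module-level `_MOCK_ETL_MARKDOWN`) is still reachable through the same attribute chain. A small sketch of that chain follows; the assumption that `_FakeDocumentConverter.convert()` returns a `_FakeDoclingResult` is inferred from the fakes shown above, since the body of `convert` lies outside the hunk:

```python
# Sketch of how the patched fakes are expected to be consumed. Assumption:
# _FakeDocumentConverter.convert returns _FakeDoclingResult (its body is not
# shown in the hunk). All other names come from the conftest diff above.
converter = _FakeDocumentConverter()
result = converter.convert("/tmp/example.docx")

# result.document is now an instance attribute (a Document()), so this reads
# like Docling's real ConversionResult.document.export_to_markdown().
markdown = result.document.export_to_markdown()
assert markdown == _MOCK_ETL_MARKDOWN  # "# Mocked Document\n\nThis is mocked ETL content."
```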
diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py
index 4d9bda7ee..000f43aa8 100644
--- a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py
+++ b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py
@@ -1015,7 +1015,7 @@ class TestPageLimits:
 
         (tmp_path / "note.md").write_text("# Hello World\n\nContent here.")
 
-        count, _skipped, _root_folder_id, err = await index_local_folder(
+        count, _skipped, _root_folder_id, _err = await index_local_folder(
             session=db_session,
             search_space_id=db_search_space.id,
             user_id=str(db_user.id),
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py b/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py
index 1c93965f3..b31a9557f 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py
@@ -58,10 +58,14 @@ class TestEstimatePagesFromMetadata:
             assert PageLimitService.estimate_pages_from_metadata(ext, 5_000_000) == 1
 
     def test_audio_uses_1mb_per_page(self):
-        assert PageLimitService.estimate_pages_from_metadata(".mp3", 3 * 1024 * 1024) == 3
+        assert (
+            PageLimitService.estimate_pages_from_metadata(".mp3", 3 * 1024 * 1024) == 3
+        )
 
     def test_video_uses_5mb_per_page(self):
-        assert PageLimitService.estimate_pages_from_metadata(".mp4", 15 * 1024 * 1024) == 3
+        assert (
+            PageLimitService.estimate_pages_from_metadata(".mp4", 15 * 1024 * 1024) == 3
+        )
 
     def test_unknown_ext_uses_80kb_per_page(self):
         assert PageLimitService.estimate_pages_from_metadata(".xyz", 160 * 1024) == 2
@@ -189,7 +193,8 @@ async def test_gdrive_files_within_quota_are_downloaded(gdrive_selected_mocks):
 
     for fid in ("f1", "f2", "f3"):
         m["get_file_results"][fid] = (
-            _make_gdrive_file(fid, f"{fid}.xyz", size=80 * 1024), None,
+            _make_gdrive_file(fid, f"{fid}.xyz", size=80 * 1024),
+            None,
         )
 
     m["download_and_index_mock"].return_value = (3, 0)
@@ -210,7 +215,8 @@ async def test_gdrive_files_exceeding_quota_rejected(gdrive_selected_mocks):
     m["fake_user"].pages_limit = 100
 
     m["get_file_results"]["big"] = (
-        _make_gdrive_file("big", "huge.pdf", size=500 * 1024), None,
+        _make_gdrive_file("big", "huge.pdf", size=500 * 1024),
+        None,
     )
 
     indexed, _skipped, errors = await _run_gdrive_selected(m, [("big", "huge.pdf")])
@@ -228,7 +234,8 @@ async def test_gdrive_quota_mix_partial_indexing(gdrive_selected_mocks):
 
     for fid in ("f1", "f2", "f3"):
         m["get_file_results"][fid] = (
-            _make_gdrive_file(fid, f"{fid}.xyz", size=80 * 1024), None,
+            _make_gdrive_file(fid, f"{fid}.xyz", size=80 * 1024),
+            None,
         )
 
     m["download_and_index_mock"].return_value = (2, 0)
@@ -250,7 +257,8 @@ async def test_gdrive_proportional_page_deduction(gdrive_selected_mocks):
 
     for fid in ("f1", "f2", "f3", "f4"):
         m["get_file_results"][fid] = (
-            _make_gdrive_file(fid, f"{fid}.xyz", size=80 * 1024), None,
+            _make_gdrive_file(fid, f"{fid}.xyz", size=80 * 1024),
+            None,
         )
 
     m["download_and_index_mock"].return_value = (2, 2)
@@ -269,7 +277,8 @@ async def test_gdrive_no_deduction_when_nothing_indexed(gdrive_selected_mocks):
     m["fake_user"].pages_limit = 100
 
     m["get_file_results"]["f1"] = (
-        _make_gdrive_file("f1", "f1.xyz", size=80 * 1024), None,
+        _make_gdrive_file("f1", "f1.xyz", size=80 * 1024),
+        None,
     )
 
     m["download_and_index_mock"].return_value = (0, 1)
@@ -286,7 +295,8 @@ async def test_gdrive_zero_quota_rejects_all(gdrive_selected_mocks):
 
     for fid in ("f1", "f2"):
         m["get_file_results"][fid] = (
-            _make_gdrive_file(fid, f"{fid}.xyz", size=80 * 1024), None,
+            _make_gdrive_file(fid, f"{fid}.xyz", size=80 * 1024),
+            None,
         )
 
     indexed, _skipped, errors = await _run_gdrive_selected(
@@ -367,7 +377,8 @@ async def test_gdrive_full_scan_skips_over_quota(gdrive_full_scan_mocks, monkeyp
         _make_gdrive_file(f"f{i}", f"file{i}.xyz", size=80 * 1024) for i in range(5)
     ]
     monkeypatch.setattr(
-        m["mod"], "get_files_in_folder",
+        m["mod"],
+        "get_files_in_folder",
         AsyncMock(return_value=(page_files, None, None)),
     )
     m["download_mock"].return_value = ([], 0)
@@ -391,7 +402,8 @@ async def test_gdrive_full_scan_deducts_after_indexing(
         _make_gdrive_file(f"f{i}", f"file{i}.xyz", size=80 * 1024) for i in range(3)
     ]
     monkeypatch.setattr(
-        m["mod"], "get_files_in_folder",
+        m["mod"],
+        "get_files_in_folder",
         AsyncMock(return_value=(page_files, None, None)),
     )
     mock_docs = [MagicMock() for _ in range(3)]
@@ -421,7 +433,8 @@ async def test_gdrive_delta_sync_skips_over_quota(monkeypatch):
         for i in range(5)
     ]
     monkeypatch.setattr(
-        _mod, "fetch_all_changes",
+        _mod,
+        "fetch_all_changes",
         AsyncMock(return_value=(changes, "new-token", None)),
     )
     monkeypatch.setattr(_mod, "categorize_change", lambda change: "modified")
@@ -447,11 +460,18 @@ async def test_gdrive_delta_sync_skips_over_quota(monkeypatch):
     mock_task_logger.log_task_progress = AsyncMock()
 
     _indexed, skipped = await _mod._index_with_delta_sync(
-        MagicMock(), session, MagicMock(),
-        _CONNECTOR_ID, _SEARCH_SPACE_ID, _USER_ID,
-        "folder-root", "start-token",
-        mock_task_logger, MagicMock(),
-        max_files=500, enable_summary=True,
+        MagicMock(),
+        session,
+        MagicMock(),
+        _CONNECTOR_ID,
+        _SEARCH_SPACE_ID,
+        _USER_ID,
+        "folder-root",
+        "start-token",
+        mock_task_logger,
+        MagicMock(),
+        max_files=500,
+        enable_summary=True,
     )
 
     call_files = download_mock.call_args[0][1]
@@ -511,9 +531,13 @@ async def _run_onedrive_selected(mocks, file_ids):
     from app.tasks.connector_indexers.onedrive_indexer import _index_selected_files
 
     return await _index_selected_files(
-        MagicMock(), mocks["session"], file_ids,
-        connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID,
-        user_id=_USER_ID, enable_summary=True,
+        MagicMock(),
+        mocks["session"],
+        file_ids,
+        connector_id=_CONNECTOR_ID,
+        search_space_id=_SEARCH_SPACE_ID,
+        user_id=_USER_ID,
+        enable_summary=True,
     )
 
 
@@ -524,7 +548,8 @@ async def test_onedrive_over_quota_rejected(onedrive_selected_mocks):
     m["fake_user"].pages_limit = 100
 
     m["get_file_results"]["big"] = (
-        _make_onedrive_file("big", "huge.pdf", size=500 * 1024), None,
+        _make_onedrive_file("big", "huge.pdf", size=500 * 1024),
+        None,
     )
 
     indexed, _skipped, errors = await _run_onedrive_selected(m, [("big", "huge.pdf")])
@@ -542,7 +567,8 @@ async def test_onedrive_deducts_after_success(onedrive_selected_mocks):
 
     for fid in ("f1", "f2"):
         m["get_file_results"][fid] = (
-            _make_onedrive_file(fid, f"{fid}.xyz", size=80 * 1024), None,
+            _make_onedrive_file(fid, f"{fid}.xyz", size=80 * 1024),
+            None,
         )
 
     m["download_and_index_mock"].return_value = (2, 0)
@@ -605,9 +631,13 @@ async def _run_dropbox_selected(mocks, file_paths):
     from app.tasks.connector_indexers.dropbox_indexer import _index_selected_files
 
     return await _index_selected_files(
-        MagicMock(), mocks["session"], file_paths,
-        connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID,
-        user_id=_USER_ID, enable_summary=True,
+        MagicMock(),
+        mocks["session"],
+        file_paths,
+        connector_id=_CONNECTOR_ID,
+        search_space_id=_SEARCH_SPACE_ID,
+        user_id=_USER_ID,
+        enable_summary=True,
     )
 
 
@@ -618,7 +648,8 @@ async def test_dropbox_over_quota_rejected(dropbox_selected_mocks):
     m["fake_user"].pages_limit = 100
 
     m["get_file_results"]["/huge.pdf"] = (
-        _make_dropbox_file("/huge.pdf", "huge.pdf", size=500 * 1024), None,
+        _make_dropbox_file("/huge.pdf", "huge.pdf", size=500 * 1024),
+        None,
     )
 
     indexed, _skipped, errors = await _run_dropbox_selected(
@@ -639,7 +670,8 @@ async def test_dropbox_deducts_after_success(dropbox_selected_mocks):
     for name in ("f1.xyz", "f2.xyz"):
         path = f"/{name}"
         m["get_file_results"][path] = (
-            _make_dropbox_file(path, name, size=80 * 1024), None,
+            _make_dropbox_file(path, name, size=80 * 1024),
+            None,
         )
 
     m["download_and_index_mock"].return_value = (2, 0)
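(Editorial note, not part of the patch.) The quota tests touched above all share one arrange/act/assert shape: set `pages_limit` on the fake user, register `(file, error)` tuples in `get_file_results`, stub the download-and-index result, then assert on the `(indexed, skipped, errors)` triple. A hypothetical extra case in that style follows, reusing only fixture and helper names that appear in this diff; the exact semantics of the stubbed values are inferred from the surrounding tests rather than confirmed:

```python
# Hypothetical sketch only. Assumes the module's existing imports, fixtures,
# and async test configuration; none of this is part of the patch.
async def test_gdrive_single_file_within_quota_is_indexed(gdrive_selected_mocks):
    m = gdrive_selected_mocks
    m["fake_user"].pages_limit = 100

    # (file metadata, error) tuple, matching the pattern used in the hunks above
    m["get_file_results"]["f1"] = (
        _make_gdrive_file("f1", "f1.xyz", size=80 * 1024),
        None,
    )
    # stubbed download/index outcome, read as (indexed, skipped) in the tests above
    m["download_and_index_mock"].return_value = (1, 0)

    indexed, _skipped, errors = await _run_gdrive_selected(m, [("f1", "f1.xyz")])

    assert indexed == 1
    assert not errors  # assumes the error container is falsy when nothing failed
```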