From ddfe60c2f06e8a944213405cac818433e63d20ee Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 4 Jun 2026 00:53:51 +0530 Subject: [PATCH] feat(tests): Update tests for summary-free indexing --- surfsense_backend/tests/e2e/fakes/llm.py | 8 +- surfsense_backend/tests/e2e/run_backend.py | 24 +++--- surfsense_backend/tests/e2e/run_celery.py | 24 +++--- .../tests/integration/conftest.py | 8 -- .../integration/document_upload/conftest.py | 6 -- .../adapters/test_file_upload_adapter.py | 53 ++++-------- .../test_calendar_pipeline.py | 8 +- .../indexing_pipeline/test_drive_pipeline.py | 8 +- .../test_dropbox_pipeline.py | 10 +-- .../indexing_pipeline/test_gmail_pipeline.py | 8 +- .../indexing_pipeline/test_index_batch.py | 8 +- .../indexing_pipeline/test_index_document.py | 82 +++++++------------ .../test_local_folder_pipeline.py | 3 +- .../test_onedrive_pipeline.py | 10 +-- .../test_prepare_for_indexing.py | 10 +-- .../new_chat/tools/test_resume_page_limits.py | 8 +- .../test_confluence_parallel.py | 12 +-- .../test_dropbox_parallel.py | 14 +--- .../test_google_drive_parallel.py | 13 +-- .../test_linear_parallel.py | 15 +--- .../test_notion_parallel.py | 15 +--- .../test_onedrive_parallel.py | 6 -- .../connector_indexers/test_page_limits.py | 9 +- .../test_connector_document.py | 1 - .../test_document_summarizer.py | 41 ---------- .../test_index_batch_parallel.py | 13 +-- 26 files changed, 123 insertions(+), 294 deletions(-) delete mode 100644 surfsense_backend/tests/unit/indexing_pipeline/test_document_summarizer.py diff --git a/surfsense_backend/tests/e2e/fakes/llm.py b/surfsense_backend/tests/e2e/fakes/llm.py index 9d2370e2c..8172dd86a 100644 --- a/surfsense_backend/tests/e2e/fakes/llm.py +++ b/surfsense_backend/tests/e2e/fakes/llm.py @@ -7,13 +7,13 @@ The production indexing pipeline summarizes documents with: summary_content = summary_result.content The `llm` parameter is supplied per-document by -`app.services.llm_service.get_user_long_context_llm`. We patch THAT +`app.services.llm_service.get_agent_llm`. We patch THAT function to return a langchain-native FakeListChatModel so the rest of the chain works unchanged. No real LLM provider package is touched. Run-backend / run-celery use unittest.mock.patch.start() to install this at every binding site (the source module + every consumer that -did `from app.services.llm_service import get_user_long_context_llm` +did `from app.services.llm_service import get_agent_llm` at module load time). """ @@ -42,7 +42,7 @@ def _make_fake_llm() -> FakeListChatModel: return fake -async def fake_get_user_long_context_llm(*args: Any, **kwargs: Any) -> Any: - """Drop-in replacement for app.services.llm_service.get_user_long_context_llm.""" +async def fake_get_agent_llm(*args: Any, **kwargs: Any) -> Any: + """Drop-in replacement for app.services.llm_service.get_agent_llm.""" logger.info("[fake-llm] returning FakeListChatModel for E2E indexing") return _make_fake_llm() diff --git a/surfsense_backend/tests/e2e/run_backend.py b/surfsense_backend/tests/e2e/run_backend.py index 5a787ac52..6781b1634 100644 --- a/surfsense_backend/tests/e2e/run_backend.py +++ b/surfsense_backend/tests/e2e/run_backend.py @@ -206,23 +206,23 @@ def _patch_llm_bindings() -> None: fake_create_chat_litellm_from_agent_config, fake_create_chat_litellm_from_config, ) - from tests.e2e.fakes.llm import fake_get_user_long_context_llm + from tests.e2e.fakes.llm import fake_get_agent_llm targets = [ - "app.services.llm_service.get_user_long_context_llm", - "app.tasks.connector_indexers.confluence_indexer.get_user_long_context_llm", - "app.tasks.connector_indexers.google_drive_indexer.get_user_long_context_llm", - "app.tasks.connector_indexers.google_gmail_indexer.get_user_long_context_llm", - "app.tasks.connector_indexers.notion_indexer.get_user_long_context_llm", - "app.tasks.connector_indexers.onedrive_indexer.get_user_long_context_llm", - "app.tasks.connector_indexers.dropbox_indexer.get_user_long_context_llm", - "app.tasks.connector_indexers.local_folder_indexer.get_user_long_context_llm", - "app.tasks.document_processors._save.get_user_long_context_llm", - "app.tasks.document_processors.markdown_processor.get_user_long_context_llm", + "app.services.llm_service.get_agent_llm", + "app.tasks.connector_indexers.confluence_indexer.get_agent_llm", + "app.tasks.connector_indexers.google_drive_indexer.get_agent_llm", + "app.tasks.connector_indexers.google_gmail_indexer.get_agent_llm", + "app.tasks.connector_indexers.notion_indexer.get_agent_llm", + "app.tasks.connector_indexers.onedrive_indexer.get_agent_llm", + "app.tasks.connector_indexers.dropbox_indexer.get_agent_llm", + "app.tasks.connector_indexers.local_folder_indexer.get_agent_llm", + "app.tasks.document_processors._save.get_agent_llm", + "app.tasks.document_processors.markdown_processor.get_agent_llm", ] for target in targets: try: - p = patch(target, fake_get_user_long_context_llm) + p = patch(target, fake_get_agent_llm) p.start() _active_patches.append(p) logger.info("[fake-llm] patched %s", target) diff --git a/surfsense_backend/tests/e2e/run_celery.py b/surfsense_backend/tests/e2e/run_celery.py index e4091d689..d0fbb4760 100644 --- a/surfsense_backend/tests/e2e/run_celery.py +++ b/surfsense_backend/tests/e2e/run_celery.py @@ -183,23 +183,23 @@ def _patch_llm_bindings() -> None: fake_create_chat_litellm_from_agent_config, fake_create_chat_litellm_from_config, ) - from tests.e2e.fakes.llm import fake_get_user_long_context_llm + from tests.e2e.fakes.llm import fake_get_agent_llm targets = [ - "app.services.llm_service.get_user_long_context_llm", - "app.tasks.connector_indexers.confluence_indexer.get_user_long_context_llm", - "app.tasks.connector_indexers.google_drive_indexer.get_user_long_context_llm", - "app.tasks.connector_indexers.google_gmail_indexer.get_user_long_context_llm", - "app.tasks.connector_indexers.notion_indexer.get_user_long_context_llm", - "app.tasks.connector_indexers.onedrive_indexer.get_user_long_context_llm", - "app.tasks.connector_indexers.dropbox_indexer.get_user_long_context_llm", - "app.tasks.connector_indexers.local_folder_indexer.get_user_long_context_llm", - "app.tasks.document_processors._save.get_user_long_context_llm", - "app.tasks.document_processors.markdown_processor.get_user_long_context_llm", + "app.services.llm_service.get_agent_llm", + "app.tasks.connector_indexers.confluence_indexer.get_agent_llm", + "app.tasks.connector_indexers.google_drive_indexer.get_agent_llm", + "app.tasks.connector_indexers.google_gmail_indexer.get_agent_llm", + "app.tasks.connector_indexers.notion_indexer.get_agent_llm", + "app.tasks.connector_indexers.onedrive_indexer.get_agent_llm", + "app.tasks.connector_indexers.dropbox_indexer.get_agent_llm", + "app.tasks.connector_indexers.local_folder_indexer.get_agent_llm", + "app.tasks.document_processors._save.get_agent_llm", + "app.tasks.document_processors.markdown_processor.get_agent_llm", ] for target in targets: try: - p = patch(target, fake_get_user_long_context_llm) + p = patch(target, fake_get_agent_llm) p.start() _active_patches.append(p) logger.info("[fake-llm] patched %s in celery worker", target) diff --git a/surfsense_backend/tests/integration/conftest.py b/surfsense_backend/tests/integration/conftest.py index e03101e63..9b8384303 100644 --- a/surfsense_backend/tests/integration/conftest.py +++ b/surfsense_backend/tests/integration/conftest.py @@ -126,20 +126,12 @@ async def db_search_space(db_session: AsyncSession, db_user: User) -> SearchSpac @pytest.fixture def patched_summarize(monkeypatch) -> AsyncMock: mock = AsyncMock(return_value="Mocked summary.") - monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.summarize_document", - mock, - ) return mock @pytest.fixture def patched_summarize_raises(monkeypatch) -> AsyncMock: mock = AsyncMock(side_effect=RuntimeError("LLM unavailable")) - monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.summarize_document", - mock, - ) return mock diff --git a/surfsense_backend/tests/integration/document_upload/conftest.py b/surfsense_backend/tests/integration/document_upload/conftest.py index ff44e471a..13e3ab59c 100644 --- a/surfsense_backend/tests/integration/document_upload/conftest.py +++ b/surfsense_backend/tests/integration/document_upload/conftest.py @@ -68,7 +68,6 @@ class InlineTaskDispatcher: filename: str, search_space_id: int, user_id: str, - should_summarize: bool = False, use_vision_llm: bool = False, processing_mode: str = "basic", ) -> None: @@ -83,7 +82,6 @@ class InlineTaskDispatcher: filename, search_space_id, user_id, - should_summarize=should_summarize, use_vision_llm=use_vision_llm, processing_mode=processing_mode, ) @@ -266,10 +264,6 @@ async def page_limits(): @pytest.fixture(autouse=True) def _mock_external_apis(monkeypatch): """Mock LLM, embedding, and chunking — these are external API boundaries.""" - monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.summarize_document", - AsyncMock(return_value="Mocked summary."), - ) monkeypatch.setattr( "app.indexing_pipeline.indexing_pipeline_service.embed_texts", MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]), diff --git a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py index 6bb1d2094..3f4c88a59 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py @@ -8,7 +8,7 @@ pytestmark = pytest.mark.integration @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_sets_status_ready(db_session, db_search_space, db_user, mocker): """Document status is READY after successful indexing.""" @@ -19,7 +19,6 @@ async def test_sets_status_ready(db_session, db_search_space, db_user, mocker): etl_service="UNSTRUCTURED", search_space_id=db_search_space.id, user_id=str(db_user.id), - llm=mocker.Mock(), ) result = await db_session.execute( @@ -31,7 +30,7 @@ async def test_sets_status_ready(db_session, db_search_space, db_user, mocker): @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_content_is_summary(db_session, db_search_space, db_user, mocker): """Document content is set to the LLM-generated summary.""" @@ -42,8 +41,6 @@ async def test_content_is_summary(db_session, db_search_space, db_user, mocker): etl_service="UNSTRUCTURED", search_space_id=db_search_space.id, user_id=str(db_user.id), - llm=mocker.Mock(), - should_summarize=True, ) result = await db_session.execute( @@ -55,7 +52,7 @@ async def test_content_is_summary(db_session, db_search_space, db_user, mocker): @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker): """Chunks derived from the source markdown are persisted in the DB.""" @@ -66,7 +63,6 @@ async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker etl_service="UNSTRUCTURED", search_space_id=db_search_space.id, user_id=str(db_user.id), - llm=mocker.Mock(), ) result = await db_session.execute( @@ -96,9 +92,7 @@ async def test_raises_on_indexing_failure(db_session, db_search_space, db_user, etl_service="UNSTRUCTURED", search_space_id=db_search_space.id, user_id=str(db_user.id), - llm=mocker.Mock(), - should_summarize=True, - ) + ) # --------------------------------------------------------------------------- @@ -107,7 +101,7 @@ async def test_raises_on_indexing_failure(db_session, db_search_space, db_user, @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_reindex_updates_content(db_session, db_search_space, db_user, mocker): """Document content is updated to the new summary after reindexing.""" @@ -118,7 +112,6 @@ async def test_reindex_updates_content(db_session, db_search_space, db_user, moc etl_service="UNSTRUCTURED", search_space_id=db_search_space.id, user_id=str(db_user.id), - llm=mocker.Mock(), ) result = await db_session.execute( @@ -129,14 +122,14 @@ async def test_reindex_updates_content(db_session, db_search_space, db_user, moc document.source_markdown = "## Edited\n\nNew content after user edit." await db_session.flush() - await adapter.reindex(document=document, llm=mocker.Mock()) + await adapter.reindex(document=document) await db_session.refresh(document) assert document.content == "Mocked summary." @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_reindex_updates_content_hash( db_session, db_search_space, db_user, mocker @@ -149,7 +142,6 @@ async def test_reindex_updates_content_hash( etl_service="UNSTRUCTURED", search_space_id=db_search_space.id, user_id=str(db_user.id), - llm=mocker.Mock(), ) result = await db_session.execute( @@ -161,14 +153,14 @@ async def test_reindex_updates_content_hash( document.source_markdown = "## Edited\n\nNew content after user edit." await db_session.flush() - await adapter.reindex(document=document, llm=mocker.Mock()) + await adapter.reindex(document=document) await db_session.refresh(document) assert document.content_hash != original_hash @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, mocker): """Document status is READY after successful reindexing.""" @@ -179,7 +171,6 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m etl_service="UNSTRUCTURED", search_space_id=db_search_space.id, user_id=str(db_user.id), - llm=mocker.Mock(), ) result = await db_session.execute( @@ -190,13 +181,13 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m document.source_markdown = "## Edited\n\nNew content after user edit." await db_session.flush() - await adapter.reindex(document=document, llm=mocker.Mock()) + await adapter.reindex(document=document) await db_session.refresh(document) assert DocumentStatus.is_state(document.status, DocumentStatus.READY) -@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts") +@pytest.mark.usefixtures("patched_embed_texts") async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker): """Reindexing replaces old chunks with new content rather than appending.""" mocker.patch( @@ -211,7 +202,6 @@ async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, moc etl_service="UNSTRUCTURED", search_space_id=db_search_space.id, user_id=str(db_user.id), - llm=mocker.Mock(), ) result = await db_session.execute( @@ -223,7 +213,7 @@ async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, moc document.source_markdown = "## Edited\n\nNew content after user edit." await db_session.flush() - await adapter.reindex(document=document, llm=mocker.Mock()) + await adapter.reindex(document=document) chunks_result = await db_session.execute( select(Chunk).filter(Chunk.document_id == document_id) @@ -235,7 +225,7 @@ async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, moc @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_reindex_clears_reindexing_flag( db_session, db_search_space, db_user, mocker @@ -248,7 +238,6 @@ async def test_reindex_clears_reindexing_flag( etl_service="UNSTRUCTURED", search_space_id=db_search_space.id, user_id=str(db_user.id), - llm=mocker.Mock(), ) result = await db_session.execute( @@ -260,7 +249,7 @@ async def test_reindex_clears_reindexing_flag( document.content_needs_reindexing = True await db_session.flush() - await adapter.reindex(document=document, llm=mocker.Mock()) + await adapter.reindex(document=document) await db_session.refresh(document) assert document.content_needs_reindexing is False @@ -269,10 +258,6 @@ async def test_reindex_clears_reindexing_flag( @pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text") async def test_reindex_raises_on_failure(db_session, db_search_space, db_user, mocker): """RuntimeError is raised when reindexing fails so the caller can handle it.""" - mocker.patch( - "app.indexing_pipeline.indexing_pipeline_service.summarize_document", - return_value="Mocked summary.", - ) adapter = UploadDocumentAdapter(db_session) await adapter.index( @@ -281,7 +266,6 @@ async def test_reindex_raises_on_failure(db_session, db_search_space, db_user, m etl_service="UNSTRUCTURED", search_space_id=db_search_space.id, user_id=str(db_user.id), - llm=mocker.Mock(), ) result = await db_session.execute( @@ -292,13 +276,8 @@ async def test_reindex_raises_on_failure(db_session, db_search_space, db_user, m document.source_markdown = "## Edited\n\nNew content after user edit." await db_session.flush() - mocker.patch( - "app.indexing_pipeline.indexing_pipeline_service.summarize_document", - side_effect=RuntimeError("LLM unavailable"), - ) - with pytest.raises(RuntimeError, match=r"Embedding failed|Reindexing failed"): - await adapter.reindex(document=document, llm=mocker.Mock()) + await adapter.reindex(document=document) async def test_reindex_raises_on_empty_source_markdown( @@ -323,4 +302,4 @@ async def test_reindex_raises_on_empty_source_markdown( adapter = UploadDocumentAdapter(db_session) with pytest.raises(RuntimeError, match="no source_markdown"): - await adapter.reindex(document=document, llm=mocker.Mock()) + await adapter.reindex(document=document) diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_calendar_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_calendar_pipeline.py index b2dd13e57..95afee5ef 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_calendar_pipeline.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_calendar_pipeline.py @@ -25,8 +25,6 @@ def _cal_doc( search_space_id=search_space_id, connector_id=connector_id, created_by_id=user_id, - should_summarize=True, - fallback_summary=f"Calendar: Event {unique_id}", metadata={ "event_id": unique_id, "start_time": "2025-01-15T10:00:00", @@ -37,7 +35,7 @@ def _cal_doc( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_calendar_pipeline_creates_ready_document( db_session, db_search_space, db_connector, db_user, mocker @@ -55,7 +53,7 @@ async def test_calendar_pipeline_creates_ready_document( prepared = await service.prepare_for_indexing([doc]) assert len(prepared) == 1 - await service.index(prepared[0], doc, llm=mocker.Mock()) + await service.index(prepared[0], doc) result = await db_session.execute( select(Document).filter(Document.search_space_id == space_id) @@ -68,7 +66,7 @@ async def test_calendar_pipeline_creates_ready_document( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_calendar_legacy_doc_migrated( db_session, db_search_space, db_connector, db_user, mocker diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_drive_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_drive_pipeline.py index d9900ea87..4e8b8a4a2 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_drive_pipeline.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_drive_pipeline.py @@ -25,8 +25,6 @@ def _drive_doc( search_space_id=search_space_id, connector_id=connector_id, created_by_id=user_id, - should_summarize=True, - fallback_summary=f"File: {unique_id}.pdf", metadata={ "google_drive_file_id": unique_id, "google_drive_file_name": f"{unique_id}.pdf", @@ -36,7 +34,7 @@ def _drive_doc( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_drive_pipeline_creates_ready_document( db_session, db_search_space, db_connector, db_user, mocker @@ -54,7 +52,7 @@ async def test_drive_pipeline_creates_ready_document( prepared = await service.prepare_for_indexing([doc]) assert len(prepared) == 1 - await service.index(prepared[0], doc, llm=mocker.Mock()) + await service.index(prepared[0], doc) result = await db_session.execute( select(Document).filter(Document.search_space_id == space_id) @@ -67,7 +65,7 @@ async def test_drive_pipeline_creates_ready_document( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_drive_legacy_doc_migrated( db_session, db_search_space, db_connector, db_user, mocker diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_dropbox_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_dropbox_pipeline.py index 83e4f7bb4..d2a8cefc5 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_dropbox_pipeline.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_dropbox_pipeline.py @@ -24,8 +24,6 @@ def _dropbox_doc( search_space_id=search_space_id, connector_id=connector_id, created_by_id=user_id, - should_summarize=True, - fallback_summary=f"File: {unique_id}.docx", metadata={ "dropbox_file_id": unique_id, "dropbox_file_name": f"{unique_id}.docx", @@ -35,7 +33,7 @@ def _dropbox_doc( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_dropbox_pipeline_creates_ready_document( db_session, db_search_space, db_connector, db_user, mocker @@ -53,7 +51,7 @@ async def test_dropbox_pipeline_creates_ready_document( prepared = await service.prepare_for_indexing([doc]) assert len(prepared) == 1 - await service.index(prepared[0], doc, llm=mocker.Mock()) + await service.index(prepared[0], doc) result = await db_session.execute( select(Document).filter(Document.search_space_id == space_id) @@ -66,7 +64,7 @@ async def test_dropbox_pipeline_creates_ready_document( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_dropbox_duplicate_content_skipped( db_session, db_search_space, db_connector, db_user, mocker @@ -86,7 +84,7 @@ async def test_dropbox_duplicate_content_skipped( prepared = await service.prepare_for_indexing([doc]) assert len(prepared) == 1 - await service.index(prepared[0], doc, llm=mocker.Mock()) + await service.index(prepared[0], doc) result = await db_session.execute( select(Document).filter(Document.search_space_id == space_id) diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_gmail_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_gmail_pipeline.py index b74d092c0..5b2efa1aa 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_gmail_pipeline.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_gmail_pipeline.py @@ -28,8 +28,6 @@ def _gmail_doc( search_space_id=search_space_id, connector_id=connector_id, created_by_id=user_id, - should_summarize=True, - fallback_summary=f"Gmail: Subject for {unique_id}", metadata={ "message_id": unique_id, "from": "sender@example.com", @@ -39,7 +37,7 @@ def _gmail_doc( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_gmail_pipeline_creates_ready_document( db_session, db_search_space, db_connector, db_user, mocker @@ -57,7 +55,7 @@ async def test_gmail_pipeline_creates_ready_document( prepared = await service.prepare_for_indexing([doc]) assert len(prepared) == 1 - await service.index(prepared[0], doc, llm=mocker.Mock()) + await service.index(prepared[0], doc) result = await db_session.execute( select(Document).filter(Document.search_space_id == space_id) @@ -71,7 +69,7 @@ async def test_gmail_pipeline_creates_ready_document( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_gmail_legacy_doc_migrated_then_reused( db_session, db_search_space, db_connector, db_user, mocker diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_index_batch.py b/surfsense_backend/tests/integration/indexing_pipeline/test_index_batch.py index 847f7592c..59b7c8814 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_index_batch.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_index_batch.py @@ -10,7 +10,7 @@ pytestmark = pytest.mark.integration @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_index_batch_creates_ready_documents( db_session, db_search_space, make_connector_document, mocker @@ -33,7 +33,7 @@ async def test_index_batch_creates_ready_documents( ] service = IndexingPipelineService(session=db_session) - results = await service.index_batch(docs, llm=mocker.Mock()) + results = await service.index_batch(docs) assert len(results) == 2 @@ -50,10 +50,10 @@ async def test_index_batch_creates_ready_documents( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_index_batch_empty_returns_empty(db_session, mocker): """index_batch with empty input returns an empty list.""" service = IndexingPipelineService(session=db_session) - results = await service.index_batch([], llm=mocker.Mock()) + results = await service.index_batch([]) assert results == [] diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py b/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py index a82148f96..ff0578720 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py @@ -10,9 +10,7 @@ _EMBEDDING_DIM = app_config.embedding_model_instance.dimension pytestmark = pytest.mark.integration -@pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" -) +@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text") async def test_sets_status_ready( db_session, db_search_space, @@ -27,7 +25,7 @@ async def test_sets_status_ready( document = prepared[0] document_id = document.id - await service.index(document, connector_doc, llm=mocker.Mock()) + await service.index(document, connector_doc) result = await db_session.execute( select(Document).filter(Document.id == document_id) @@ -37,16 +35,14 @@ async def test_sets_status_ready( assert DocumentStatus.is_state(reloaded.status, DocumentStatus.READY) -@pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" -) -async def test_content_is_summary_when_should_summarize_true( +@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text") +async def test_content_is_source_markdown_by_default( db_session, db_search_space, make_connector_document, mocker, ): - """Document content is set to the LLM-generated summary when should_summarize=True.""" + """Document content is set to source_markdown by default.""" connector_doc = make_connector_document(search_space_id=db_search_space.id) service = IndexingPipelineService(session=db_session) @@ -54,28 +50,25 @@ async def test_content_is_summary_when_should_summarize_true( document = prepared[0] document_id = document.id - await service.index(document, connector_doc, llm=mocker.Mock()) + await service.index(document, connector_doc) result = await db_session.execute( select(Document).filter(Document.id == document_id) ) reloaded = result.scalars().first() - assert reloaded.content == "Mocked summary." + assert reloaded.content == connector_doc.source_markdown -@pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" -) -async def test_content_is_source_markdown_when_should_summarize_false( +@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text") +async def test_content_is_source_markdown_when_custom_content( db_session, db_search_space, make_connector_document, ): - """Document content is set to source_markdown verbatim when should_summarize=False.""" + """Document content is set to source_markdown verbatim.""" connector_doc = make_connector_document( search_space_id=db_search_space.id, - should_summarize=False, source_markdown="## Raw content", ) service = IndexingPipelineService(session=db_session) @@ -84,7 +77,7 @@ async def test_content_is_source_markdown_when_should_summarize_false( document = prepared[0] document_id = document.id - await service.index(document, connector_doc, llm=None) + await service.index(document, connector_doc) result = await db_session.execute( select(Document).filter(Document.id == document_id) @@ -94,9 +87,7 @@ async def test_content_is_source_markdown_when_should_summarize_false( assert reloaded.content == "## Raw content" -@pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" -) +@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text") async def test_chunks_written_to_db( db_session, db_search_space, @@ -111,7 +102,7 @@ async def test_chunks_written_to_db( document = prepared[0] document_id = document.id - await service.index(document, connector_doc, llm=mocker.Mock()) + await service.index(document, connector_doc) result = await db_session.execute( select(Chunk).filter(Chunk.document_id == document_id) @@ -122,9 +113,7 @@ async def test_chunks_written_to_db( assert chunks[0].content == "Test chunk content." -@pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" -) +@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text") async def test_embedding_written_to_db( db_session, db_search_space, @@ -139,7 +128,7 @@ async def test_embedding_written_to_db( document = prepared[0] document_id = document.id - await service.index(document, connector_doc, llm=mocker.Mock()) + await service.index(document, connector_doc) result = await db_session.execute( select(Document).filter(Document.id == document_id) @@ -150,9 +139,7 @@ async def test_embedding_written_to_db( assert len(reloaded.embedding) == _EMBEDDING_DIM -@pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" -) +@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text") async def test_updated_at_advances_after_indexing( db_session, db_search_space, @@ -172,7 +159,7 @@ async def test_updated_at_advances_after_indexing( ) updated_at_pending = result.scalars().first().updated_at - await service.index(document, connector_doc, llm=mocker.Mock()) + await service.index(document, connector_doc) result = await db_session.execute( select(Document).filter(Document.id == document_id) @@ -182,18 +169,15 @@ async def test_updated_at_advances_after_indexing( assert updated_at_ready > updated_at_pending -@pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" -) +@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text") async def test_no_llm_falls_back_to_source_markdown( db_session, db_search_space, make_connector_document, ): - """When llm=None and no fallback_summary, content falls back to source_markdown.""" + """Content stays deterministic source markdown without an LLM.""" connector_doc = make_connector_document( search_space_id=db_search_space.id, - should_summarize=True, source_markdown="## Fallback content", ) service = IndexingPipelineService(session=db_session) @@ -202,7 +186,7 @@ async def test_no_llm_falls_back_to_source_markdown( document = prepared[0] document_id = document.id - await service.index(document, connector_doc, llm=None) + await service.index(document, connector_doc) result = await db_session.execute( select(Document).filter(Document.id == document_id) @@ -213,27 +197,23 @@ async def test_no_llm_falls_back_to_source_markdown( assert reloaded.content == "## Fallback content" -@pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" -) -async def test_fallback_summary_used_when_llm_unavailable( +@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text") +async def test_source_markdown_used_without_preview( db_session, db_search_space, make_connector_document, ): - """fallback_summary is used as content when llm=None and should_summarize=True.""" + """Source markdown is used without fallback preview fields.""" connector_doc = make_connector_document( search_space_id=db_search_space.id, - should_summarize=True, source_markdown="## Full raw content", - fallback_summary="Short pre-built summary.", ) service = IndexingPipelineService(session=db_session) prepared = await service.prepare_for_indexing([connector_doc]) document_id = prepared[0].id - await service.index(prepared[0], connector_doc, llm=None) + await service.index(prepared[0], connector_doc) result = await db_session.execute( select(Document).filter(Document.id == document_id) @@ -241,12 +221,10 @@ async def test_fallback_summary_used_when_llm_unavailable( reloaded = result.scalars().first() assert DocumentStatus.is_state(reloaded.status, DocumentStatus.READY) - assert reloaded.content == "Short pre-built summary." + assert reloaded.content == "## Full raw content" -@pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" -) +@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text") async def test_reindex_replaces_old_chunks( db_session, db_search_space, @@ -264,14 +242,14 @@ async def test_reindex_replaces_old_chunks( document = prepared[0] document_id = document.id - await service.index(document, connector_doc, llm=mocker.Mock()) + await service.index(document, connector_doc) updated_doc = make_connector_document( search_space_id=db_search_space.id, source_markdown="## v2", ) re_prepared = await service.prepare_for_indexing([updated_doc]) - await service.index(re_prepared[0], updated_doc, llm=mocker.Mock()) + await service.index(re_prepared[0], updated_doc) result = await db_session.execute( select(Chunk).filter(Chunk.document_id == document_id) @@ -298,7 +276,7 @@ async def test_llm_error_sets_status_failed( document = prepared[0] document_id = document.id - await service.index(document, connector_doc, llm=mocker.Mock()) + await service.index(document, connector_doc) result = await db_session.execute( select(Document).filter(Document.id == document_id) @@ -325,7 +303,7 @@ async def test_llm_error_leaves_no_partial_data( document = prepared[0] document_id = document.id - await service.index(document, connector_doc, llm=mocker.Mock()) + await service.index(document, connector_doc) result = await db_session.execute( select(Document).filter(Document.id == document_id) diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py index 4dc5742f7..4070daa80 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py @@ -21,7 +21,6 @@ from app.db import ( pytestmark = pytest.mark.integration UNIFIED_FIXTURES = ( - "patched_summarize", "patched_embed_texts", "patched_chunk_text", ) @@ -787,7 +786,7 @@ class TestPipelineIntegration: assert len(prepared) == 1 db_doc = prepared[0] - result = await service.index(db_doc, doc, llm=mocker.Mock()) + result = await service.index(db_doc, doc) assert result is not None docs = ( diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_onedrive_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_onedrive_pipeline.py index 541e3a38e..41ac6894b 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_onedrive_pipeline.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_onedrive_pipeline.py @@ -24,8 +24,6 @@ def _onedrive_doc( search_space_id=search_space_id, connector_id=connector_id, created_by_id=user_id, - should_summarize=True, - fallback_summary=f"File: {unique_id}.docx", metadata={ "onedrive_file_id": unique_id, "onedrive_file_name": f"{unique_id}.docx", @@ -35,7 +33,7 @@ def _onedrive_doc( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_onedrive_pipeline_creates_ready_document( db_session, db_search_space, db_connector, db_user, mocker @@ -53,7 +51,7 @@ async def test_onedrive_pipeline_creates_ready_document( prepared = await service.prepare_for_indexing([doc]) assert len(prepared) == 1 - await service.index(prepared[0], doc, llm=mocker.Mock()) + await service.index(prepared[0], doc) result = await db_session.execute( select(Document).filter(Document.search_space_id == space_id) @@ -66,7 +64,7 @@ async def test_onedrive_pipeline_creates_ready_document( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_onedrive_duplicate_content_skipped( db_session, db_search_space, db_connector, db_user, mocker @@ -86,7 +84,7 @@ async def test_onedrive_duplicate_content_skipped( prepared = await service.prepare_for_indexing([doc]) assert len(prepared) == 1 - await service.index(prepared[0], doc, llm=mocker.Mock()) + await service.index(prepared[0], doc) result = await db_session.execute( select(Document).filter(Document.search_space_id == space_id) diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_prepare_for_indexing.py b/surfsense_backend/tests/integration/indexing_pipeline/test_prepare_for_indexing.py index 776180b9a..9c8a3203b 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_prepare_for_indexing.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_prepare_for_indexing.py @@ -33,7 +33,7 @@ async def test_new_document_is_persisted_with_pending_status( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_unchanged_ready_document_is_skipped( db_session, @@ -47,7 +47,7 @@ async def test_unchanged_ready_document_is_skipped( # Index fully so the document reaches ready state prepared = await service.prepare_for_indexing([doc]) - await service.index(prepared[0], doc, llm=mocker.Mock()) + await service.index(prepared[0], doc) # Same content on the next run — a ready document must be skipped results = await service.prepare_for_indexing([doc]) @@ -56,7 +56,7 @@ async def test_unchanged_ready_document_is_skipped( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" +"patched_embed_texts", "patched_chunk_text" ) async def test_title_only_change_updates_title_in_db( db_session, @@ -72,7 +72,7 @@ async def test_title_only_change_updates_title_in_db( prepared = await service.prepare_for_indexing([original]) document_id = prepared[0].id - await service.index(prepared[0], original, llm=mocker.Mock()) + await service.index(prepared[0], original) renamed = make_connector_document( search_space_id=db_search_space.id, title="Updated Title" @@ -354,7 +354,7 @@ async def test_failed_document_with_unchanged_content_is_requeued( # First run: document is created and indexing crashes → status = failed prepared = await service.prepare_for_indexing([doc]) document_id = prepared[0].id - await service.index(prepared[0], doc, llm=mocker.Mock()) + await service.index(prepared[0], doc) result = await db_session.execute( select(Document).filter(Document.id == document_id) diff --git a/surfsense_backend/tests/unit/agents/new_chat/tools/test_resume_page_limits.py b/surfsense_backend/tests/unit/agents/new_chat/tools/test_resume_page_limits.py index 4f93ad732..f9212a45c 100644 --- a/surfsense_backend/tests/unit/agents/new_chat/tools/test_resume_page_limits.py +++ b/surfsense_backend/tests/unit/agents/new_chat/tools/test_resume_page_limits.py @@ -101,7 +101,7 @@ async def test_generate_resume_defaults_to_one_page_target(monkeypatch) -> None: llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=_llm_invoke)) monkeypatch.setattr( resume_tool, - "get_document_summary_llm", + "get_agent_llm", AsyncMock(return_value=llm), ) monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf") @@ -130,7 +130,7 @@ async def test_generate_resume_compresses_when_over_limit(monkeypatch) -> None: llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=responses)) monkeypatch.setattr( resume_tool, - "get_document_summary_llm", + "get_agent_llm", AsyncMock(return_value=llm), ) monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf") @@ -165,7 +165,7 @@ async def test_generate_resume_returns_ready_when_target_not_met(monkeypatch) -> llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=responses)) monkeypatch.setattr( resume_tool, - "get_document_summary_llm", + "get_agent_llm", AsyncMock(return_value=llm), ) monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf") @@ -198,7 +198,7 @@ async def test_generate_resume_fails_when_hard_limit_exceeded(monkeypatch) -> No llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=responses)) monkeypatch.setattr( resume_tool, - "get_document_summary_llm", + "get_agent_llm", AsyncMock(return_value=llm), ) monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf") diff --git a/surfsense_backend/tests/unit/connector_indexers/test_confluence_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_confluence_parallel.py index a8cf05269..daf6ab985 100644 --- a/surfsense_backend/tests/unit/connector_indexers/test_confluence_parallel.py +++ b/surfsense_backend/tests/unit/connector_indexers/test_confluence_parallel.py @@ -71,7 +71,6 @@ async def test_build_connector_doc_produces_correct_fields(): connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) assert doc.title == "Engineering Handbook" @@ -81,7 +80,6 @@ async def test_build_connector_doc_produces_correct_fields(): assert doc.search_space_id == _SEARCH_SPACE_ID assert doc.connector_id == _CONNECTOR_ID assert doc.created_by_id == _USER_ID - assert doc.should_summarize is True assert doc.metadata["page_id"] == "abc-123" assert doc.metadata["page_title"] == "Engineering Handbook" assert doc.metadata["space_id"] == "ENG" @@ -89,9 +87,8 @@ async def test_build_connector_doc_produces_correct_fields(): assert doc.metadata["connector_id"] == _CONNECTOR_ID assert doc.metadata["document_type"] == "Confluence Page" assert doc.metadata["connector_type"] == "Confluence" - assert doc.fallback_summary is not None - assert "Engineering Handbook" in doc.fallback_summary - assert markdown in doc.fallback_summary + assert "Engineering Handbook" in doc.deterministic_preview + assert markdown in doc.deterministic_preview async def test_build_connector_doc_summary_disabled(): @@ -101,9 +98,7 @@ async def test_build_connector_doc_summary_disabled(): connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=False, ) - assert doc.should_summarize is False # --------------------------------------------------------------------------- @@ -111,10 +106,9 @@ async def test_build_connector_doc_summary_disabled(): # --------------------------------------------------------------------------- -def _mock_connector(enable_summary: bool = True): +def _mock_connector(): c = MagicMock() c.config = {"access_token": "tok"} - c.enable_summary = enable_summary c.last_indexed_at = None return c diff --git a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py index 9ba87207a..694caed06 100644 --- a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py +++ b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py @@ -71,7 +71,6 @@ async def test_single_file_returns_one_connector_document( connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) assert len(docs) == 1 @@ -97,7 +96,6 @@ async def test_multiple_files_all_produce_documents( connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) assert len(docs) == 3 @@ -125,7 +123,6 @@ async def test_one_download_exception_does_not_block_others( connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) assert len(docs) == 2 @@ -152,7 +149,6 @@ async def test_etl_error_counts_as_download_failure( connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) assert len(docs) == 1 @@ -191,7 +187,6 @@ async def test_concurrency_bounded_by_semaphore( connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, max_concurrency=2, ) @@ -231,7 +226,6 @@ async def test_heartbeat_fires_during_parallel_downloads( connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, on_heartbeat=_on_heartbeat, ) @@ -324,7 +318,6 @@ async def _run_full_scan(mocks, monkeypatch, page_files, *, max_files=500): mocks["task_logger"], mocks["log_entry"], max_files, - enable_summary=True, ) @@ -434,7 +427,6 @@ async def _run_selected(mocks, file_tuples): connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) @@ -569,7 +561,6 @@ async def test_delta_sync_deletions_call_remove_document(monkeypatch): mock_task_logger, MagicMock(), max_files=500, - enable_summary=True, ) assert sorted(remove_calls) == ["id:del1", "id:del2"] @@ -608,7 +599,6 @@ async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch): mock_task_logger, MagicMock(), max_files=500, - enable_summary=True, ) assert indexed == 2 @@ -670,7 +660,6 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch): mock_task_logger, MagicMock(), max_files=500, - enable_summary=True, ) assert sorted(remove_calls) == ["id:del1", "id:del2"] @@ -704,7 +693,6 @@ async def test_delta_sync_returns_new_cursor(monkeypatch): mock_task_logger, MagicMock(), max_files=500, - enable_summary=True, ) assert cursor == "brand-new-cursor-xyz" @@ -725,7 +713,7 @@ def orchestrator_mocks(monkeypatch): mock_connector = MagicMock() mock_connector.config = {"_token_encrypted": False} mock_connector.last_indexed_at = None - mock_connector.enable_summary = True + mock_connector.enable_vision_llm = True monkeypatch.setattr( _mod, diff --git a/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py index 7e968514c..4e67236c3 100644 --- a/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py +++ b/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py @@ -66,7 +66,6 @@ async def test_single_file_returns_one_connector_document( connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) assert len(docs) == 1 @@ -91,7 +90,6 @@ async def test_multiple_files_all_produce_documents( connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) assert len(docs) == 3 @@ -119,7 +117,6 @@ async def test_one_download_exception_does_not_block_others( connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) assert len(docs) == 2 @@ -146,7 +143,6 @@ async def test_etl_error_counts_as_download_failure( connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) assert len(docs) == 1 @@ -186,7 +182,6 @@ async def test_concurrency_bounded_by_semaphore( connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, max_concurrency=2, ) @@ -226,7 +221,6 @@ async def test_heartbeat_fires_during_parallel_downloads( connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, on_heartbeat=_on_heartbeat, ) @@ -302,7 +296,7 @@ def full_scan_mocks(mock_drive_client, monkeypatch): monkeypatch.setattr( _mod, - "get_user_long_context_llm", + "get_agent_llm", AsyncMock(return_value=MagicMock()), ) @@ -333,7 +327,6 @@ async def _run_full_scan(mocks, *, max_files=500, include_subfolders=False): mocks["log_entry"], max_files, include_subfolders=include_subfolders, - enable_summary=True, ) @@ -489,7 +482,7 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch): ) monkeypatch.setattr( _mod, - "get_user_long_context_llm", + "get_agent_llm", AsyncMock(return_value=MagicMock()), ) @@ -509,7 +502,6 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch): mock_task_logger, MagicMock(), max_files=500, - enable_summary=True, ) assert sorted(remove_calls) == ["del1", "del2", "trash1"] @@ -577,7 +569,6 @@ async def _run_selected(mocks, file_ids): connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) diff --git a/surfsense_backend/tests/unit/connector_indexers/test_linear_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_linear_parallel.py index ef17aae06..a4702a5ff 100644 --- a/surfsense_backend/tests/unit/connector_indexers/test_linear_parallel.py +++ b/surfsense_backend/tests/unit/connector_indexers/test_linear_parallel.py @@ -70,7 +70,6 @@ async def test_build_connector_doc_produces_correct_fields(): connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) assert doc.title == "ENG-42: Fix login bug" @@ -80,7 +79,6 @@ async def test_build_connector_doc_produces_correct_fields(): assert doc.search_space_id == _SEARCH_SPACE_ID assert doc.connector_id == _CONNECTOR_ID assert doc.created_by_id == _USER_ID - assert doc.should_summarize is True assert doc.metadata["issue_id"] == "abc-123" assert doc.metadata["issue_identifier"] == "ENG-42" assert doc.metadata["issue_title"] == "Fix login bug" @@ -90,13 +88,12 @@ async def test_build_connector_doc_produces_correct_fields(): assert doc.metadata["connector_id"] == _CONNECTOR_ID assert doc.metadata["document_type"] == "Linear Issue" assert doc.metadata["connector_type"] == "Linear" - assert doc.fallback_summary is not None - assert "ENG-42" in doc.fallback_summary - assert markdown in doc.fallback_summary + assert "ENG-42" in doc.deterministic_preview + assert markdown in doc.deterministic_preview async def test_build_connector_doc_summary_disabled(): - """When enable_summary is False, should_summarize is False.""" + """When enable_vision_llm is False, deterministic_content is False.""" doc = _build_connector_doc( _make_issue(), _make_formatted_issue(), @@ -104,21 +101,17 @@ async def test_build_connector_doc_summary_disabled(): connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=False, ) - assert doc.should_summarize is False - # --------------------------------------------------------------------------- # Shared fixtures for Slices 2-6 # --------------------------------------------------------------------------- -def _mock_connector(enable_summary: bool = True): +def _mock_connector(): c = MagicMock() c.config = {"access_token": "tok"} - c.enable_summary = enable_summary c.last_indexed_at = None return c diff --git a/surfsense_backend/tests/unit/connector_indexers/test_notion_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_notion_parallel.py index 651524015..0ad1f2178 100644 --- a/surfsense_backend/tests/unit/connector_indexers/test_notion_parallel.py +++ b/surfsense_backend/tests/unit/connector_indexers/test_notion_parallel.py @@ -41,7 +41,6 @@ async def test_build_connector_doc_produces_correct_fields(): connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) assert doc.title == "My Notion Page" @@ -51,40 +50,34 @@ async def test_build_connector_doc_produces_correct_fields(): assert doc.search_space_id == _SEARCH_SPACE_ID assert doc.connector_id == _CONNECTOR_ID assert doc.created_by_id == _USER_ID - assert doc.should_summarize is True assert doc.metadata["page_title"] == "My Notion Page" assert doc.metadata["page_id"] == "abc-123" assert doc.metadata["connector_id"] == _CONNECTOR_ID assert doc.metadata["document_type"] == "Notion Page" assert doc.metadata["connector_type"] == "Notion" - assert doc.fallback_summary is not None - assert "My Notion Page" in doc.fallback_summary - assert markdown in doc.fallback_summary + assert "My Notion Page" in doc.deterministic_preview + assert markdown in doc.deterministic_preview async def test_build_connector_doc_summary_disabled(): - """When enable_summary is False, should_summarize is False.""" + """When enable_vision_llm is False, deterministic_content is False.""" doc = _build_connector_doc( _make_page(), "# content", connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=False, ) - assert doc.should_summarize is False - # --------------------------------------------------------------------------- # Shared fixtures for Slices 2-7 (full index_notion_pages tests) # --------------------------------------------------------------------------- -def _mock_connector(enable_summary: bool = True): +def _mock_connector(): c = MagicMock() c.config = {"access_token": "tok"} - c.enable_summary = enable_summary c.last_indexed_at = None return c diff --git a/surfsense_backend/tests/unit/connector_indexers/test_onedrive_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_onedrive_parallel.py index 396d79e73..eb1451938 100644 --- a/surfsense_backend/tests/unit/connector_indexers/test_onedrive_parallel.py +++ b/surfsense_backend/tests/unit/connector_indexers/test_onedrive_parallel.py @@ -65,7 +65,6 @@ async def test_single_file_returns_one_connector_document( connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) assert len(docs) == 1 @@ -91,7 +90,6 @@ async def test_multiple_files_all_produce_documents( connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) assert len(docs) == 3 @@ -119,7 +117,6 @@ async def test_one_download_exception_does_not_block_others( connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) assert len(docs) == 2 @@ -146,7 +143,6 @@ async def test_etl_error_counts_as_download_failure( connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) assert len(docs) == 1 @@ -185,7 +181,6 @@ async def test_concurrency_bounded_by_semaphore( connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, max_concurrency=2, ) @@ -225,7 +220,6 @@ async def test_heartbeat_fires_during_parallel_downloads( connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, on_heartbeat=_on_heartbeat, ) diff --git a/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py b/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py index 573ee43d8..0080b639e 100644 --- a/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py +++ b/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py @@ -180,7 +180,6 @@ async def _run_gdrive_selected(mocks, file_ids): connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) @@ -337,7 +336,7 @@ def gdrive_full_scan_mocks(monkeypatch): _mod, "IndexingPipelineService", MagicMock(return_value=pipeline_mock) ) monkeypatch.setattr( - _mod, "get_user_long_context_llm", AsyncMock(return_value=MagicMock()) + _mod, "get_agent_llm", AsyncMock(return_value=MagicMock()) ) return { @@ -366,7 +365,6 @@ async def _run_gdrive_full_scan(mocks, max_files=500): MagicMock(), max_files, include_subfolders=False, - enable_summary=True, ) @@ -455,7 +453,7 @@ async def test_gdrive_delta_sync_skips_over_quota(monkeypatch): _mod, "IndexingPipelineService", MagicMock(return_value=pipeline_mock) ) monkeypatch.setattr( - _mod, "get_user_long_context_llm", AsyncMock(return_value=MagicMock()) + _mod, "get_agent_llm", AsyncMock(return_value=MagicMock()) ) mock_task_logger = MagicMock() @@ -473,7 +471,6 @@ async def test_gdrive_delta_sync_skips_over_quota(monkeypatch): mock_task_logger, MagicMock(), max_files=500, - enable_summary=True, ) call_files = download_mock.call_args[0][1] @@ -539,7 +536,6 @@ async def _run_onedrive_selected(mocks, file_ids): connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) @@ -641,7 +637,6 @@ async def _run_dropbox_selected(mocks, file_paths): connector_id=_CONNECTOR_ID, search_space_id=_SEARCH_SPACE_ID, user_id=_USER_ID, - enable_summary=True, ) diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_connector_document.py b/surfsense_backend/tests/unit/indexing_pipeline/test_connector_document.py index 2136f2152..f85c632ef 100644 --- a/surfsense_backend/tests/unit/indexing_pipeline/test_connector_document.py +++ b/surfsense_backend/tests/unit/indexing_pipeline/test_connector_document.py @@ -18,7 +18,6 @@ def test_valid_document_created_with_required_fields(): connector_id=42, created_by_id="00000000-0000-0000-0000-000000000001", ) - assert doc.should_summarize is True assert doc.should_use_code_chunker is False assert doc.metadata == {} assert doc.connector_id == 42 diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_document_summarizer.py b/surfsense_backend/tests/unit/indexing_pipeline/test_document_summarizer.py deleted file mode 100644 index eee32357f..000000000 --- a/surfsense_backend/tests/unit/indexing_pipeline/test_document_summarizer.py +++ /dev/null @@ -1,41 +0,0 @@ -from unittest.mock import MagicMock - -import pytest - -from app.indexing_pipeline.document_summarizer import summarize_document - -pytestmark = pytest.mark.unit - - -@pytest.mark.usefixtures("patched_summarizer_chain") -async def test_without_metadata_returns_raw_summary(): - """Summarizer returns the LLM output directly when no metadata is provided.""" - result = await summarize_document("# Content", llm=MagicMock(model="gpt-4")) - - assert result == "The summary." - - -@pytest.mark.usefixtures("patched_summarizer_chain") -async def test_with_metadata_includes_metadata_values_in_output(): - """Non-empty metadata values are prepended to the summary output.""" - result = await summarize_document( - "# Content", - llm=MagicMock(model="gpt-4"), - metadata={"author": "Alice", "source": "Notion"}, - ) - - assert "Alice" in result - assert "Notion" in result - - -@pytest.mark.usefixtures("patched_summarizer_chain") -async def test_with_metadata_omits_empty_fields_from_output(): - """Empty metadata fields are omitted from the summary output.""" - result = await summarize_document( - "# Content", - llm=MagicMock(model="gpt-4"), - metadata={"author": "Alice", "description": ""}, - ) - - assert "Alice" in result - assert "description" not in result.lower() diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py index 07e388836..e4ba8f44c 100644 --- a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py +++ b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py @@ -51,11 +51,6 @@ async def test_index_calls_embed_and_chunk_via_to_thread( return await original_to_thread(func, *args, **kwargs) monkeypatch.setattr(asyncio, "to_thread", tracking_to_thread) - - monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.summarize_document", - AsyncMock(return_value="Summary."), - ) mock_chunk_hybrid = MagicMock(return_value=["chunk1"]) mock_chunk_hybrid.__name__ = "chunk_text_hybrid" monkeypatch.setattr( @@ -85,7 +80,7 @@ async def test_index_calls_embed_and_chunk_via_to_thread( document.id = 1 document.status = DocumentStatus.pending() - await pipeline.index(document, connector_doc, llm=MagicMock()) + await pipeline.index(document, connector_doc) # Either chunker entry point satisfies the "chunking runs off the event # loop" contract this test guards. Routing between the two is verified @@ -104,10 +99,6 @@ async def test_non_code_documents_use_hybrid_chunker( mid-row. Only documents flagged with ``should_use_code_chunker=True`` should take the ``chunk_text`` path. """ - monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.summarize_document", - AsyncMock(return_value="Summary."), - ) mock_chunk_hybrid = MagicMock(return_value=["chunk1"]) mock_chunk_hybrid.__name__ = "chunk_text_hybrid" monkeypatch.setattr( @@ -139,7 +130,7 @@ async def test_non_code_documents_use_hybrid_chunker( document.id = 1 document.status = DocumentStatus.pending() - await pipeline.index(document, connector_doc, llm=MagicMock()) + await pipeline.index(document, connector_doc) mock_chunk_hybrid.assert_called_once() mock_chunk_code.assert_not_called()