Add an optional pre-built fallback summary to the indexing pipeline.

ConnectorDocument gains an optional fallback_summary field (default None).
In IndexingPipelineService, a new elif branch stores fallback_summary as the
document content when should_summarize is set and fallback_summary is truthy.
That branch is only reached when the preceding branch (its condition is not
part of the hunk) is not taken; otherwise the existing else branch still uses
source_markdown. A new integration test indexes a document with llm=None and
asserts that the stored content equals the fallback summary.

NOTE(review): leading indentation inside the hunks was reconstructed from a
whitespace-collapsed copy of this patch; confirm it against the target files
(or apply with whitespace-insensitive matching) before relying on it.

diff --git a/surfsense_backend/app/indexing_pipeline/connector_document.py b/surfsense_backend/app/indexing_pipeline/connector_document.py
index 3ae7f57c4..0581d368d 100644
--- a/surfsense_backend/app/indexing_pipeline/connector_document.py
+++ b/surfsense_backend/app/indexing_pipeline/connector_document.py
@@ -12,6 +12,7 @@ class ConnectorDocument(BaseModel):
     search_space_id: int = Field(gt=0)
     should_summarize: bool = True
     should_use_code_chunker: bool = False
+    fallback_summary: str | None = None
     metadata: dict = {}
     connector_id: int = Field(gt=0)
     created_by_id: str
diff --git a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
index af2b2a2ff..9cbeedba8 100644
--- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
+++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
@@ -124,6 +124,8 @@ class IndexingPipelineService:
                 content = await summarize_document(
                     connector_doc.source_markdown, llm, connector_doc.metadata
                 )
+            elif connector_doc.should_summarize and connector_doc.fallback_summary:
+                content = connector_doc.fallback_summary
             else:
                 content = connector_doc.source_markdown
 
diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py b/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py
index 1180c25c9..89bd722ee 100644
--- a/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py
+++ b/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py
@@ -156,6 +156,30 @@ async def test_no_llm_falls_back_to_source_markdown(
     assert reloaded.content == "## Fallback content"
 
 
+@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
+async def test_fallback_summary_used_when_llm_unavailable(
+    db_session, db_search_space, make_connector_document,
+):
+    connector_doc = make_connector_document(
+        search_space_id=db_search_space.id,
+        should_summarize=True,
+        source_markdown="## Full raw content",
+        fallback_summary="Short pre-built summary.",
+    )
+    service = IndexingPipelineService(session=db_session)
+
+    prepared = await service.prepare_for_indexing([connector_doc])
+    document_id = prepared[0].id
+
+    await service.index(prepared[0], connector_doc, llm=None)
+
+    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    reloaded = result.scalars().first()
+
+    assert DocumentStatus.is_state(reloaded.status, DocumentStatus.READY)
+    assert reloaded.content == "Short pre-built summary."
+
+
 @pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
 async def test_reindex_replaces_old_chunks(
     db_session, db_search_space, make_connector_document, mocker,