diff --git a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py index 9c705c1d5..24f2c2cc6 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py @@ -89,7 +89,7 @@ async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker async def test_raises_on_indexing_failure(db_session, db_search_space, db_user, mocker): """RuntimeError is raised when the indexing step fails so the caller can fire a failure notification.""" adapter = UploadDocumentAdapter(db_session) - with pytest.raises(RuntimeError): + with pytest.raises(RuntimeError, match=r"Embedding failed|Indexing failed"): await adapter.index( markdown_content="## Hello\n\nSome content.", filename="test.pdf", @@ -106,6 +106,67 @@ async def test_raises_on_indexing_failure(db_session, db_search_space, db_user, # --------------------------------------------------------------------------- +@pytest.mark.usefixtures( + "patched_summarize", "patched_embed_text", "patched_chunk_text" +) +async def test_reindex_updates_content(db_session, db_search_space, db_user, mocker): + """Document content is updated to the new summary after reindexing.""" + adapter = UploadDocumentAdapter(db_session) + await adapter.index( + markdown_content="## Original\n\nOriginal content.", + filename="test.pdf", + etl_service="UNSTRUCTURED", + search_space_id=db_search_space.id, + user_id=str(db_user.id), + llm=mocker.Mock(), + ) + + result = await db_session.execute( + select(Document).filter(Document.search_space_id == db_search_space.id) + ) + document = result.scalars().first() + + document.source_markdown = "## Edited\n\nNew content after user edit." + await db_session.flush() + + await adapter.reindex(document=document, llm=mocker.Mock()) + + await db_session.refresh(document) + assert document.content == "Mocked summary." + + +@pytest.mark.usefixtures( + "patched_summarize", "patched_embed_text", "patched_chunk_text" +) +async def test_reindex_updates_content_hash( + db_session, db_search_space, db_user, mocker +): + """Content hash is recomputed after reindexing with new source markdown.""" + adapter = UploadDocumentAdapter(db_session) + await adapter.index( + markdown_content="## Original\n\nOriginal content.", + filename="test.pdf", + etl_service="UNSTRUCTURED", + search_space_id=db_search_space.id, + user_id=str(db_user.id), + llm=mocker.Mock(), + ) + + result = await db_session.execute( + select(Document).filter(Document.search_space_id == db_search_space.id) + ) + document = result.scalars().first() + original_hash = document.content_hash + + document.source_markdown = "## Edited\n\nNew content after user edit." + await db_session.flush() + + await adapter.reindex(document=document, llm=mocker.Mock()) + + await db_session.refresh(document) + assert document.content_hash != original_hash + + @pytest.mark.usefixtures( "patched_summarize", "patched_embed_text", "patched_chunk_text" ) @@ -135,11 +196,14 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m assert DocumentStatus.is_state(document.status, DocumentStatus.READY) -@pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" -) +@pytest.mark.usefixtures("patched_summarize", "patched_embed_text") async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker): - """Reindexing replaces old chunks rather than appending new ones.""" + """Reindexing replaces old chunks with new content rather than appending.""" + mocker.patch( + "app.indexing_pipeline.indexing_pipeline_service.chunk_text", + side_effect=[["Original chunk."], ["Updated chunk."]], + ) + adapter = UploadDocumentAdapter(db_session) await adapter.index( markdown_content="## Original\n\nOriginal content.", @@ -167,6 +231,7 @@ async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, moc chunks = chunks_result.scalars().all() assert len(chunks) == 1 + assert chunks[0].content == "Updated chunk." @pytest.mark.usefixtures( @@ -234,5 +299,30 @@ async def test_reindex_raises_on_failure( side_effect=RuntimeError("LLM unavailable"), ) - with pytest.raises(RuntimeError): + with pytest.raises(RuntimeError, match=r"Embedding failed|Reindexing failed"): + await adapter.reindex(document=document, llm=mocker.Mock()) + + +async def test_reindex_raises_on_empty_source_markdown( + db_session, db_search_space, db_user, mocker +): + """Reindexing a document with no source_markdown raises immediately.""" + from app.db import DocumentType + + document = Document( + title="empty.pdf", + document_type=DocumentType.FILE, + content="placeholder", + content_hash="abc123", + unique_identifier_hash="def456", + source_markdown="", + search_space_id=db_search_space.id, + created_by_id=str(db_user.id), + ) + db_session.add(document) + await db_session.flush() + + adapter = UploadDocumentAdapter(db_session) + + with pytest.raises(RuntimeError, match="no source_markdown"): await adapter.reindex(document=document, llm=mocker.Mock())