From 9b926b31337277e80c1a1f091d5a37ad7e588d5a Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Wed, 13 May 2026 00:22:43 +0530 Subject: [PATCH] refactor: update test for index() to use chunk_text_hybrid --- .../test_index_batch_parallel.py | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py index ae55bc089..10abd05e0 100644 --- a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py +++ b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py @@ -37,7 +37,11 @@ def _make_orm_doc(connector_doc, doc_id): async def test_index_calls_embed_and_chunk_via_to_thread( pipeline, make_connector_document, monkeypatch ): - """index() runs embed_texts and chunk_text via asyncio.to_thread, not blocking the loop.""" + """index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop. + + The default (non-code) path uses ``chunk_text_hybrid`` so Markdown tables stay + intact (see issue #1334); ``chunk_text`` is reserved for the code-chunker branch. + """ to_thread_calls = [] original_to_thread = asyncio.to_thread @@ -51,11 +55,11 @@ async def test_index_calls_embed_and_chunk_via_to_thread( "app.indexing_pipeline.indexing_pipeline_service.summarize_document", AsyncMock(return_value="Summary."), ) - mock_chunk = MagicMock(return_value=["chunk1"]) - mock_chunk.__name__ = "chunk_text" + mock_chunk_hybrid = MagicMock(return_value=["chunk1"]) + mock_chunk_hybrid.__name__ = "chunk_text_hybrid" monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.chunk_text", - mock_chunk, + "app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid", + mock_chunk_hybrid, ) mock_embed = MagicMock( side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts] @@ -65,6 +69,11 @@ async def test_index_calls_embed_and_chunk_via_to_thread( "app.indexing_pipeline.indexing_pipeline_service.embed_texts", mock_embed, ) + # Bypass set_committed_value, which requires a real ORM instance (not MagicMock). + monkeypatch.setattr( + "app.indexing_pipeline.indexing_pipeline_service.attach_chunks_to_document", + MagicMock(), + ) connector_doc = make_connector_document( document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR, @@ -77,8 +86,9 @@ async def test_index_calls_embed_and_chunk_via_to_thread( await pipeline.index(document, connector_doc, llm=MagicMock()) - assert "chunk_text" in to_thread_calls + assert "chunk_text_hybrid" in to_thread_calls assert "embed_texts" in to_thread_calls + assert document.status == DocumentStatus.ready() def _mock_session_factory(orm_docs_by_id):