Merge remote-tracking branch 'upstream/dev' into fix/backend-tests

This commit is contained in:
Anish Sarkar 2026-05-16 19:40:01 +05:30
commit 8de7d86d56
603 changed files with 45074 additions and 4695 deletions

View file

@ -39,8 +39,9 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
):
"""index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop.
The default (non-code) path uses ``chunk_text_hybrid`` so Markdown tables stay
intact (see issue #1334); ``chunk_text`` is reserved for the code-chunker branch.
Routing between ``chunk_text`` (code path) and ``chunk_text_hybrid`` (default
path, see issue #1334) is verified separately in
``test_non_code_documents_use_hybrid_chunker``.
"""
to_thread_calls = []
original_to_thread = asyncio.to_thread
@ -86,11 +87,64 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
await pipeline.index(document, connector_doc, llm=MagicMock())
assert "chunk_text_hybrid" in to_thread_calls
# Either chunker entry point satisfies the "chunking runs off the event
# loop" contract this test guards. Routing between the two is verified
# in test_non_code_documents_use_hybrid_chunker.
assert {"chunk_text", "chunk_text_hybrid"} & set(to_thread_calls)
assert "embed_texts" in to_thread_calls
assert document.status == DocumentStatus.ready()
async def test_non_code_documents_use_hybrid_chunker(
pipeline, make_connector_document, monkeypatch
):
"""Non-code documents route through ``chunk_text_hybrid`` (issue #1334).
The hybrid chunker preserves Markdown table integrity by avoiding splits
mid-row. Only documents flagged with ``should_use_code_chunker=True``
should take the ``chunk_text`` path.
"""
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
AsyncMock(return_value="Summary."),
)
mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid",
mock_chunk_hybrid,
)
mock_chunk_code = MagicMock(return_value=["chunk1"])
mock_chunk_code.__name__ = "chunk_text"
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.chunk_text",
mock_chunk_code,
)
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
)
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.attach_chunks_to_document",
MagicMock(),
)
connector_doc = make_connector_document(
document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
unique_id="msg-1",
search_space_id=1,
should_use_code_chunker=False,
)
document = MagicMock(spec=Document)
document.id = 1
document.status = DocumentStatus.pending()
await pipeline.index(document, connector_doc, llm=MagicMock())
mock_chunk_hybrid.assert_called_once()
mock_chunk_code.assert_not_called()
def _mock_session_factory(orm_docs_by_id):
"""Replace get_celery_session_maker with a two-level callable.