mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-21 18:55:16 +02:00
refactor: update test for index() to use chunk_text_hybrid
This commit is contained in:
parent
2402b730fa
commit
9b926b3133
1 changed files with 16 additions and 6 deletions
|
|
@ -37,7 +37,11 @@ def _make_orm_doc(connector_doc, doc_id):
|
||||||
async def test_index_calls_embed_and_chunk_via_to_thread(
|
async def test_index_calls_embed_and_chunk_via_to_thread(
|
||||||
pipeline, make_connector_document, monkeypatch
|
pipeline, make_connector_document, monkeypatch
|
||||||
):
|
):
|
||||||
"""index() runs embed_texts and chunk_text via asyncio.to_thread, not blocking the loop."""
|
"""index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop.
|
||||||
|
|
||||||
|
The default (non-code) path uses ``chunk_text_hybrid`` so Markdown tables stay
|
||||||
|
intact (see issue #1334); ``chunk_text`` is reserved for the code-chunker branch.
|
||||||
|
"""
|
||||||
to_thread_calls = []
|
to_thread_calls = []
|
||||||
original_to_thread = asyncio.to_thread
|
original_to_thread = asyncio.to_thread
|
||||||
|
|
||||||
|
|
@ -51,11 +55,11 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
|
||||||
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
|
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
|
||||||
AsyncMock(return_value="Summary."),
|
AsyncMock(return_value="Summary."),
|
||||||
)
|
)
|
||||||
mock_chunk = MagicMock(return_value=["chunk1"])
|
mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
|
||||||
mock_chunk.__name__ = "chunk_text"
|
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
"app.indexing_pipeline.indexing_pipeline_service.chunk_text",
|
"app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid",
|
||||||
mock_chunk,
|
mock_chunk_hybrid,
|
||||||
)
|
)
|
||||||
mock_embed = MagicMock(
|
mock_embed = MagicMock(
|
||||||
side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]
|
side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]
|
||||||
|
|
@ -65,6 +69,11 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
|
||||||
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
|
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
|
||||||
mock_embed,
|
mock_embed,
|
||||||
)
|
)
|
||||||
|
# Bypass set_committed_value, which requires a real ORM instance (not MagicMock).
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"app.indexing_pipeline.indexing_pipeline_service.attach_chunks_to_document",
|
||||||
|
MagicMock(),
|
||||||
|
)
|
||||||
|
|
||||||
connector_doc = make_connector_document(
|
connector_doc = make_connector_document(
|
||||||
document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
|
document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
|
||||||
|
|
@ -77,8 +86,9 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
|
||||||
|
|
||||||
await pipeline.index(document, connector_doc, llm=MagicMock())
|
await pipeline.index(document, connector_doc, llm=MagicMock())
|
||||||
|
|
||||||
assert "chunk_text" in to_thread_calls
|
assert "chunk_text_hybrid" in to_thread_calls
|
||||||
assert "embed_texts" in to_thread_calls
|
assert "embed_texts" in to_thread_calls
|
||||||
|
assert document.status == DocumentStatus.ready()
|
||||||
|
|
||||||
|
|
||||||
def _mock_session_factory(orm_docs_by_id):
|
def _mock_session_factory(orm_docs_by_id):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue