diff --git a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py index 9460f900c..0fadfc42f 100644 --- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py +++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py @@ -9,7 +9,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.db import Chunk, Document, DocumentStatus from app.indexing_pipeline.connector_document import ConnectorDocument from app.indexing_pipeline.document_chunker import chunk_text -from app.indexing_pipeline.document_embedder import embed_text +from app.indexing_pipeline.document_embedder import embed_texts from app.indexing_pipeline.document_hashing import ( compute_content_hash, compute_unique_identifier_hash, @@ -195,25 +195,23 @@ class IndexingPipelineService: else: content = connector_doc.source_markdown - t_step = time.perf_counter() - embedding = embed_text(content) - perf.debug( - "[indexing] embed_text (summary) doc=%d in %.3fs", - document.id, - time.perf_counter() - t_step, - ) - await self.session.execute( delete(Chunk).where(Chunk.document_id == document.id) ) t_step = time.perf_counter() + chunk_texts = chunk_text( + connector_doc.source_markdown, + use_code_chunker=connector_doc.should_use_code_chunker, + ) + + texts_to_embed = [content, *chunk_texts] + embeddings = embed_texts(texts_to_embed) + summary_embedding, *chunk_embeddings = embeddings + chunks = [ - Chunk(content=text, embedding=embed_text(text)) - for text in chunk_text( - connector_doc.source_markdown, - use_code_chunker=connector_doc.should_use_code_chunker, - ) + Chunk(content=text, embedding=emb) + for text, emb in zip(chunk_texts, chunk_embeddings) ] perf.info( "[indexing] chunk+embed doc=%d chunks=%d in %.3fs", @@ -223,7 +221,7 @@ class IndexingPipelineService: ) document.content = content - document.embedding = embedding + document.embedding = summary_embedding attach_chunks_to_document(document, chunks) document.updated_at = datetime.now(UTC) document.status = DocumentStatus.ready() diff --git a/surfsense_backend/tests/integration/conftest.py b/surfsense_backend/tests/integration/conftest.py index 8b92a5aa8..4e43ea302 100644 --- a/surfsense_backend/tests/integration/conftest.py +++ b/surfsense_backend/tests/integration/conftest.py @@ -129,10 +129,12 @@ def patched_summarize_raises(monkeypatch) -> AsyncMock: @pytest.fixture -def patched_embed_text(monkeypatch) -> MagicMock: - mock = MagicMock(return_value=[0.1] * _EMBEDDING_DIM) +def patched_embed_texts(monkeypatch) -> MagicMock: + mock = MagicMock( + side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts] + ) monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.embed_text", + "app.indexing_pipeline.indexing_pipeline_service.embed_texts", mock, ) return mock diff --git a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py index fa0fe5787..9fc802aa6 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py @@ -8,7 +8,7 @@ pytestmark = pytest.mark.integration @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" + "patched_summarize", "patched_embed_texts", "patched_chunk_text" ) async def test_sets_status_ready(db_session, db_search_space, db_user, mocker): """Document status is READY after successful indexing.""" @@ -31,7 +31,7 @@ async def test_sets_status_ready(db_session, db_search_space, db_user, mocker): @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" + "patched_summarize", "patched_embed_texts", "patched_chunk_text" ) async def test_content_is_summary(db_session, db_search_space, db_user, mocker): """Document content is set to the LLM-generated summary.""" @@ -55,7 +55,7 @@ async def test_content_is_summary(db_session, db_search_space, db_user, mocker): @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" + "patched_summarize", "patched_embed_texts", "patched_chunk_text" ) async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker): """Chunks derived from the source markdown are persisted in the DB.""" @@ -84,7 +84,7 @@ async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker @pytest.mark.usefixtures( - "patched_summarize_raises", "patched_embed_text", "patched_chunk_text" + "patched_summarize_raises", "patched_embed_texts", "patched_chunk_text" ) async def test_raises_on_indexing_failure(db_session, db_search_space, db_user, mocker): """RuntimeError is raised when the indexing step fails so the caller can fire a failure notification.""" @@ -107,7 +107,7 @@ async def test_raises_on_indexing_failure(db_session, db_search_space, db_user, @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" + "patched_summarize", "patched_embed_texts", "patched_chunk_text" ) async def test_reindex_updates_content(db_session, db_search_space, db_user, mocker): """Document content is updated to the new summary after reindexing.""" @@ -136,7 +136,7 @@ async def test_reindex_updates_content(db_session, db_search_space, db_user, moc @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" + "patched_summarize", "patched_embed_texts", "patched_chunk_text" ) async def test_reindex_updates_content_hash( db_session, db_search_space, db_user, mocker @@ -168,7 +168,7 @@ async def test_reindex_updates_content_hash( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" + "patched_summarize", "patched_embed_texts", "patched_chunk_text" ) async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, mocker): """Document status is READY after successful reindexing.""" @@ -196,7 +196,7 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m assert DocumentStatus.is_state(document.status, DocumentStatus.READY) -@pytest.mark.usefixtures("patched_summarize", "patched_embed_text") +@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts") async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker): """Reindexing replaces old chunks with new content rather than appending.""" mocker.patch( @@ -235,7 +235,7 @@ async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, moc @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" + "patched_summarize", "patched_embed_texts", "patched_chunk_text" ) async def test_reindex_clears_reindexing_flag( db_session, db_search_space, db_user, mocker @@ -266,7 +266,7 @@ async def test_reindex_clears_reindexing_flag( assert document.content_needs_reindexing is False -@pytest.mark.usefixtures("patched_embed_text", "patched_chunk_text") +@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text") async def test_reindex_raises_on_failure(db_session, db_search_space, db_user, mocker): """RuntimeError is raised when reindexing fails so the caller can handle it.""" mocker.patch( diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py b/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py index 2e8ee4d92..a82148f96 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py @@ -11,7 +11,7 @@ pytestmark = pytest.mark.integration @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" + "patched_summarize", "patched_embed_texts", "patched_chunk_text" ) async def test_sets_status_ready( db_session, @@ -38,7 +38,7 @@ async def test_sets_status_ready( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" + "patched_summarize", "patched_embed_texts", "patched_chunk_text" ) async def test_content_is_summary_when_should_summarize_true( db_session, @@ -65,7 +65,7 @@ async def test_content_is_summary_when_should_summarize_true( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" + "patched_summarize", "patched_embed_texts", "patched_chunk_text" ) async def test_content_is_source_markdown_when_should_summarize_false( db_session, @@ -95,7 +95,7 @@ async def test_content_is_source_markdown_when_should_summarize_false( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" + "patched_summarize", "patched_embed_texts", "patched_chunk_text" ) async def test_chunks_written_to_db( db_session, @@ -123,7 +123,7 @@ async def test_chunks_written_to_db( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" + "patched_summarize", "patched_embed_texts", "patched_chunk_text" ) async def test_embedding_written_to_db( db_session, @@ -151,7 +151,7 @@ async def test_embedding_written_to_db( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" + "patched_summarize", "patched_embed_texts", "patched_chunk_text" ) async def test_updated_at_advances_after_indexing( db_session, @@ -183,7 +183,7 @@ async def test_updated_at_advances_after_indexing( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" + "patched_summarize", "patched_embed_texts", "patched_chunk_text" ) async def test_no_llm_falls_back_to_source_markdown( db_session, @@ -214,7 +214,7 @@ async def test_no_llm_falls_back_to_source_markdown( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" + "patched_summarize", "patched_embed_texts", "patched_chunk_text" ) async def test_fallback_summary_used_when_llm_unavailable( db_session, @@ -245,7 +245,7 @@ async def test_fallback_summary_used_when_llm_unavailable( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" + "patched_summarize", "patched_embed_texts", "patched_chunk_text" ) async def test_reindex_replaces_old_chunks( db_session, @@ -282,7 +282,7 @@ async def test_reindex_replaces_old_chunks( @pytest.mark.usefixtures( - "patched_summarize_raises", "patched_embed_text", "patched_chunk_text" + "patched_summarize_raises", "patched_embed_texts", "patched_chunk_text" ) async def test_llm_error_sets_status_failed( db_session, @@ -309,7 +309,7 @@ async def test_llm_error_sets_status_failed( @pytest.mark.usefixtures( - "patched_summarize_raises", "patched_embed_text", "patched_chunk_text" + "patched_summarize_raises", "patched_embed_texts", "patched_chunk_text" ) async def test_llm_error_leaves_no_partial_data( db_session, diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_prepare_for_indexing.py b/surfsense_backend/tests/integration/indexing_pipeline/test_prepare_for_indexing.py index 837b02c9f..776180b9a 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_prepare_for_indexing.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_prepare_for_indexing.py @@ -33,7 +33,7 @@ async def test_new_document_is_persisted_with_pending_status( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" + "patched_summarize", "patched_embed_texts", "patched_chunk_text" ) async def test_unchanged_ready_document_is_skipped( db_session, @@ -56,7 +56,7 @@ async def test_unchanged_ready_document_is_skipped( @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_text", "patched_chunk_text" + "patched_summarize", "patched_embed_texts", "patched_chunk_text" ) async def test_title_only_change_updates_title_in_db( db_session, @@ -339,7 +339,7 @@ async def test_same_content_from_different_source_is_skipped( @pytest.mark.usefixtures( - "patched_summarize_raises", "patched_embed_text", "patched_chunk_text" + "patched_summarize_raises", "patched_embed_texts", "patched_chunk_text" ) async def test_failed_document_with_unchanged_content_is_requeued( db_session,