mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 00:36:31 +02:00
feat: implement parallel indexing for Google Calendar and Gmail connectors
- Refactored Google Calendar and Gmail indexers to utilize the new `index_batch_parallel` method for concurrent document indexing, enhancing performance.
- Updated the indexing logic to replace serial processing with parallel execution, allowing for improved efficiency in handling multiple documents.
- Adjusted logging and error handling to accommodate the new parallel processing approach, ensuring robust operation during indexing.
- Enhanced unit tests to validate the functionality of the parallel indexing method and its integration with existing workflows.
This commit is contained in:
parent
e5cb6bfacf
commit
4fd776e7ef
4 changed files with 242 additions and 95 deletions
|
|
@ -25,6 +25,15 @@ def pipeline(mock_session):
|
|||
return IndexingPipelineService(mock_session)
|
||||
|
||||
|
||||
def _make_orm_doc(connector_doc, doc_id):
    """Build a mocked ORM ``Document`` whose identifier hash matches *connector_doc*.

    The mock starts in the ``pending`` status so tests can observe the
    transition performed by the indexing pipeline.
    """
    orm_doc = MagicMock(spec=Document)
    orm_doc.id = doc_id
    orm_doc.unique_identifier_hash = compute_unique_identifier_hash(connector_doc)
    orm_doc.status = DocumentStatus.pending()
    return orm_doc
async def test_index_calls_embed_and_chunk_via_to_thread(
|
||||
pipeline, make_connector_document, monkeypatch
|
||||
):
|
||||
|
|
@ -68,3 +77,110 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
|
|||
|
||||
assert "chunk_text" in to_thread_calls
|
||||
assert "embed_texts" in to_thread_calls
|
||||
|
||||
|
||||
def _mock_session_factory(orm_docs_by_id):
    """Replace get_celery_session_maker with a two-level callable.

    get_celery_session_maker() -> session_maker
    session_maker() -> async context manager yielding a mock session

    The yielded session's ``get`` resolves documents out of
    *orm_docs_by_id* (keyed by document id), returning ``None`` for
    unknown ids, mirroring SQLAlchemy's ``AsyncSession.get``.
    """

    def _get_maker():
        def _make_session():
            mock_session = MagicMock()
            # session.get(Model, pk) -> the prepared ORM mock, or None.
            mock_session.get = AsyncMock(
                side_effect=lambda model, doc_id: orm_docs_by_id.get(doc_id)
            )
            manager = MagicMock()
            manager.__aenter__ = AsyncMock(return_value=mock_session)
            # Returning False propagates any exception raised in the body.
            manager.__aexit__ = AsyncMock(return_value=False)
            return manager

        return _make_session

    return _get_maker
async def test_batch_parallel_indexes_all_documents(
    pipeline, make_connector_document, monkeypatch
):
    """index_batch_parallel indexes all documents and returns correct counts."""
    connector_docs = [
        make_connector_document(
            document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
            unique_id=f"msg-{n}",
            search_space_id=1,
        )
        for n in range(3)
    ]

    orm_docs = [
        _make_orm_doc(connector_doc, doc_id=index + 1)
        for index, connector_doc in enumerate(connector_docs)
    ]
    pipeline.prepare_for_indexing = AsyncMock(return_value=orm_docs)

    # Session lookups must resolve the prepared ORM mocks by id.
    monkeypatch.setattr(
        "app.tasks.celery_tasks.get_celery_session_maker",
        _mock_session_factory({orm.id: orm for orm in orm_docs}),
    )

    seen_ids = []

    async def fake_index(self, document, connector_doc, llm):
        # Record the call and pretend the document indexed successfully.
        seen_ids.append(document.id)
        document.status = DocumentStatus.ready()
        return document

    monkeypatch.setattr(IndexingPipelineService, "index", fake_index)

    async def mock_get_llm(session):
        return MagicMock()

    _, indexed, failed = await pipeline.index_batch_parallel(
        connector_docs, mock_get_llm, max_concurrency=2
    )

    assert indexed == 3
    assert failed == 0
    # Concurrency may reorder calls, so compare the sorted ids.
    assert sorted(seen_ids) == [1, 2, 3]
||||
async def test_batch_parallel_one_failure_does_not_affect_others(
    pipeline, make_connector_document, monkeypatch
):
    """One document failure doesn't prevent other documents from being indexed."""
    connector_docs = [
        make_connector_document(
            document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
            unique_id=f"msg-{n}",
            search_space_id=1,
        )
        for n in range(3)
    ]

    orm_docs = [
        _make_orm_doc(connector_doc, doc_id=index + 1)
        for index, connector_doc in enumerate(connector_docs)
    ]
    pipeline.prepare_for_indexing = AsyncMock(return_value=orm_docs)

    # Session lookups must resolve the prepared ORM mocks by id.
    monkeypatch.setattr(
        "app.tasks.celery_tasks.get_celery_session_maker",
        _mock_session_factory({orm.id: orm for orm in orm_docs}),
    )

    async def failing_index(self, document, connector_doc, llm):
        # Document 2 blows up; the others index normally.
        if document.id == 2:
            raise RuntimeError("LLM exploded")
        document.status = DocumentStatus.ready()
        return document

    monkeypatch.setattr(IndexingPipelineService, "index", failing_index)

    async def mock_get_llm(session):
        return MagicMock()

    _, indexed, failed = await pipeline.index_batch_parallel(
        connector_docs, mock_get_llm, max_concurrency=4
    )

    assert indexed == 2
    assert failed == 1
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue