perf(indexers): offload sync embed_text to thread across background workers

Connector kb_sync_services (gmail, onedrive, google_calendar, jira), streaming indexers (discord, luma, teams), and the file-processor save path all called the synchronous embed_text directly inside async coroutines, blocking the background worker's event loop for the duration of each embed. Wrap each call site in asyncio.to_thread so the embed runs in a worker thread and concurrent indexing tasks no longer serialise on it.
2026-07-22 23:31:12 +02:00 · 2026-05-20 10:09:38 +02:00 · 2026-05-20 10:09:38 +02:00 · 1791241c0c
commit 1791241c0c
parent a8de98895a
8 changed files with 34 additions and 11 deletions
--- a/surfsense_backend/app/tasks/document_processors/_save.py
+++ b/surfsense_backend/app/tasks/document_processors/_save.py
@@ -2,6 +2,7 @@
 Unified document save/update logic for file processors.
 """

+import asyncio
 import logging

 from sqlalchemy.exc import SQLAlchemyError
@@ -43,7 +44,7 @@ async def _generate_summary(
    """
    if not enable_summary:
        summary = f"File: {file_name}\n\n{markdown_content[:4000]}"
-        return summary, embed_text(summary)
+        return summary, await asyncio.to_thread(embed_text, summary)

    if etl_service == "DOCLING":
        from app.services.docling_service import create_docling_service
@ -65,7 +66,7 @@ async def _generate_summary(
                parts.append(f"**{formatted_key}:** {value}")

        enhanced = "\n".join(parts) + "\n\n# DOCUMENT SUMMARY\n\n" + summary_text
-        return enhanced, embed_text(enhanced)
+        return enhanced, await asyncio.to_thread(embed_text, enhanced)

    # Standard summary (Unstructured / LlamaCloud / others)
    meta = {