perf: use asyncio.to_thread for embed_text in FastAPI paths

2026-05-06 06:12:40 +02:00 · 2026-03-09 16:33:24 +02:00 · 2026-03-09 16:33:24 +02:00 · a49a4db6d6
commit a49a4db6d6
parent c4f2e9a3a5
4 changed files with 11 additions and 8 deletions
--- a/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py
+++ b/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py
@@ -8,6 +8,7 @@ The documentation is indexed at deployment time from MDX files and stored
 in dedicated tables (surfsense_docs_documents, surfsense_docs_chunks).
 """
+import asyncio
 import json
 from langchain_core.tools import tool
@@ -100,7 +101,7 @@ async def search_surfsense_docs_async(
        Formatted string with relevant documentation content
    """
    # Get embedding for the query
-    query_embedding = embed_text(query)
+    query_embedding = await asyncio.to_thread(embed_text, query)
    # Vector similarity search on chunks, joining with documents
    stmt = (
--- a/surfsense_backend/app/agents/new_chat/tools/shared_memory.py
+++ b/surfsense_backend/app/agents/new_chat/tools/shared_memory.py
@@ -1,5 +1,6 @@
 """Shared (team) memory backend for search-space-scoped AI context."""
+import asyncio
 import logging
 from typing import Any
 from uuid import UUID
@@ -64,7 +65,7 @@ async def save_shared_memory(
        count = await get_shared_memory_count(db_session, search_space_id)
        if count >= MAX_MEMORIES_PER_SEARCH_SPACE:
            await delete_oldest_shared_memory(db_session, search_space_id)
-        embedding = embed_text(content)
+        embedding = await asyncio.to_thread(embed_text, content)
        row = SharedMemory(
            search_space_id=search_space_id,
            created_by_id=_to_uuid(created_by_id),
@@ -108,7 +109,7 @@ async def recall_shared_memory(
        if category and category in valid_categories:
            stmt = stmt.where(SharedMemory.category == MemoryCategory(category))
        if query:
-            query_embedding = embed_text(query)
+            query_embedding = await asyncio.to_thread(embed_text, query)
            stmt = stmt.order_by(
                SharedMemory.embedding.op("<=>")(query_embedding)
            ).limit(top_k)
--- a/surfsense_backend/app/agents/new_chat/tools/user_memory.py
+++ b/surfsense_backend/app/agents/new_chat/tools/user_memory.py
@@ -9,6 +9,7 @@ Features:
 - recall_memory: Retrieve relevant memories using semantic search
 """
+import asyncio
 import logging
 from typing import Any
 from uuid import UUID
@@ -177,8 +178,7 @@ def create_save_memory_tool(
                # Delete oldest memory to make room
                await delete_oldest_memory(db_session, user_id, search_space_id)
-            # Generate embedding for the memory
+            embedding = await asyncio.to_thread(embed_text, content)
-            embedding = embed_text(content)
            # Create new memory using ORM
            # The pgvector Vector column type handles embedding conversion automatically
@@ -267,8 +267,7 @@ def create_recall_memory_tool(
            uuid_user_id = _to_uuid(user_id)
            if query:
-                # Semantic search using embeddings
+                query_embedding = await asyncio.to_thread(embed_text, query)
-                query_embedding = embed_text(query)
                # Build query with vector similarity
                stmt = (
--- a/surfsense_backend/app/services/connector_service.py
+++ b/surfsense_backend/app/services/connector_service.py
@@ -264,7 +264,9 @@ class ConnectorService:
        # Reuse caller-provided embedding or compute once for both retrievers.
        if query_embedding is None:
            t_embed = time.perf_counter()
-            query_embedding = config.embedding_model_instance.embed(query_text)
+            query_embedding = await asyncio.to_thread(
+                config.embedding_model_instance.embed, query_text
+            )
            perf.info(
                "[connector_svc] _combined_rrf embedding in %.3fs type=%s",
                time.perf_counter() - t_embed,