From a49a4db6d6543bf78cd97b94298fceca3e9ede1c Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Mon, 9 Mar 2026 16:33:24 +0200 Subject: [PATCH] perf: use asyncio.to_thread for embed_text in FastAPI paths --- .../app/agents/new_chat/tools/search_surfsense_docs.py | 3 ++- .../app/agents/new_chat/tools/shared_memory.py | 5 +++-- surfsense_backend/app/agents/new_chat/tools/user_memory.py | 7 +++---- surfsense_backend/app/services/connector_service.py | 4 +++- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py b/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py index ec86c3ffa..b8b1527c7 100644 --- a/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py +++ b/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py @@ -8,6 +8,7 @@ The documentation is indexed at deployment time from MDX files and stored in dedicated tables (surfsense_docs_documents, surfsense_docs_chunks). """ +import asyncio import json from langchain_core.tools import tool @@ -100,7 +101,7 @@ async def search_surfsense_docs_async( Formatted string with relevant documentation content """ # Get embedding for the query - query_embedding = embed_text(query) + query_embedding = await asyncio.to_thread(embed_text, query) # Vector similarity search on chunks, joining with documents stmt = ( diff --git a/surfsense_backend/app/agents/new_chat/tools/shared_memory.py b/surfsense_backend/app/agents/new_chat/tools/shared_memory.py index ba69f1ce8..c826d808f 100644 --- a/surfsense_backend/app/agents/new_chat/tools/shared_memory.py +++ b/surfsense_backend/app/agents/new_chat/tools/shared_memory.py @@ -1,5 +1,6 @@ """Shared (team) memory backend for search-space-scoped AI context.""" +import asyncio import logging from typing import Any from uuid import UUID @@ -64,7 +65,7 @@ async def save_shared_memory( count = await get_shared_memory_count(db_session, search_space_id) if count >= MAX_MEMORIES_PER_SEARCH_SPACE: await delete_oldest_shared_memory(db_session, search_space_id) - embedding = embed_text(content) + embedding = await asyncio.to_thread(embed_text, content) row = SharedMemory( search_space_id=search_space_id, created_by_id=_to_uuid(created_by_id), @@ -108,7 +109,7 @@ async def recall_shared_memory( if category and category in valid_categories: stmt = stmt.where(SharedMemory.category == MemoryCategory(category)) if query: - query_embedding = embed_text(query) + query_embedding = await asyncio.to_thread(embed_text, query) stmt = stmt.order_by( SharedMemory.embedding.op("<=>")(query_embedding) ).limit(top_k) diff --git a/surfsense_backend/app/agents/new_chat/tools/user_memory.py b/surfsense_backend/app/agents/new_chat/tools/user_memory.py index 8aa516454..81e849856 100644 --- a/surfsense_backend/app/agents/new_chat/tools/user_memory.py +++ b/surfsense_backend/app/agents/new_chat/tools/user_memory.py @@ -9,6 +9,7 @@ Features: - recall_memory: Retrieve relevant memories using semantic search """ +import asyncio import logging from typing import Any from uuid import UUID @@ -177,8 +178,7 @@ def create_save_memory_tool( # Delete oldest memory to make room await delete_oldest_memory(db_session, user_id, search_space_id) - # Generate embedding for the memory - embedding = embed_text(content) + embedding = await asyncio.to_thread(embed_text, content) # Create new memory using ORM # The pgvector Vector column type handles embedding conversion automatically @@ -267,8 +267,7 @@ def create_recall_memory_tool( uuid_user_id = _to_uuid(user_id) if query: - # Semantic search using embeddings - query_embedding = embed_text(query) + query_embedding = await asyncio.to_thread(embed_text, query) # Build query with vector similarity stmt = ( diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py index 0aa48eccd..ac51e1833 100644 --- a/surfsense_backend/app/services/connector_service.py +++ b/surfsense_backend/app/services/connector_service.py @@ -264,7 +264,9 @@ class ConnectorService: # Reuse caller-provided embedding or compute once for both retrievers. if query_embedding is None: t_embed = time.perf_counter() - query_embedding = config.embedding_model_instance.embed(query_text) + query_embedding = await asyncio.to_thread( + config.embedding_model_instance.embed, query_text + ) perf.info( "[connector_svc] _combined_rrf embedding in %.3fs type=%s", time.perf_counter() - t_embed,