fix: use asyncio.to_thread for embedding calls in search endpoints

Wrap synchronous embedding_model.embed() calls with asyncio.to_thread()
in both vector_search and hybrid_search methods. This prevents blocking
the asyncio event loop during embedding computation, improving server
responsiveness under concurrent load.

Fixes #794

Signed-off-by: Tim Ren <137012659+xr843@users.noreply.github.com>
This commit is contained in:
Tim Ren 2026-03-15 16:21:19 +08:00
parent ee07ed8168
commit 0269900c60

View file

@@ -1,3 +1,4 @@
+import asyncio
import time
from datetime import datetime
@@ -49,7 +50,7 @@ class ChucksHybridSearchRetriever:
# Get embedding for the query
embedding_model = config.embedding_model_instance
t_embed = time.perf_counter()
-query_embedding = embedding_model.embed(query_text)
+query_embedding = await asyncio.to_thread(embedding_model.embed, query_text)
perf.debug(
"[chunk_search] vector_search embedding in %.3fs",
time.perf_counter() - t_embed,
@@ -195,7 +196,7 @@ class ChucksHybridSearchRetriever:
if query_embedding is None:
embedding_model = config.embedding_model_instance
t_embed = time.perf_counter()
-query_embedding = embedding_model.embed(query_text)
+query_embedding = await asyncio.to_thread(embedding_model.embed, query_text)
perf.debug(
"[chunk_search] hybrid_search embedding in %.3fs",
time.perf_counter() - t_embed,
@@ -427,4 +428,4 @@ class ChucksHybridSearchRetriever:
search_space_id,
document_type,
)
-return final_docs
+return final_docs