mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-07-02 22:01:05 +02:00
feat: made chat fast
- Introduced lazy knowledge base retrieval mode, allowing the main agent to fetch KB content on demand via the `search_knowledge_base` tool, improving performance by skipping expensive pre-injection processes.
- Added cross-thread caching capability, enabling reuse of compiled graphs across different user chats, reducing latency for returning users.
- Updated middleware to support new lazy loading and caching features, ensuring efficient resource utilization and improved response times.
- Enhanced logging for performance tracking during knowledge retrieval and agent interactions.
This commit is contained in:
parent
ce952d2ad1
commit
41ff57101c
32 changed files with 979 additions and 169 deletions
|
|
@ -571,6 +571,41 @@ async def _warm_agent_jit_caches() -> None:
|
|||
)
|
||||
|
||||
|
||||
async def _warm_embedding_model() -> None:
    """Run one throwaway embedding at startup to absorb the cold-start cost.

    Under lazy knowledge-base retrieval the agent embeds only when it invokes
    the on-demand ``search_knowledge_base`` tool, so in a fresh process that
    tool's first ``embed_texts`` call would otherwise pay the embedding
    model's one-time initialisation. Embedding a dummy string here is meant
    to shift that cost to application startup instead.

    The blocking ``embed_texts`` call is handed to a worker thread through
    ``asyncio.to_thread``. This coroutine applies no timeout of its own; the
    caller is expected to bound it (for example with ``asyncio.wait_for``).

    Best-effort: any ``Exception`` -- including a failed import of
    ``embed_texts`` -- is logged at WARNING level with its traceback and then
    swallowed, so a failed warmup never aborts startup. The elapsed
    wall-clock time is logged on both the success and the failure path.
    """
    from time import perf_counter

    log = logging.getLogger(__name__)
    started_at = perf_counter()
    try:
        # Imported inside the guard so that an import failure is handled as
        # just another non-fatal warmup failure.
        from app.utils.document_converters import embed_texts

        await asyncio.to_thread(embed_texts, ["warmup"])
        elapsed = perf_counter() - started_at
        log.info("[startup] Embedding model warmup completed in %.3fs", elapsed)
    except Exception:
        elapsed = perf_counter() - started_at
        log.warning(
            "[startup] Embedding model warmup failed in %.3fs (non-fatal — first "
            "KB search will pay the cold embed cost)",
            elapsed,
            exc_info=True,
        )
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
# Tune GC: lower gen-2 threshold so long-lived garbage is collected
|
||||
|
|
@ -601,6 +636,16 @@ async def lifespan(app: FastAPI):
|
|||
"first real request will pay the full compile cost."
|
||||
)
|
||||
|
||||
# Phase 2 — embedding warmup so the first lazy ``search_knowledge_base``
|
||||
# call doesn't pay the cold embed-model load. Bounded + non-fatal.
|
||||
try:
|
||||
await asyncio.wait_for(asyncio.shield(_warm_embedding_model()), timeout=20)
|
||||
except (TimeoutError, Exception): # pragma: no cover - defensive
|
||||
logging.getLogger(__name__).warning(
|
||||
"[startup] Embedding warmup hit timeout/error — skipping; "
|
||||
"first KB search will pay the cold embed cost."
|
||||
)
|
||||
|
||||
register_session_hooks()
|
||||
log_system_snapshot("startup_complete")
|
||||
await start_gateway_inbox_worker()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue