feat: speed up chat with lazy KB retrieval and cross-thread graph caching

- Introduced lazy knowledge base retrieval mode, allowing the main agent to fetch KB content on demand via the `search_knowledge_base` tool, improving performance by skipping expensive pre-injection processes.
- Added cross-thread caching capability, enabling reuse of compiled graphs across different user chats, reducing latency for returning users.
- Updated middleware to support new lazy loading and caching features, ensuring efficient resource utilization and improved response times.
- Enhanced logging for performance tracking during knowledge retrieval and agent interactions.
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-06-09 04:45:17 -07:00
parent ce952d2ad1
commit 41ff57101c
32 changed files with 979 additions and 169 deletions

View file

@ -571,6 +571,41 @@ async def _warm_agent_jit_caches() -> None:
)
async def _warm_embedding_model() -> None:
    """Run one throwaway embedding at startup so the first KB search is warm.

    With lazy KB retrieval the main agent no longer embeds on every turn;
    it calls the on-demand ``search_knowledge_base`` tool only when it needs
    KB content. In a fresh process the first ``embed_texts`` call made by
    that tool would otherwise pay the model's one-time load/JIT cost, so a
    single dummy embed is issued here to move that cost to startup.

    The embed call is dispatched to a worker thread via
    ``asyncio.to_thread``. This coroutine applies no timeout of its own;
    the caller is expected to bound it (e.g. with ``asyncio.wait_for``).
    NOTE(review): the embed call is expected to be serialized by the
    embedding global lock inside ``embed_texts`` — confirm.

    Returns:
        None. Any ``Exception`` (including a failed import of
        ``embed_texts``) is logged as a warning with its traceback and
        swallowed, so a failed warmup never aborts startup.
    """
    from time import perf_counter

    log = logging.getLogger(__name__)
    started_at = perf_counter()

    def _elapsed() -> float:
        # Seconds since this warmup began; used by both log lines.
        return perf_counter() - started_at

    try:
        # Imported lazily inside the guard so an import failure is treated
        # like any other warmup failure (logged, non-fatal).
        from app.utils.document_converters import embed_texts

        warmup_batch = ["warmup"]
        await asyncio.to_thread(embed_texts, warmup_batch)
        log.info(
            "[startup] Embedding model warmup completed in %.3fs",
            _elapsed(),
        )
    except Exception:
        failure_msg = (
            "[startup] Embedding model warmup failed in %.3fs (non-fatal — first "
            "KB search will pay the cold embed cost)"
        )
        log.warning(failure_msg, _elapsed(), exc_info=True)
@asynccontextmanager
async def lifespan(app: FastAPI):
# Tune GC: lower gen-2 threshold so long-lived garbage is collected
@ -601,6 +636,16 @@ async def lifespan(app: FastAPI):
"first real request will pay the full compile cost."
)
# Phase 2 — embedding warmup so the first lazy ``search_knowledge_base``
# call doesn't pay the cold embed-model load. Bounded + non-fatal.
try:
await asyncio.wait_for(asyncio.shield(_warm_embedding_model()), timeout=20)
except (TimeoutError, Exception): # pragma: no cover - defensive
logging.getLogger(__name__).warning(
"[startup] Embedding warmup hit timeout/error — skipping; "
"first KB search will pay the cold embed cost."
)
register_session_hooks()
log_system_snapshot("startup_complete")
await start_gateway_inbox_worker()