feat: made chat fast

- Introduced lazy knowledge base retrieval mode, allowing the main agent to fetch KB content on demand via the `search_knowledge_base` tool, improving performance by skipping expensive pre-injection processes. - Added cross-thread caching capability, enabling reuse of compiled graphs across different user chats, reducing latency for returning users. - Updated middleware to support new lazy loading and caching features, ensuring efficient resource utilization and improved response times. - Enhanced logging for performance tracking during knowledge retrieval and agent interactions.
2026-07-02 22:01:05 +02:00 · 2026-06-09 04:45:17 -07:00 · 2026-06-09 04:45:17 -07:00 · 41ff57101c
commit 41ff57101c
parent ce952d2ad1
32 changed files with 979 additions and 169 deletions
--- a/surfsense_backend/app/app.py
+++ b/surfsense_backend/app/app.py
@ -571,6 +571,41 @@ async def _warm_agent_jit_caches() -> None:
        )


+async def _warm_embedding_model() -> None:
+    """Pre-load/JIT the embedding model so the first KB search is fast.
+
+    With lazy KB retrieval (OpenCode-style), the main agent no longer embeds
+    on every turn — it calls the on-demand ``search_knowledge_base`` tool only
+    when it needs KB content, and that tool's first ``embed_texts`` call in a
+    fresh process pays the model's one-time load/JIT (local sentence-transformer
+    warm or API client init). Doing one throwaway embed at startup moves that
+    cost off the first real search.
+
+    Safety: behind the embedding global lock (run in a worker thread), bounded
+    by the caller's ``asyncio.wait_for``, and non-fatal — on any failure we log
+    and swallow so the worst case is the first real search pays the cold cost.
+    """
+    import time as _time
+
+    logger = logging.getLogger(__name__)
+    t0 = _time.perf_counter()
+    try:
+        from app.utils.document_converters import embed_texts
+
+        await asyncio.to_thread(embed_texts, ["warmup"])
+        logger.info(
+            "[startup] Embedding model warmup completed in %.3fs",
+            _time.perf_counter() - t0,
+        )
+    except Exception:
+        logger.warning(
+            "[startup] Embedding model warmup failed in %.3fs (non-fatal — first "
+            "KB search will pay the cold embed cost)",
+            _time.perf_counter() - t0,
+            exc_info=True,
+        )
+
+
@asynccontextmanager
 async def lifespan(app: FastAPI):
    # Tune GC: lower gen-2 threshold so long-lived garbage is collected
@ -601,6 +636,16 @@ async def lifespan(app: FastAPI):
            "first real request will pay the full compile cost."
        )

+    # Phase 2 — embedding warmup so the first lazy ``search_knowledge_base``
+    # call doesn't pay the cold embed-model load. Bounded + non-fatal.
+    try:
+        await asyncio.wait_for(asyncio.shield(_warm_embedding_model()), timeout=20)
+    except (TimeoutError, Exception):  # pragma: no cover - defensive
+        logging.getLogger(__name__).warning(
+            "[startup] Embedding warmup hit timeout/error — skipping; "
+            "first KB search will pay the cold embed cost."
+        )
+
    register_session_hooks()
    log_system_snapshot("startup_complete")
    await start_gateway_inbox_worker()