feat: cache backend clients per endpoint instead of building one (with a fresh SSL context) per request

2026-06-07 09:55:54 +02:00 · 2026-06-07 09:55:54 +02:00 · 3cd530586c
commit 3cd530586c
parent 1ce792c48b
5 changed files with 87 additions and 15 deletions
--- a/backends/sessions.py
+++ b/backends/sessions.py
@@ -8,6 +8,7 @@ populate them once and routes can reuse them.
 import os

 import aiohttp
+import ollama
 import openai

 from state import app_state
@@ -70,16 +71,42 @@ def get_probe_session(endpoint: str) -> aiohttp.ClientSession:
    return app_state.get("probe_session") or app_state["session"]


+def get_ollama_client(endpoint: str) -> ollama.AsyncClient:
+    """Return a cached ``ollama.AsyncClient`` for the endpoint, creating it once.
+
+    ``ollama.AsyncClient`` wraps an ``httpx.AsyncClient`` whose construction
+    builds an SSL context and reloads the OS trust store (~40 ms). It is safe to
+    reuse concurrently, so we keep one per endpoint instead of building a fresh
+    one on every request — otherwise that 40 ms of CPU runs on the event loop
+    per request and caps single-worker throughput at ~25 req/s.
+    """
+    cache = app_state["ollama_clients"]
+    client = cache.get(endpoint)
+    if client is None:
+        client = ollama.AsyncClient(host=endpoint)
+        cache[endpoint] = client
+    return client
+
+
 def _make_openai_client(
    endpoint: str,
    default_headers: dict | None = None,
    api_key: str = "no-key",
 ) -> openai.AsyncOpenAI:
-    """Return an AsyncOpenAI client configured for the given endpoint.
+    """Return a cached AsyncOpenAI client configured for the given endpoint.

-    For Unix socket endpoints, injects a pre-created httpx UDS transport
-    so the OpenAI SDK connects via the socket instead of TCP.
+    Clients are cached per ``(endpoint, api_key)`` and reused across requests;
+    ``default_headers`` is not part of the key and is ignored on a cache hit.
+    Constructing a client builds an SSL context and reloads the OS trust store
+    (~40 ms), which serializes the event loop if done per request. Unix socket
+    endpoints get the pre-created httpx UDS transport, so the SDK uses the socket.
    """
+    cache = app_state["openai_clients"]
+    cache_key = (endpoint, api_key)
+    client = cache.get(cache_key)
+    if client is not None:
+        return client
+
    base_url = ep2base(endpoint)
    kwargs: dict = {"api_key": api_key}
    if default_headers is not None:
@@ -89,4 +116,6 @@ def _make_openai_client(
        if http_client is not None:
            kwargs["http_client"] = http_client
            base_url = "http://localhost/v1"
-    return openai.AsyncOpenAI(base_url=base_url, **kwargs)
+    client = openai.AsyncOpenAI(base_url=base_url, **kwargs)
+    cache[cache_key] = client
+    return client