feat: cache backend clients per endpoint instead of building one (with a fresh SSL context) per request
All checks were successful
Build and Publish Docker Image (Semantic Cache) / build (amd64, linux/amd64, docker-amd64) (push) Successful in 3m59s
Build and Publish Docker Image / build (amd64, linux/amd64, docker-amd64) (push) Successful in 1m25s
Build and Publish Docker Image / build (arm64, linux/arm64, docker-arm64) (push) Successful in 12m46s
Build and Publish Docker Image / merge (push) Successful in 33s
Build and Publish Docker Image (Semantic Cache) / build (arm64, linux/arm64, docker-arm64) (push) Successful in 19m56s
Build and Publish Docker Image (Semantic Cache) / merge (push) Successful in 33s

This commit is contained in:
Alpha Nerd 2026-06-07 09:55:54 +02:00
parent 1ce792c48b
commit 3cd530586c
Signed by: alpha-nerd
SSH key fingerprint: SHA256:QkkAgVoYi9TQ0UKPkiKSfnerZy2h4qhi3SVPXJmBN+M
5 changed files with 87 additions and 15 deletions

View file

@ -8,6 +8,7 @@ populate them once and routes can reuse them.
import os
import aiohttp
import ollama
import openai
from state import app_state
@ -70,16 +71,42 @@ def get_probe_session(endpoint: str) -> aiohttp.ClientSession:
return app_state.get("probe_session") or app_state["session"]
def get_ollama_client(endpoint: str) -> ollama.AsyncClient:
    """Look up the shared ``ollama.AsyncClient`` for *endpoint*, building it on first use.

    Clients live in ``app_state["ollama_clients"]``, keyed by the endpoint
    string. The first call for an endpoint constructs a client with
    ``host=endpoint`` and stores it; every later call returns that same object.

    Why cache: ``ollama.AsyncClient`` wraps an ``httpx.AsyncClient``, and
    constructing one builds an SSL context and reloads the OS trust store
    (~40 ms). The client is safe to reuse concurrently, so one instance per
    endpoint is enough. Building a fresh client per request would instead
    spend that ~40 ms of CPU on the event loop each time and cap
    single-worker throughput at ~25 req/s.
    """
    clients = app_state["ollama_clients"]
    existing = clients.get(endpoint)
    if existing is not None:
        # Fast path: this endpoint already has a client.
        return existing

    # First request for this endpoint: pay the construction cost once.
    created = ollama.AsyncClient(host=endpoint)
    clients[endpoint] = created
    return created
def _make_openai_client(
endpoint: str,
default_headers: dict | None = None,
api_key: str = "no-key",
) -> openai.AsyncOpenAI:
"""Return an AsyncOpenAI client configured for the given endpoint.
"""Return a cached AsyncOpenAI client configured for the given endpoint.
For Unix socket endpoints, injects a pre-created httpx UDS transport
so the OpenAI SDK connects via the socket instead of TCP.
Clients are cached per ``(endpoint, api_key)`` and reused across requests:
constructing one builds an SSL context and reloads the OS trust store
(~40 ms), which serializes the event loop if done per request. For Unix
socket endpoints, injects the pre-created httpx UDS transport so the OpenAI
SDK connects via the socket instead of TCP.
"""
cache = app_state["openai_clients"]
cache_key = (endpoint, api_key)
client = cache.get(cache_key)
if client is not None:
return client
base_url = ep2base(endpoint)
kwargs: dict = {"api_key": api_key}
if default_headers is not None:
@ -89,4 +116,6 @@ def _make_openai_client(
if http_client is not None:
kwargs["http_client"] = http_client
base_url = "http://localhost/v1"
return openai.AsyncOpenAI(base_url=base_url, **kwargs)
client = openai.AsyncOpenAI(base_url=base_url, **kwargs)
cache[cache_key] = client
return client