feat: cache backend clients per endpoint instead of building one (with a fresh SSL context) per request
All checks were successful
Build and Publish Docker Image (Semantic Cache) / build (amd64, linux/amd64, docker-amd64) (push) Successful in 3m59s
Build and Publish Docker Image / build (amd64, linux/amd64, docker-amd64) (push) Successful in 1m25s
Build and Publish Docker Image / build (arm64, linux/arm64, docker-arm64) (push) Successful in 12m46s
Build and Publish Docker Image / merge (push) Successful in 33s
Build and Publish Docker Image (Semantic Cache) / build (arm64, linux/arm64, docker-arm64) (push) Successful in 19m56s
Build and Publish Docker Image (Semantic Cache) / merge (push) Successful in 33s
All checks were successful
Build and Publish Docker Image (Semantic Cache) / build (amd64, linux/amd64, docker-amd64) (push) Successful in 3m59s
Build and Publish Docker Image / build (amd64, linux/amd64, docker-amd64) (push) Successful in 1m25s
Build and Publish Docker Image / build (arm64, linux/arm64, docker-arm64) (push) Successful in 12m46s
Build and Publish Docker Image / merge (push) Successful in 33s
Build and Publish Docker Image (Semantic Cache) / build (arm64, linux/arm64, docker-arm64) (push) Successful in 19m56s
Build and Publish Docker Image (Semantic Cache) / merge (push) Successful in 33s
This commit is contained in:
parent
1ce792c48b
commit
3cd530586c
5 changed files with 87 additions and 15 deletions
|
|
@ -8,6 +8,7 @@ populate them once and routes can reuse them.
|
|||
import os
|
||||
|
||||
import aiohttp
|
||||
import ollama
|
||||
import openai
|
||||
|
||||
from state import app_state
|
||||
|
|
@ -70,16 +71,42 @@ def get_probe_session(endpoint: str) -> aiohttp.ClientSession:
|
|||
return app_state.get("probe_session") or app_state["session"]
|
||||
|
||||
|
||||
def get_ollama_client(endpoint: str) -> ollama.AsyncClient:
    """Look up the shared ``ollama.AsyncClient`` for *endpoint*, building it on first use.

    Each ``ollama.AsyncClient`` owns an ``httpx.AsyncClient``; constructing
    that inner client sets up an SSL context and re-reads the OS trust store,
    which costs roughly 40 ms of CPU. Doing that on every request would run
    on the event loop each time and limit a single worker to about 25 req/s.
    The client can be shared by concurrent callers, so exactly one instance
    is kept per endpoint in ``app_state["ollama_clients"]``.

    Args:
        endpoint: Backend address, passed to the client as ``host`` and used
            as the cache key.

    Returns:
        The cached client for ``endpoint``; a new one is created and stored
        when the cache has no entry for it yet.
    """
    clients = app_state["ollama_clients"]

    # Fast path: this endpoint already has a client.
    existing = clients.get(endpoint)
    if existing is not None:
        return existing

    # First request for this endpoint: pay the construction cost once and
    # remember the result for every later call.
    created = ollama.AsyncClient(host=endpoint)
    clients[endpoint] = created
    return created
|
||||
|
||||
|
||||
def _make_openai_client(
|
||||
endpoint: str,
|
||||
default_headers: dict | None = None,
|
||||
api_key: str = "no-key",
|
||||
) -> openai.AsyncOpenAI:
|
||||
"""Return an AsyncOpenAI client configured for the given endpoint.
|
||||
"""Return a cached AsyncOpenAI client configured for the given endpoint.
|
||||
|
||||
For Unix socket endpoints, injects a pre-created httpx UDS transport
|
||||
so the OpenAI SDK connects via the socket instead of TCP.
|
||||
Clients are cached per ``(endpoint, api_key)`` and reused across requests:
|
||||
constructing one builds an SSL context and reloads the OS trust store
|
||||
(~40 ms), which serializes the event loop if done per request. For Unix
|
||||
socket endpoints, injects the pre-created httpx UDS transport so the OpenAI
|
||||
SDK connects via the socket instead of TCP.
|
||||
"""
|
||||
cache = app_state["openai_clients"]
|
||||
cache_key = (endpoint, api_key)
|
||||
client = cache.get(cache_key)
|
||||
if client is not None:
|
||||
return client
|
||||
|
||||
base_url = ep2base(endpoint)
|
||||
kwargs: dict = {"api_key": api_key}
|
||||
if default_headers is not None:
|
||||
|
|
@ -89,4 +116,6 @@ def _make_openai_client(
|
|||
if http_client is not None:
|
||||
kwargs["http_client"] = http_client
|
||||
base_url = "http://localhost/v1"
|
||||
return openai.AsyncOpenAI(base_url=base_url, **kwargs)
|
||||
client = openai.AsyncOpenAI(base_url=base_url, **kwargs)
|
||||
cache[cache_key] = client
|
||||
return client
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue