feat: cache backend clients per endpoint instead of building one (with a fresh SSL context) per request
All checks were successful
Build and Publish Docker Image (Semantic Cache) / build (amd64, linux/amd64, docker-amd64) (push) Successful in 3m59s
Build and Publish Docker Image / build (amd64, linux/amd64, docker-amd64) (push) Successful in 1m25s
Build and Publish Docker Image / build (arm64, linux/arm64, docker-arm64) (push) Successful in 12m46s
Build and Publish Docker Image / merge (push) Successful in 33s
Build and Publish Docker Image (Semantic Cache) / build (arm64, linux/arm64, docker-arm64) (push) Successful in 19m56s
Build and Publish Docker Image (Semantic Cache) / merge (push) Successful in 33s

This commit is contained in:
Alpha Nerd 2026-06-07 09:55:54 +02:00
parent 1ce792c48b
commit 3cd530586c
Signed by: alpha-nerd
SSH key fingerprint: SHA256:QkkAgVoYi9TQ0UKPkiKSfnerZy2h4qhi3SVPXJmBN+M
5 changed files with 87 additions and 15 deletions

View file

@ -44,7 +44,7 @@ from backends.normalize import (
_extract_llama_quant,
)
from backends.probe import fetch
from backends.sessions import _make_openai_client, get_probe_session
from backends.sessions import _make_openai_client, get_ollama_client, get_probe_session
from requests.chat import _make_moe_requests
from requests.messages import (
transform_images_to_data_urls,
@ -187,7 +187,7 @@ async def proxy(request: Request):
params.update({k: v for k, v in optional_params.items() if v is not None})
oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
else:
client = ollama.AsyncClient(host=endpoint)
client = get_ollama_client(endpoint)
# 4. Async generator body (error handling + cleanup handled by _guarded_stream)
async def stream_generate_response():
@ -364,7 +364,7 @@ async def chat_proxy(request: Request):
params.update({k: v for k, v in optional_params.items() if v is not None})
oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
else:
client = ollama.AsyncClient(host=endpoint)
client = get_ollama_client(endpoint)
# For OpenAI endpoints: make the API call in handler scope
# (try/except inside async generators is unreliable with Starlette's streaming)
start_ts = None
@ -598,7 +598,7 @@ async def _handle_embedding_request(
model = model[0]
client = _make_openai_client(endpoint, api_key=config.api_keys.get(endpoint, "no-key"))
else:
client = ollama.AsyncClient(host=endpoint)
client = get_ollama_client(endpoint)
# 3. Async generator body (error handling + cleanup handled by _guarded_stream)
async def stream_embedding_response():
@ -688,7 +688,7 @@ async def create_proxy(request: Request):
status_lists = []
for endpoint in config.endpoints:
client = ollama.AsyncClient(host=endpoint)
client = get_ollama_client(endpoint)
create = await client.create(model=model, quantize=quantize, from_=from_, files=files, adapters=adapters, template=template, license=license, system=system, parameters=parameters, messages=messages, stream=False)
status_lists.append(create)
@ -724,7 +724,7 @@ async def show_proxy(request: Request, model: Optional[str] = None):
# 2. Endpoint logic
endpoint, _ = await choose_endpoint(model, reserve=False)
client = ollama.AsyncClient(host=endpoint)
client = get_ollama_client(endpoint)
# 3. Proxy a simple show request
show = await client.show(model=model)
@ -768,7 +768,7 @@ async def copy_proxy(request: Request, source: Optional[str] = None, destination
for endpoint in config.endpoints:
if "/v1" not in endpoint:
client = ollama.AsyncClient(host=endpoint)
client = get_ollama_client(endpoint)
# 4. Proxy a simple copy request
copy = await client.copy(source=src, destination=dst)
status_list.append(copy.status)
@ -804,7 +804,7 @@ async def delete_proxy(request: Request, model: Optional[str] = None):
for endpoint in config.endpoints:
if "/v1" not in endpoint:
client = ollama.AsyncClient(host=endpoint)
client = get_ollama_client(endpoint)
# 3. Proxy a simple delete request
copy = await client.delete(model=model)
status_list.append(copy.status)
@ -842,7 +842,7 @@ async def pull_proxy(request: Request, model: Optional[str] = None):
for endpoint in config.endpoints:
if "/v1" not in endpoint:
client = ollama.AsyncClient(host=endpoint)
client = get_ollama_client(endpoint)
# 3. Proxy a simple pull request
pull = await client.pull(model=model, insecure=insecure, stream=False)
status_list.append(pull)
@ -882,7 +882,7 @@ async def push_proxy(request: Request):
status_list = []
for endpoint in config.endpoints:
client = ollama.AsyncClient(host=endpoint)
client = get_ollama_client(endpoint)
# 3. Proxy a simple push request
push = await client.push(model=model, insecure=insecure, stream=False)
status_list.append(push)