feat: cache backend clients per endpoint instead of building one (with a fresh SSL context) per request

2026-06-07 09:55:54 +02:00 · 2026-06-07 09:55:54 +02:00 · 3cd530586c
commit 3cd530586c
parent 1ce792c48b
5 changed files with 87 additions and 15 deletions
--- a/api/ollama.py
+++ b/api/ollama.py
@ -44,7 +44,7 @@ from backends.normalize import (
    _extract_llama_quant,
 )
 from backends.probe import fetch
-from backends.sessions import _make_openai_client, get_probe_session
+from backends.sessions import _make_openai_client, get_ollama_client, get_probe_session
 from requests.chat import _make_moe_requests
 from requests.messages import (
    transform_images_to_data_urls,
@ -187,7 +187,7 @@ async def proxy(request: Request):
        params.update({k: v for k, v in optional_params.items() if v is not None})
        oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
    else:
-        client = ollama.AsyncClient(host=endpoint)
+        client = get_ollama_client(endpoint)

    # 4. Async generator body (error handling + cleanup handled by _guarded_stream)
    async def stream_generate_response():
@ -364,7 +364,7 @@ async def chat_proxy(request: Request):
        params.update({k: v for k, v in optional_params.items() if v is not None})
        oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
    else:
-        client = ollama.AsyncClient(host=endpoint)
+        client = get_ollama_client(endpoint)
    # For OpenAI endpoints: make the API call in handler scope
    # (try/except inside async generators is unreliable with Starlette's streaming)
    start_ts = None
@ -598,7 +598,7 @@ async def _handle_embedding_request(
            model = model[0]
        client = _make_openai_client(endpoint, api_key=config.api_keys.get(endpoint, "no-key"))
    else:
-        client = ollama.AsyncClient(host=endpoint)
+        client = get_ollama_client(endpoint)

    # 3. Async generator body (error handling + cleanup handled by _guarded_stream)
    async def stream_embedding_response():
@ -688,7 +688,7 @@ async def create_proxy(request: Request):
    status_lists = []

    for endpoint in config.endpoints:
-        client = ollama.AsyncClient(host=endpoint)
+        client = get_ollama_client(endpoint)
        create = await client.create(model=model, quantize=quantize, from_=from_, files=files, adapters=adapters, template=template, license=license, system=system, parameters=parameters, messages=messages, stream=False)
        status_lists.append(create)

@ -724,7 +724,7 @@ async def show_proxy(request: Request, model: Optional[str] = None):
    # 2. Endpoint logic
    endpoint, _ = await choose_endpoint(model, reserve=False)

-    client = ollama.AsyncClient(host=endpoint)
+    client = get_ollama_client(endpoint)

    # 3. Proxy a simple show request
    show = await client.show(model=model)
@ -768,7 +768,7 @@ async def copy_proxy(request: Request, source: Optional[str] = None, destination

    for endpoint in config.endpoints:
        if "/v1" not in endpoint:
-            client = ollama.AsyncClient(host=endpoint)
+            client = get_ollama_client(endpoint)
            # 4. Proxy a simple copy request
            copy = await client.copy(source=src, destination=dst)
            status_list.append(copy.status)
@ -804,7 +804,7 @@ async def delete_proxy(request: Request, model: Optional[str] = None):

    for endpoint in config.endpoints:
        if "/v1" not in endpoint:
-            client = ollama.AsyncClient(host=endpoint)
+            client = get_ollama_client(endpoint)
            # 3. Proxy a simple copy request
            copy = await client.delete(model=model)
            status_list.append(copy.status)
@ -842,7 +842,7 @@ async def pull_proxy(request: Request, model: Optional[str] = None):

    for endpoint in config.endpoints:
        if "/v1" not in endpoint:
-            client = ollama.AsyncClient(host=endpoint)
+            client = get_ollama_client(endpoint)
            # 3. Proxy a simple pull request
            pull = await client.pull(model=model, insecure=insecure, stream=False)
            status_list.append(pull)
@ -882,7 +882,7 @@ async def push_proxy(request: Request):
    status_list = []

    for endpoint in config.endpoints:
-        client = ollama.AsyncClient(host=endpoint)
+        client = get_ollama_client(endpoint)
        # 3. Proxy a simple push request
        push = await client.push(model=model, insecure=insecure, stream=False)
        status_list.append(push)