feat: cache backend clients per endpoint instead of building one (with a fresh SSL context) per request
All checks were successful
Build and Publish Docker Image (Semantic Cache) / build (amd64, linux/amd64, docker-amd64) (push) Successful in 3m59s
Build and Publish Docker Image / build (amd64, linux/amd64, docker-amd64) (push) Successful in 1m25s
Build and Publish Docker Image / build (arm64, linux/arm64, docker-arm64) (push) Successful in 12m46s
Build and Publish Docker Image / merge (push) Successful in 33s
Build and Publish Docker Image (Semantic Cache) / build (arm64, linux/arm64, docker-arm64) (push) Successful in 19m56s
Build and Publish Docker Image (Semantic Cache) / merge (push) Successful in 33s
All checks were successful
Build and Publish Docker Image (Semantic Cache) / build (amd64, linux/amd64, docker-amd64) (push) Successful in 3m59s
Build and Publish Docker Image / build (amd64, linux/amd64, docker-amd64) (push) Successful in 1m25s
Build and Publish Docker Image / build (arm64, linux/arm64, docker-arm64) (push) Successful in 12m46s
Build and Publish Docker Image / merge (push) Successful in 33s
Build and Publish Docker Image (Semantic Cache) / build (arm64, linux/arm64, docker-arm64) (push) Successful in 19m56s
Build and Publish Docker Image (Semantic Cache) / merge (push) Successful in 33s
This commit is contained in:
parent
1ce792c48b
commit
3cd530586c
5 changed files with 87 additions and 15 deletions
|
|
@ -44,7 +44,7 @@ from backends.normalize import (
|
|||
_extract_llama_quant,
|
||||
)
|
||||
from backends.probe import fetch
|
||||
from backends.sessions import _make_openai_client, get_probe_session
|
||||
from backends.sessions import _make_openai_client, get_ollama_client, get_probe_session
|
||||
from requests.chat import _make_moe_requests
|
||||
from requests.messages import (
|
||||
transform_images_to_data_urls,
|
||||
|
|
@ -187,7 +187,7 @@ async def proxy(request: Request):
|
|||
params.update({k: v for k, v in optional_params.items() if v is not None})
|
||||
oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
|
||||
else:
|
||||
client = ollama.AsyncClient(host=endpoint)
|
||||
client = get_ollama_client(endpoint)
|
||||
|
||||
# 4. Async generator body (error handling + cleanup handled by _guarded_stream)
|
||||
async def stream_generate_response():
|
||||
|
|
@ -364,7 +364,7 @@ async def chat_proxy(request: Request):
|
|||
params.update({k: v for k, v in optional_params.items() if v is not None})
|
||||
oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
|
||||
else:
|
||||
client = ollama.AsyncClient(host=endpoint)
|
||||
client = get_ollama_client(endpoint)
|
||||
# For OpenAI endpoints: make the API call in handler scope
|
||||
# (try/except inside async generators is unreliable with Starlette's streaming)
|
||||
start_ts = None
|
||||
|
|
@ -598,7 +598,7 @@ async def _handle_embedding_request(
|
|||
model = model[0]
|
||||
client = _make_openai_client(endpoint, api_key=config.api_keys.get(endpoint, "no-key"))
|
||||
else:
|
||||
client = ollama.AsyncClient(host=endpoint)
|
||||
client = get_ollama_client(endpoint)
|
||||
|
||||
# 3. Async generator body (error handling + cleanup handled by _guarded_stream)
|
||||
async def stream_embedding_response():
|
||||
|
|
@ -688,7 +688,7 @@ async def create_proxy(request: Request):
|
|||
status_lists = []
|
||||
|
||||
for endpoint in config.endpoints:
|
||||
client = ollama.AsyncClient(host=endpoint)
|
||||
client = get_ollama_client(endpoint)
|
||||
create = await client.create(model=model, quantize=quantize, from_=from_, files=files, adapters=adapters, template=template, license=license, system=system, parameters=parameters, messages=messages, stream=False)
|
||||
status_lists.append(create)
|
||||
|
||||
|
|
@ -724,7 +724,7 @@ async def show_proxy(request: Request, model: Optional[str] = None):
|
|||
# 2. Endpoint logic
|
||||
endpoint, _ = await choose_endpoint(model, reserve=False)
|
||||
|
||||
client = ollama.AsyncClient(host=endpoint)
|
||||
client = get_ollama_client(endpoint)
|
||||
|
||||
# 3. Proxy a simple show request
|
||||
show = await client.show(model=model)
|
||||
|
|
@ -768,7 +768,7 @@ async def copy_proxy(request: Request, source: Optional[str] = None, destination
|
|||
|
||||
for endpoint in config.endpoints:
|
||||
if "/v1" not in endpoint:
|
||||
client = ollama.AsyncClient(host=endpoint)
|
||||
client = get_ollama_client(endpoint)
|
||||
# 4. Proxy a simple copy request
|
||||
copy = await client.copy(source=src, destination=dst)
|
||||
status_list.append(copy.status)
|
||||
|
|
@ -804,7 +804,7 @@ async def delete_proxy(request: Request, model: Optional[str] = None):
|
|||
|
||||
for endpoint in config.endpoints:
|
||||
if "/v1" not in endpoint:
|
||||
client = ollama.AsyncClient(host=endpoint)
|
||||
client = get_ollama_client(endpoint)
|
||||
# 3. Proxy a simple delete request
|
||||
copy = await client.delete(model=model)
|
||||
status_list.append(copy.status)
|
||||
|
|
@ -842,7 +842,7 @@ async def pull_proxy(request: Request, model: Optional[str] = None):
|
|||
|
||||
for endpoint in config.endpoints:
|
||||
if "/v1" not in endpoint:
|
||||
client = ollama.AsyncClient(host=endpoint)
|
||||
client = get_ollama_client(endpoint)
|
||||
# 3. Proxy a simple pull request
|
||||
pull = await client.pull(model=model, insecure=insecure, stream=False)
|
||||
status_list.append(pull)
|
||||
|
|
@ -882,7 +882,7 @@ async def push_proxy(request: Request):
|
|||
status_list = []
|
||||
|
||||
for endpoint in config.endpoints:
|
||||
client = ollama.AsyncClient(host=endpoint)
|
||||
client = get_ollama_client(endpoint)
|
||||
# 3. Proxy a simple push request
|
||||
push = await client.push(model=model, insecure=insecure, stream=False)
|
||||
status_list.append(push)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue