cache loaded models to decrease load on ollama endpoints

Alpha Nerd 2025-11-17 14:40:24 +01:00
parent 06103e5f01
commit 8a05f2ac44
2 changed files with 18 additions and 7 deletions

.gitignore

@@ -51,10 +51,6 @@ cython_debug/
# VS Code files for those working on multiple tools
.vscode/*
.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace
# Local History for Visual Studio Code


@@ -25,6 +25,7 @@ from PIL import Image
# ------------------------------------------------------------------
# Successful results are cached for 300s
_models_cache: dict[str, tuple[Set[str], float]] = {}
_loaded_models_cache: dict[str, tuple[Set[str], float]] = {}
# Transient errors are cached for 1s; the key stays until the
# timeout expires, after which the endpoint will be queried again.
_error_cache: dict[str, float] = {}
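These TTLs are enforced by _is_fresh, whose implementation is not part of this diff. A minimal sketch, assuming it simply compares a stored time.time() timestamp against a TTL given in seconds:

    import time

    def _is_fresh(cached_at: float, ttl: float) -> bool:
        # True while the cached timestamp is younger than the TTL (in seconds).
        return (time.time() - cached_at) < ttl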
@@ -226,7 +227,7 @@ class fetch:
del _models_cache[endpoint]
if endpoint in _error_cache:
-if _is_fresh(_error_cache[endpoint], 1):
+if _is_fresh(_error_cache[endpoint], 10):
# Still within the short error TTL; pretend nothing is available
return set()
else:
@@ -269,9 +270,22 @@ class fetch:
loaded on that endpoint. If the request fails (e.g. timeout, 5xx), an empty
set is returned.
"""
client: aiohttp.ClientSession = app_state["session"]
if is_ext_openai_endpoint(endpoint):
return set()
if endpoint in _loaded_models_cache:
models, cached_at = _loaded_models_cache[endpoint]
if _is_fresh(cached_at, 30):
return models
else:
# stale entry; drop it
del _loaded_models_cache[endpoint]
if endpoint in _error_cache:
if _is_fresh(_error_cache[endpoint], 10):
return set()
else:
del _error_cache[endpoint]
client: aiohttp.ClientSession = app_state["session"]
try:
async with client.get(f"{endpoint}/api/ps") as resp:
await _ensure_success(resp)
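_ensure_success is likewise not shown in this diff; a minimal sketch, under the assumption that it only raises for non-2xx responses via aiohttp's built-in check, so the except branch below treats the endpoint as unavailable:

    import aiohttp

    async def _ensure_success(resp: aiohttp.ClientResponse) -> None:
        # Raises aiohttp.ClientResponseError for any non-2xx status.
        resp.raise_for_status()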
@@ -279,6 +293,7 @@ class fetch:
# The response format is:
# {"models": [{"name": "model1"}, {"name": "model2"}]}
models = {m.get("name") for m in data.get("models", []) if m.get("name")}
_loaded_models_cache[endpoint] = (models, time.time())
return models
except Exception as e:
# If anything goes wrong we simply assume the endpoint has no models
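Taken together, a call sequence against a single backend could look like the sketch below; the method name loaded_models and the endpoint URL are illustrative assumptions, since the diff does not show the enclosing method signature:

    # First call: /api/ps is queried and the result is stored in
    # _loaded_models_cache together with time.time().
    loaded = await fetch.loaded_models("http://ollama-1:11434")

    # Calls within the next 30 seconds are answered from the cache,
    # so the backend sees no additional /api/ps traffic.
    loaded_again = await fetch.loaded_models("http://ollama-1:11434")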
@@ -975,7 +990,7 @@ async def embed_proxy(request: Request):
# 2. Endpoint logic
endpoint = await choose_endpoint(model)
-is_openai_endpoint = "/v1" in endpoint
+is_openai_endpoint = is_ext_openai_endpoint(endpoint)  # "/v1" in endpoint
if is_openai_endpoint:
if ":latest" in model:
model = model.split(":latest")[0]
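For OpenAI-compatible endpoints the Ollama-style ":latest" tag has to be stripped from the model name. A small sketch of that normalization; the helper name is hypothetical, and str.removesuffix is an alternative to the split used above that avoids indexing into the resulting list:

    def strip_latest_tag(model: str) -> str:
        # "llama3:latest" -> "llama3"; names without the tag pass through unchanged.
        return model.removesuffix(":latest")

    assert strip_latest_tag("llama3:latest") == "llama3"
    assert strip_latest_tag("llama3:8b-instruct") == "llama3:8b-instruct"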