cache loaded models to decrease load on the ollama endpoints

Alpha Nerd 2025-11-17 14:40:24 +01:00
parent 06103e5f01
commit 8a05f2ac44
2 changed files with 18 additions and 7 deletions

.gitignore (4 deletions)

@@ -51,10 +51,6 @@ cython_debug/
 # VS Code files for those working on multiple tools
 .vscode/*
-.vscode/settings.json
-!.vscode/tasks.json
-!.vscode/launch.json
-!.vscode/extensions.json
 *.code-workspace

 # Local History for Visual Studio Code


@@ -25,6 +25,7 @@ from PIL import Image
 # ------------------------------------------------------------------
 # Successful results are cached for 300s
 _models_cache: dict[str, tuple[Set[str], float]] = {}
+_loaded_models_cache: dict[str, tuple[Set[str], float]] = {}
 # Transient errors are cached for 1s; the key stays until the
 # timeout expires, after which the endpoint will be queried again.
 _error_cache: dict[str, float] = {}
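
Both caches store a value alongside the time.time() at which it was recorded, and _is_fresh (defined elsewhere in the module, not shown in this diff) decides whether an entry is still within its TTL. A minimal sketch of what that helper presumably looks like, given how it is called in the hunks below:

import time

def _is_fresh(cached_at: float, ttl_seconds: float) -> bool:
    # An entry counts as fresh while fewer than ttl_seconds have
    # elapsed since it was stored.
    return (time.time() - cached_at) < ttl_seconds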
@@ -226,7 +227,7 @@ class fetch:
                 del _models_cache[endpoint]
         if endpoint in _error_cache:
-            if _is_fresh(_error_cache[endpoint], 1):
+            if _is_fresh(_error_cache[endpoint], 10):
                 # Still within the short error TTL; pretend nothing is available
                 return set()
             else:
@@ -269,9 +270,22 @@ class fetch:
         loaded on that endpoint. If the request fails (e.g. timeout, 5xx), an empty
         set is returned.
         """
-        client: aiohttp.ClientSession = app_state["session"]
         if is_ext_openai_endpoint(endpoint):
             return set()
+        if endpoint in _loaded_models_cache:
+            models, cached_at = _loaded_models_cache[endpoint]
+            if _is_fresh(cached_at, 30):
+                return models
+            else:
+                # stale entry; drop it
+                del _loaded_models_cache[endpoint]
+        if endpoint in _error_cache:
+            if _is_fresh(_error_cache[endpoint], 10):
+                return set()
+            else:
+                del _error_cache[endpoint]
+        client: aiohttp.ClientSession = app_state["session"]
         try:
             async with client.get(f"{endpoint}/api/ps") as resp:
                 await _ensure_success(resp)
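
Taken together, the added lines implement a read-through cache with a 30s positive TTL and a 10s negative TTL: successful probes are reused, and failures are remembered briefly so an unhealthy ollama instance is not hammered on every request. A self-contained sketch of the same pattern, with hypothetical names (cached_probe, _cache, _errors are illustrative, not from the codebase):

import time
from typing import Awaitable, Callable, Set

_cache: dict[str, tuple[Set[str], float]] = {}  # endpoint -> (models, cached_at)
_errors: dict[str, float] = {}                  # endpoint -> failed_at

async def cached_probe(
    endpoint: str,
    probe: Callable[[str], Awaitable[Set[str]]],
    ttl: float = 30.0,
    error_ttl: float = 10.0,
) -> Set[str]:
    now = time.time()
    if endpoint in _cache:
        models, cached_at = _cache[endpoint]
        if now - cached_at < ttl:
            return models            # fresh hit: no network round trip
        del _cache[endpoint]         # stale entry: drop it and re-probe
    if endpoint in _errors:
        if now - _errors[endpoint] < error_ttl:
            return set()             # still inside the error back-off window
        del _errors[endpoint]
    try:
        models = await probe(endpoint)
    except Exception:
        _errors[endpoint] = now      # negative-cache the failure
        return set()
    _cache[endpoint] = (models, now)
    return models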
@@ -279,6 +293,7 @@ class fetch:
                 # The response format is:
                 # {"models": [{"name": "model1"}, {"name": "model2"}]}
                 models = {m.get("name") for m in data.get("models", []) if m.get("name")}
+                _loaded_models_cache[endpoint] = (models, time.time())
                 return models
         except Exception as e:
             # If anything goes wrong we simply assume the endpoint has no models
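
/api/ps is ollama's endpoint for listing the models currently loaded in memory, and the comment above documents the response shape this code relies on. A standalone version of the probe, using aiohttp's built-in raise_for_status() in place of the project's _ensure_success helper (an assumption about what that helper checks):

from typing import Set

import aiohttp

async def probe_loaded_models(session: aiohttp.ClientSession, endpoint: str) -> Set[str]:
    # Ask ollama which models are loaded; the response format is:
    # {"models": [{"name": "model1"}, {"name": "model2"}]}
    async with session.get(f"{endpoint}/api/ps") as resp:
        resp.raise_for_status()
        data = await resp.json()
        return {m.get("name") for m in data.get("models", []) if m.get("name")}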
@@ -975,7 +990,7 @@ async def embed_proxy(request: Request):
     # 2. Endpoint logic
     endpoint = await choose_endpoint(model)
-    is_openai_endpoint = "/v1" in endpoint
+    is_openai_endpoint = is_ext_openai_endpoint(endpoint)  # "/v1" in endpoint
     if is_openai_endpoint:
         if ":latest" in model:
             model = model.split(":latest")
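
The replaced expression shows that external OpenAI-compatible endpoints were previously recognised by a bare "/v1" substring test. is_ext_openai_endpoint itself is not shown in this diff; a hypothetical sketch of a slightly more robust check along the same lines:

def is_ext_openai_endpoint(endpoint: str) -> bool:
    # Hypothetical: treat an endpoint as an external OpenAI-compatible
    # API when its path ends in /v1, rather than matching the substring
    # anywhere in the URL.
    return endpoint.rstrip("/").endswith("/v1")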