cache loaded models to decrease load on ollamas

parent 06103e5f01
commit 8a05f2ac44

2 changed files with 18 additions and 7 deletions
.gitignore (vendored): 4 changes

@@ -51,10 +51,6 @@ cython_debug/
 # VS Code files for those working on multiple tools
 .vscode/*
-.vscode/settings.json
-!.vscode/tasks.json
-!.vscode/launch.json
-!.vscode/extensions.json
 *.code-workspace
 
 # Local History for Visual Studio Code
 
router.py: 21 changes
@@ -25,6 +25,7 @@ from PIL import Image
 # ------------------------------------------------------------------
 # Successful results are cached for 300s
 _models_cache: dict[str, tuple[Set[str], float]] = {}
+_loaded_models_cache: dict[str, tuple[Set[str], float]] = {}
 # Transient errors are cached for 1s – the key stays until the
 # timeout expires, after which the endpoint will be queried again.
 _error_cache: dict[str, float] = {}
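Both caches share one shape: endpoint URL → (set of model names, unix timestamp of the fetch). A minimal sketch of that read/write pattern; cache_put and cache_get are illustrative names, not functions that exist in router.py:

import time
from typing import Optional, Set

_loaded_models_cache: dict[str, tuple[Set[str], float]] = {}

def cache_put(endpoint: str, models: Set[str]) -> None:
    # Store the result together with the wall-clock time it was fetched.
    _loaded_models_cache[endpoint] = (models, time.time())

def cache_get(endpoint: str, ttl: float = 30.0) -> Optional[Set[str]]:
    # Serve the cached set while it is younger than ttl; expire on read,
    # mirroring the "stale entry – drop it" branch later in the diff.
    entry = _loaded_models_cache.get(endpoint)
    if entry is None:
        return None
    models, cached_at = entry
    if time.time() - cached_at < ttl:
        return models
    del _loaded_models_cache[endpoint]
    return None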
@@ -226,7 +227,7 @@ class fetch:
             del _models_cache[endpoint]
 
     if endpoint in _error_cache:
-        if _is_fresh(_error_cache[endpoint], 1):
+        if _is_fresh(_error_cache[endpoint], 10):
             # Still within the short error TTL – pretend nothing is available
             return set()
         else:
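Two notes on this hunk. The negative-cache TTL rises from 1s to 10s, so an endpoint that just errored is reported as having no models for up to ten seconds before being queried again; the module-level comment above ("Transient errors are cached for 1s") is not touched by this commit and is now stale. Also, _is_fresh itself never appears in the diff; a plausible implementation, assuming it simply compares the stored timestamp against the current time:

import time

def _is_fresh(cached_at: float, ttl_seconds: float) -> bool:
    # An entry counts as fresh while fewer than ttl_seconds have
    # elapsed since it was stored.
    return (time.time() - cached_at) < ttl_seconds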
@@ -269,9 +270,22 @@ class fetch:
     loaded on that endpoint. If the request fails (e.g. timeout, 5xx), an empty
     set is returned.
     """
-    client: aiohttp.ClientSession = app_state["session"]
     if is_ext_openai_endpoint(endpoint):
         return set()
+    if endpoint in _loaded_models_cache:
+        models, cached_at = _loaded_models_cache[endpoint]
+        if _is_fresh(cached_at, 30):
+            return models
+        else:
+            # stale entry – drop it
+            del _loaded_models_cache[endpoint]
+
+    if endpoint in _error_cache:
+        if _is_fresh(_error_cache[endpoint], 10):
+            return set()
+        else:
+            del _error_cache[endpoint]
+    client: aiohttp.ClientSession = app_state["session"]
     try:
         async with client.get(f"{endpoint}/api/ps") as resp:
             await _ensure_success(resp)
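GET /api/ps is the Ollama endpoint that reports models currently loaded into memory (as opposed to /api/tags, which lists every installed model). A self-contained sketch of the same call without the caching; loaded_models is an illustrative name, and it opens a throwaway session where router.py reuses the shared one from app_state:

import asyncio
import aiohttp

async def loaded_models(endpoint: str) -> set[str]:
    # Ask the Ollama server which models are resident in memory right now.
    async with aiohttp.ClientSession() as session:
        async with session.get(f"{endpoint}/api/ps") as resp:
            resp.raise_for_status()
            data = await resp.json()
            return {m["name"] for m in data.get("models", []) if m.get("name")}

# e.g. asyncio.run(loaded_models("http://localhost:11434"))  # default Ollama port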
@@ -279,6 +293,7 @@ class fetch:
         # The response format is:
         # {"models": [{"name": "model1"}, {"name": "model2"}]}
         models = {m.get("name") for m in data.get("models", []) if m.get("name")}
+        _loaded_models_cache[endpoint] = (models, time.time())
         return models
     except Exception as e:
         # If anything goes wrong we simply assume the endpoint has no models
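The comprehension is deliberately defensive: entries without a "name" key are skipped instead of raising. A quick illustration with made-up model names:

data = {"models": [{"name": "llama3:8b"}, {"name": "qwen2:7b"}, {}]}
models = {m.get("name") for m in data.get("models", []) if m.get("name")}
assert models == {"llama3:8b", "qwen2:7b"}  # the nameless entry is dropped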
@@ -975,7 +990,7 @@ async def embed_proxy(request: Request):
 
     # 2. Endpoint logic
     endpoint = await choose_endpoint(model)
-    is_openai_endpoint = "/v1" in endpoint
+    is_openai_endpoint = is_ext_openai_endpoint(endpoint) #"/v1" in endpoint
     if is_openai_endpoint:
         if ":latest" in model:
             model = model.split(":latest")
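Swapping the substring test for is_ext_openai_endpoint() makes the check explicit; "/v1" in endpoint would also match unrelated paths. One caveat in the unchanged context of this hunk: str.split returns a list, so model = model.split(":latest") rebinds model to something like ['llama3', '']. Presumably a suffix strip is intended; a sketch of what that would look like:

model = "llama3:latest"
model.split(":latest")         # ['llama3', ''] – a list, not a string
model.split(":latest")[0]      # 'llama3'
model.removesuffix(":latest")  # 'llama3' – clearest on Python 3.9+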