fix: always treat external OpenAI-compatible endpoints as having their advertised models loaded #128
This commit is contained in:
parent
1b3ad057a6
commit
c9e495876b
2 changed files with 21 additions and 4 deletions
|
|
@@ -321,7 +321,16 @@ class fetch:
|
|||
to serve requests immediately even when cache is stale (refreshing in background).
|
||||
"""
|
||||
if is_ext_openai_endpoint(endpoint):
|
||||
return set()
|
||||
# External OpenAI-compatible backends (vLLM, OpenAI, Groq, …) keep
|
||||
# every advertised model permanently resident — there is no
|
||||
# /api/ps-style "loaded" subset to probe. Report the advertised set
|
||||
# as the loaded set so choose_endpoint's "loaded & free" preference
|
||||
# treats them on par with Ollama/llama-server backends that have the
|
||||
# model hot, instead of relegating them to the free-slot fallback and
|
||||
# never using them when an Ollama box advertises the same model.
|
||||
# See issue #128. (llama-server / llama-swap are NOT ext-openai — they
|
||||
# keep their real resident-subset detection below.)
|
||||
return await fetch.available_models(endpoint, get_config().api_keys.get(endpoint))
|
||||
|
||||
# Check loaded models cache with lock protection
|
||||
async with _loaded_models_cache_lock:
|
||||
|
|
|
|||
|
|
@@ -260,12 +260,20 @@ class TestFetchLoadedModels:
|
|||
models = await router.fetch.loaded_models(MOCK_OLLAMA_EP)
|
||||
assert models == set()
|
||||
|
||||
async def test_ext_openai_always_empty(self):
|
||||
async def test_ext_openai_reports_advertised_as_loaded(self):
|
||||
# Issue #128: external OpenAI-compatible backends (vLLM, OpenAI, …) keep
|
||||
# every advertised model permanently resident, so loaded_models mirrors
|
||||
# the advertised set rather than returning empty (which made them lose
|
||||
# choose_endpoint's "loaded & free" preference to Ollama boxes).
|
||||
ext_ep = "https://api.openai.com/v1"
|
||||
cfg = _make_cfg(ollama_eps=[ext_ep], llama_eps=[])
|
||||
with patch.object(router, "config", cfg):
|
||||
with patch.object(router, "config", cfg), mock_probe() as m:
|
||||
m.add_get(
|
||||
f"{ext_ep}/models",
|
||||
payload={"data": [{"id": "gpt-4o"}, {"id": "gpt-4o-mini"}]},
|
||||
)
|
||||
models = await router.fetch.loaded_models(ext_ep)
|
||||
assert models == set()
|
||||
assert models == {"gpt-4o", "gpt-4o-mini"}
|
||||
|
||||
async def test_caches_result(self):
|
||||
cfg = _make_cfg(ollama_eps=[MOCK_OLLAMA_EP], llama_eps=[])
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue