fix: routing error for openai compatible endpoints #128

This commit is contained in:
Alpha Nerd 2026-06-25 14:13:07 +02:00
parent 4f42f350a3
commit 1b3ad057a6
Signed by: alpha-nerd
SSH key fingerprint: SHA256:QkkAgVoYi9TQ0UKPkiKSfnerZy2h4qhi3SVPXJmBN+M
2 changed files with 64 additions and 3 deletions

View file

@ -99,9 +99,19 @@ async def choose_endpoint(model: str, reserve: bool = True,
llama_eps_extra = [ep for ep in llama_endpoints(config) if ep not in config.endpoints]
all_endpoints = config.endpoints + llama_eps_extra
tag_tasks = [fetch.available_models(ep) for ep in config.endpoints if not is_openai_compatible(ep)]
tag_tasks += [fetch.available_models(ep, config.api_keys.get(ep)) for ep in config.endpoints if is_openai_compatible(ep)]
tag_tasks += [fetch.available_models(ep, config.api_keys.get(ep)) for ep in llama_eps_extra]
# Build the probe tasks in the SAME order as ``all_endpoints`` so the
# gathered results stay aligned for the ``zip(all_endpoints, advertised_sets)``
# below. (A previous partition into non-OpenAI / OpenAI groups reordered the
# tasks relative to ``all_endpoints``, pairing each endpoint with another
# endpoint's advertised models — so an OpenAI-compatible backend listed
# before an Ollama one would inherit the Ollama model set and get a request
# for a model it cannot serve, 404ing. See issue #128.)
def _advertised_task(ep: str):
    """Build the advertised-models probe for a single endpoint.

    OpenAI-compatible endpoints are probed with the API key configured
    for them in ``config.api_keys`` (``None`` when no key is set); every
    other endpoint is probed without a key argument.

    Returns whatever ``fetch.available_models`` returns for ``ep``; the
    caller collects these and awaits them together with
    ``asyncio.gather``.

    NOTE(review): the partitioned code this replaces always passed the
    API key for endpoints in ``llama_eps_extra``; here such an endpoint
    only gets its key when ``is_openai_compatible(ep)`` is true —
    confirm that holds for every llama endpoint.
    """
    if not is_openai_compatible(ep):
        return fetch.available_models(ep)
    api_key = config.api_keys.get(ep)
    return fetch.available_models(ep, api_key)
tag_tasks = [_advertised_task(ep) for ep in all_endpoints]
advertised_sets = await asyncio.gather(*tag_tasks)
# 2️⃣ Filter endpoints that advertise the requested model

View file

@ -279,6 +279,57 @@ class TestChooseEndpointModelNaming:
assert ep == EP1
class TestChooseEndpointAdvertisedAlignment:
    """Regression tests for issue #128.

    Each endpoint must be paired with its *own* advertised-model set, no
    matter where it sits in the configured endpoint list. An earlier
    implementation grouped the probe tasks (non-OpenAI first, then
    OpenAI-compatible) while the endpoint list stayed in config order, so
    an OpenAI-compatible endpoint listed *before* an Ollama one was
    matched with the Ollama model set and received requests for models it
    does not serve (404).
    """

    OPENAI_EP = "http://vllm:8000/v1"

    async def _route(self, model):
        """Return the endpoint chosen for *model*.

        The config lists the OpenAI-compatible endpoint FIRST, followed
        by ``EP1``. ``fetch.available_models`` and ``fetch.loaded_models``
        are replaced with stubs so each endpoint advertises a disjoint
        model set.
        """
        cfg = _make_cfg([self.OPENAI_EP, EP1])

        async def available(ep, *_):
            # Only the Ollama box advertises the qwen model; the
            # OpenAI-compatible box advertises its own model only.
            return (
                {"vllm-only-model"}
                if ep == self.OPENAI_EP
                else {"qwen2.5-coder:1.5b-base"}
            )

        async def loaded(ep):
            if ep != EP1:
                return set()
            return {"qwen2.5-coder:1.5b-base"}

        with (
            patch.object(router, "config", cfg),
            patch.object(router.fetch, "available_models", side_effect=available),
            patch.object(router.fetch, "loaded_models", side_effect=loaded),
        ):
            chosen, _ = await router.choose_endpoint(model)
        return chosen

    async def test_openai_endpoint_before_ollama_does_not_misroute(self):
        # Must route to the endpoint that actually advertises the model.
        chosen = await self._route("qwen2.5-coder:1.5b-base")
        assert chosen == EP1

    async def test_openai_endpoint_before_ollama_still_routes_its_own_model(self):
        # The OpenAI-compatible endpoint still serves the model it advertises.
        chosen = await self._route("vllm-only-model")
        assert chosen == self.OPENAI_EP
class TestChooseEndpointLoadBalancing:
async def test_random_selection_among_idle(self):
cfg = _make_cfg([EP1, EP2, EP3])