diff --git a/routing.py b/routing.py
index 9afe2f7..17a03bd 100644
--- a/routing.py
+++ b/routing.py
@@ -99,9 +99,22 @@ async def choose_endpoint(model: str, reserve: bool = True,
 
     llama_eps_extra = [ep for ep in llama_endpoints(config) if ep not in config.endpoints]
     all_endpoints = config.endpoints + llama_eps_extra
-    tag_tasks = [fetch.available_models(ep) for ep in config.endpoints if not is_openai_compatible(ep)]
-    tag_tasks += [fetch.available_models(ep, config.api_keys.get(ep)) for ep in config.endpoints if is_openai_compatible(ep)]
-    tag_tasks += [fetch.available_models(ep, config.api_keys.get(ep)) for ep in llama_eps_extra]
+    # Build the probe tasks in the SAME order as ``all_endpoints`` so the
+    # gathered results stay aligned for the ``zip(all_endpoints, advertised_sets)``
+    # below. (A previous partition into non-OpenAI / OpenAI groups reordered the
+    # tasks relative to ``all_endpoints``, pairing each endpoint with another
+    # endpoint's advertised models — so an OpenAI-compatible backend listed
+    # before an Ollama one would inherit the Ollama model set and get a request
+    # for a model it cannot serve, 404ing. See issue #128.)
+    #
+    # Extra llama endpoints were always probed with their API key before this
+    # change, so they keep it even when ``is_openai_compatible`` is false for them.
+    def _advertised_task(ep: str):
+        if is_openai_compatible(ep) or ep in llama_eps_extra:
+            return fetch.available_models(ep, config.api_keys.get(ep))
+        return fetch.available_models(ep)
+
+    tag_tasks = [_advertised_task(ep) for ep in all_endpoints]
     advertised_sets = await asyncio.gather(*tag_tasks)
 
     # 2️⃣ Filter endpoints that advertise the requested model
diff --git a/test/test_choose_endpoint.py b/test/test_choose_endpoint.py
index 17650c4..2bc4985 100644
--- a/test/test_choose_endpoint.py
+++ b/test/test_choose_endpoint.py
@@ -279,6 +279,57 @@ class TestChooseEndpointModelNaming:
         assert ep == EP1
 
 
+class TestChooseEndpointAdvertisedAlignment:
+    """Regression for issue #128 — advertised-model sets must stay aligned with
+    their endpoints regardless of config ordering. A previous implementation
+    partitioned the probe tasks (non-OpenAI first, then OpenAI-compatible) while
+    the endpoint list kept config order, so an OpenAI-compatible endpoint listed
+    *before* an Ollama one inherited the Ollama model set and received requests
+    for models it does not serve (404)."""
+
+    OPENAI_EP = "http://vllm:8000/v1"
+
+    async def test_openai_endpoint_before_ollama_does_not_misroute(self):
+        # OpenAI-compatible endpoint deliberately listed FIRST in config.
+        cfg = _make_cfg([self.OPENAI_EP, EP1])
+
+        async def available(ep, *_):
+            if ep == self.OPENAI_EP:
+                return {"vllm-only-model"}
+            return {"qwen2.5-coder:1.5b-base"}  # only the Ollama box advertises this
+
+        async def loaded(ep):
+            return {"qwen2.5-coder:1.5b-base"} if ep == EP1 else set()
+
+        with (
+            patch.object(router, "config", cfg),
+            patch.object(router.fetch, "available_models", side_effect=available),
+            patch.object(router.fetch, "loaded_models", side_effect=loaded),
+        ):
+            ep, _ = await router.choose_endpoint("qwen2.5-coder:1.5b-base")
+        # Must route to the endpoint that actually advertises the model.
+        assert ep == EP1
+
+    async def test_openai_endpoint_before_ollama_still_routes_its_own_model(self):
+        cfg = _make_cfg([self.OPENAI_EP, EP1])
+
+        async def available(ep, *_):
+            if ep == self.OPENAI_EP:
+                return {"vllm-only-model"}
+            return {"qwen2.5-coder:1.5b-base"}
+
+        async def loaded(ep):
+            return {"qwen2.5-coder:1.5b-base"} if ep == EP1 else set()
+
+        with (
+            patch.object(router, "config", cfg),
+            patch.object(router.fetch, "available_models", side_effect=available),
+            patch.object(router.fetch, "loaded_models", side_effect=loaded),
+        ):
+            ep, _ = await router.choose_endpoint("vllm-only-model")
+        assert ep == self.OPENAI_EP
+
+
 class TestChooseEndpointLoadBalancing:
     async def test_random_selection_among_idle(self):
         cfg = _make_cfg([EP1, EP2, EP3])