fix: routing error for openai compatible endpoints #128

2026-06-25 14:13:07 +02:00 · 2026-06-25 14:13:07 +02:00 · 1b3ad057a6
commit 1b3ad057a6
parent 4f42f350a3
2 changed files with 64 additions and 3 deletions
--- a/routing.py
+++ b/routing.py
@@ -99,9 +99,19 @@ async def choose_endpoint(model: str, reserve: bool = True,
    llama_eps_extra = [ep for ep in llama_endpoints(config) if ep not in config.endpoints]
    all_endpoints = config.endpoints + llama_eps_extra

-    tag_tasks = [fetch.available_models(ep) for ep in config.endpoints if not is_openai_compatible(ep)]
-    tag_tasks += [fetch.available_models(ep, config.api_keys.get(ep)) for ep in config.endpoints if is_openai_compatible(ep)]
-    tag_tasks += [fetch.available_models(ep, config.api_keys.get(ep)) for ep in llama_eps_extra]
+    # Build the probe tasks in the SAME order as ``all_endpoints`` so the
+    # gathered results stay aligned for the ``zip(all_endpoints, advertised_sets)``
+    # below. (A previous partition into non-OpenAI / OpenAI groups reordered the
+    # tasks relative to ``all_endpoints``, pairing each endpoint with another
+    # endpoint's advertised models — so an OpenAI-compatible backend listed
+    # before an Ollama one would inherit the Ollama model set and get a request
+    # for a model it cannot serve, 404ing. See issue #128.)
+    def _advertised_task(ep: str):
+        if is_openai_compatible(ep):
+            return fetch.available_models(ep, config.api_keys.get(ep))
+        return fetch.available_models(ep)
+
+    tag_tasks = [_advertised_task(ep) for ep in all_endpoints]
    advertised_sets = await asyncio.gather(*tag_tasks)

    # 2️⃣  Filter endpoints that advertise the requested model
--- a/test/test_choose_endpoint.py
+++ b/test/test_choose_endpoint.py
@@ -279,6 +279,57 @@ class TestChooseEndpointModelNaming:
        assert ep == EP1


+class TestChooseEndpointAdvertisedAlignment:
+    """Regression for issue #128 — advertised-model sets must stay aligned with
+    their endpoints regardless of config ordering. A previous implementation
+    partitioned the probe tasks (non-OpenAI first, then OpenAI-compatible) while
+    the endpoint list kept config order, so an OpenAI-compatible endpoint listed
+    *before* an Ollama one inherited the Ollama model set and received requests
+    for models it does not serve (404)."""
+
+    OPENAI_EP = "http://vllm:8000/v1"
+
+    async def test_openai_endpoint_before_ollama_does_not_misroute(self):
+        # OpenAI-compatible endpoint deliberately listed FIRST in config.
+        cfg = _make_cfg([self.OPENAI_EP, EP1])
+
+        async def available(ep, *_):
+            if ep == self.OPENAI_EP:
+                return {"vllm-only-model"}
+            return {"qwen2.5-coder:1.5b-base"}  # only the Ollama box advertises this
+
+        async def loaded(ep):
+            return {"qwen2.5-coder:1.5b-base"} if ep == EP1 else set()
+
+        with (
+            patch.object(router, "config", cfg),
+            patch.object(router.fetch, "available_models", side_effect=available),
+            patch.object(router.fetch, "loaded_models", side_effect=loaded),
+        ):
+            ep, _ = await router.choose_endpoint("qwen2.5-coder:1.5b-base")
+        # Must route to the endpoint that actually advertises the model.
+        assert ep == EP1
+
+    async def test_openai_endpoint_before_ollama_still_routes_its_own_model(self):
+        cfg = _make_cfg([self.OPENAI_EP, EP1])
+
+        async def available(ep, *_):
+            if ep == self.OPENAI_EP:
+                return {"vllm-only-model"}
+            return {"qwen2.5-coder:1.5b-base"}
+
+        async def loaded(ep):
+            return {"qwen2.5-coder:1.5b-base"} if ep == EP1 else set()
+
+        with (
+            patch.object(router, "config", cfg),
+            patch.object(router.fetch, "available_models", side_effect=available),
+            patch.object(router.fetch, "loaded_models", side_effect=loaded),
+        ):
+            ep, _ = await router.choose_endpoint("vllm-only-model")
+        assert ep == self.OPENAI_EP
+
+
 class TestChooseEndpointLoadBalancing:
    async def test_random_selection_among_idle(self):
        cfg = _make_cfg([EP1, EP2, EP3])