From c8da58430a912bb21d16123efc9448aa53d49776 Mon Sep 17 00:00:00 2001
From: alpha nerd <alpha-nerd@nomyo.ai>
Date: Sat, 13 Jun 2026 15:54:46 +0200
Subject: [PATCH] fix: extend endpoint ranking to use total_load AND loaded_count

---
 routing.py                   | 39 +++++++++++++++++++++++++++---------
 test/test_choose_endpoint.py | 31 ++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 9 deletions(-)

diff --git a/routing.py b/routing.py
index 6a0e205..ecf6803 100644
--- a/routing.py
+++ b/routing.py
@@ -206,16 +206,37 @@ async def choose_endpoint(model: str, reserve: bool = True,
             """Sum of in-flight requests across *all* models on the endpoint."""
             return sum(usage_counts.get(ep, {}).values())
 
+        # How many models each candidate currently has *resident* (from the
+        # /api/ps probe). With infinite keep-alive a model stays loaded long
+        # after its in-flight count drops to zero, so this is the signal that
+        # spreads *distinct* models across backends.
+        ep_loaded_counts = {
+            ep: len(models) for ep, models in zip(candidate_endpoints, loaded_sets)
+        }
+
+        def loaded_count(ep: str) -> int:
+            return ep_loaded_counts.get(ep, 0)
+
         def pick_least_loaded(eps: list[str]) -> str:
-            """Pick the endpoint with the lowest total load, breaking ties at
-            random. Using total load (not per-model usage) for both the ranking
-            *and* the tie-break is what keeps a request off a backend already
-            busy with a *different* model — otherwise the per-model count reads
-            zero everywhere and the ranking gets discarded. See issue: a cold
-            model B would land on the backend already serving model A while
-            other backends sat idle."""
-            min_load = min(total_load(ep) for ep in eps)
-            tied = [ep for ep in eps if total_load(ep) == min_load]
+            """Pick the least-committed endpoint, breaking ties at random.
+
+            Ordering key is ``(total_load, loaded_count)``:
+
+            * ``total_load`` (in-flight requests across *all* models) keeps a
+              request off a backend already busy with a *different* model —
+              otherwise the per-model count reads zero everywhere and the
+              ranking is discarded (cold model B landing on the box serving A).
+            * ``loaded_count`` (number of *resident* models) then spreads
+              distinct models across backends. Two different cold models (27b,
+              35b) requested back-to-back must not pile onto the same box: once
+              27b is resident there, that box has loaded_count 1 while the idle
+              backends have 0, so the next cold model prefers an empty backend
+              even though every backend reports zero in-flight load.
+
+            ``random.choice`` only breaks ties that remain on both keys, so a
+            fully idle cluster still places the first cold model uniformly."""
+            best = min((total_load(ep), loaded_count(ep)) for ep in eps)
+            tied = [ep for ep in eps if (total_load(ep), loaded_count(ep)) == best]
             return random.choice(tied)
 
         # Priority map: position in all_endpoints list (lower = higher priority)
diff --git a/test/test_choose_endpoint.py b/test/test_choose_endpoint.py
index a6a7905..be75f82 100644
--- a/test/test_choose_endpoint.py
+++ b/test/test_choose_endpoint.py
@@ -112,6 +112,37 @@ class TestChooseEndpointBasic:
                 assert ep in (EP1, EP2)
                 assert ep != EP3
 
+    async def test_two_cold_models_spread_across_backends(self):
+        # Regression: 3 backends all advertise all models. Two *different*
+        # cold models requested back-to-back must land on *different*
+        # backends. Once model-a is resident on the chosen backend (infinite
+        # keep-alive), its in-flight count drops back to 0 — so only the
+        # resident-model count distinguishes the backends. Without it, the
+        # second cold model could randomly land on the backend holding model-a.
+        cfg = _make_cfg([EP1, EP2, EP3], max_conn=4)
+
+        async def available(ep, *_):
+            return {"model-a:latest", "model-b:latest"}
+
+        # model-a finished loading on EP1 and stays resident; its request has
+        # completed so EP1 has zero in-flight load, same as EP2/EP3.
+        loaded = {EP1: {"model-a:latest"}, EP2: set(), EP3: set()}
+
+        async def loaded_models(ep):
+            return loaded[ep]
+
+        with (
+            patch.object(router, "config", cfg),
+            patch.object(router.fetch, "available_models", side_effect=available),
+            patch.object(router.fetch, "loaded_models", side_effect=loaded_models),
+        ):
+            # A cold model-b must avoid EP1 (which already holds model-a) and
+            # go to one of the empty backends, every time.
+            for _ in range(50):
+                ep, _ = await router.choose_endpoint("model-b:latest", reserve=False)
+                assert ep in (EP2, EP3)
+                assert ep != EP1
+
     async def test_saturated_picks_least_busy(self):
         cfg = _make_cfg([EP1, EP2])
         cfg.max_concurrent_connections = 1