From c8da58430a912bb21d16123efc9448aa53d49776 Mon Sep 17 00:00:00 2001 From: alpha nerd Date: Sat, 13 Jun 2026 15:54:46 +0200 Subject: [PATCH] fix: logic extend on total_load AND loaded_count --- routing.py | 39 +++++++++++++++++++++++++++--------- test/test_choose_endpoint.py | 31 ++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 9 deletions(-) diff --git a/routing.py b/routing.py index 6a0e205..ecf6803 100644 --- a/routing.py +++ b/routing.py @@ -206,16 +206,37 @@ async def choose_endpoint(model: str, reserve: bool = True, """Sum of in-flight requests across *all* models on the endpoint.""" return sum(usage_counts.get(ep, {}).values()) + # How many models each candidate currently has *resident* (from the + # /api/ps probe). With infinite keep-alive a model stays loaded long + # after its in-flight count drops to zero, so this is the signal that + # spreads *distinct* models across backends. + ep_loaded_counts = { + ep: len(models) for ep, models in zip(candidate_endpoints, loaded_sets) + } + + def loaded_count(ep: str) -> int: + return ep_loaded_counts.get(ep, 0) + def pick_least_loaded(eps: list[str]) -> str: - """Pick the endpoint with the lowest total load, breaking ties at - random. Using total load (not per-model usage) for both the ranking - *and* the tie-break is what keeps a request off a backend already - busy with a *different* model — otherwise the per-model count reads - zero everywhere and the ranking gets discarded. See issue: a cold - model B would land on the backend already serving model A while - other backends sat idle.""" - min_load = min(total_load(ep) for ep in eps) - tied = [ep for ep in eps if total_load(ep) == min_load] + """Pick the least-committed endpoint, breaking ties at random. 
+ + Ordering key is ``(total_load, loaded_count)``: + + * ``total_load`` (in-flight requests across *all* models) keeps a + request off a backend already busy with a *different* model — + otherwise the per-model count reads zero everywhere and the + ranking is discarded (cold model B landing on the box serving A). + * ``loaded_count`` (number of *resident* models) then spreads + distinct models across backends. Two different cold models (27b, + 35b) requested back-to-back must not pile onto the same box: once + 27b is resident there, that box has loaded_count 1 while the idle + backends have 0, so the next cold model prefers an empty backend + even though every backend reports zero in-flight load. + + ``random.choice`` only breaks genuine ties on both keys, so a single + idle cluster still distributes the very first cold model evenly.""" + best = min((total_load(ep), loaded_count(ep)) for ep in eps) + tied = [ep for ep in eps if (total_load(ep), loaded_count(ep)) == best] return random.choice(tied) # Priority map: position in all_endpoints list (lower = higher priority) diff --git a/test/test_choose_endpoint.py b/test/test_choose_endpoint.py index a6a7905..be75f82 100644 --- a/test/test_choose_endpoint.py +++ b/test/test_choose_endpoint.py @@ -112,6 +112,37 @@ class TestChooseEndpointBasic: assert ep in (EP1, EP2) assert ep != EP3 + async def test_two_cold_models_spread_across_backends(self): + # Regression: 3 backends all advertise all models. Two *different* + # cold models requested back-to-back must land on *different* + # backends. Once model-a is resident on the chosen backend (infinite + # keep-alive), its in-flight count drops back to 0 — so only the + # resident-model count distinguishes the backends. Without it, the + # second cold model would randomly re-collide on the idle backend that already holds model-a. 
+ cfg = _make_cfg([EP1, EP2, EP3], max_conn=4) + + async def available(ep, *_): + return {"model-a:latest", "model-b:latest"} + + # model-a finished loading on EP1 and stays resident; its request has + # completed so EP1 has zero in-flight load, same as EP2/EP3. + loaded = {EP1: {"model-a:latest"}, EP2: set(), EP3: set()} + + async def loaded_models(ep): + return loaded[ep] + + with ( + patch.object(router, "config", cfg), + patch.object(router.fetch, "available_models", side_effect=available), + patch.object(router.fetch, "loaded_models", side_effect=loaded_models), + ): + # A cold model-b must avoid EP1 (which already holds model-a) and + # go to one of the empty backends, every time. + for _ in range(50): + ep, _ = await router.choose_endpoint("model-b:latest", reserve=False) + assert ep in (EP2, EP3) + assert ep != EP1 + async def test_saturated_picks_least_busy(self): cfg = _make_cfg([EP1, EP2]) cfg.max_concurrent_connections = 1