fix: extend endpoint-ranking logic to use total_load AND loaded_count

This commit is contained in:
Alpha Nerd 2026-06-13 15:54:46 +02:00
parent 5184123fd2
commit c8da58430a
Signed by: alpha-nerd
SSH key fingerprint: SHA256:QkkAgVoYi9TQ0UKPkiKSfnerZy2h4qhi3SVPXJmBN+M
2 changed files with 61 additions and 9 deletions

View file

@ -206,16 +206,37 @@ async def choose_endpoint(model: str, reserve: bool = True,
"""Sum of in-flight requests across *all* models on the endpoint."""
return sum(usage_counts.get(ep, {}).values())
# Resident-model tally per candidate endpoint, taken from the /api/ps
# probe results in ``loaded_sets`` (paired positionally with
# ``candidate_endpoints``). Under infinite keep-alive a model remains
# loaded well after its in-flight count returns to zero, so this tally is
# the signal used to spread *distinct* models across backends.
ep_loaded_counts = dict(zip(candidate_endpoints, map(len, loaded_sets)))

def loaded_count(ep: str) -> int:
    """Return the number of models resident on *ep*, or 0 if *ep* has no entry."""
    return ep_loaded_counts.get(ep, 0)
def pick_least_loaded(eps: list[str]) -> str:
    """Pick the least-committed endpoint, breaking ties at random.

    Ordering key is ``(total_load, loaded_count)``:

    * ``total_load`` (in-flight requests across *all* models) keeps a
      request off a backend already busy with a *different* model —
      otherwise the per-model count reads zero everywhere and the
      ranking is discarded (cold model B landing on the box serving A
      while other backends sit idle).
    * ``loaded_count`` (number of *resident* models) then spreads
      distinct models across backends. Two different cold models (27b,
      35b) requested back-to-back must not pile onto the same box: once
      27b is resident there, that box has loaded_count 1 while the idle
      backends have 0, so the next cold model prefers an empty backend
      even though every backend reports zero in-flight load.

    ``random.choice`` only breaks genuine ties on both keys, so a single
    idle cluster still distributes the very first cold model evenly.

    Args:
        eps: Candidate endpoints to rank; must be non-empty.

    Returns:
        One endpoint whose ``(total_load, loaded_count)`` key is minimal.

    Raises:
        ValueError: If *eps* is empty (propagated from ``min``).
    """
    # Evaluate each endpoint's key exactly once so the ranking and the
    # tie filter compare the same values.
    keys = {ep: (total_load(ep), loaded_count(ep)) for ep in eps}
    best = min(keys.values())
    # Iterate ``eps`` (not ``keys``) so the candidate order and any
    # duplicates are passed to ``random.choice`` unchanged.
    tied = [ep for ep in eps if keys[ep] == best]
    return random.choice(tied)
# Priority map: position in all_endpoints list (lower = higher priority)

View file

@ -112,6 +112,37 @@ class TestChooseEndpointBasic:
assert ep in (EP1, EP2)
assert ep != EP3
async def test_two_cold_models_spread_across_backends(self):
    """Two different cold models must not end up on the same backend.

    Regression scenario: three backends each advertise every model.
    model-a is already resident on EP1 (infinite keep-alive) but its
    request has finished, so EP1 shows zero in-flight load just like
    EP2 and EP3. Only the resident-model count tells the backends apart;
    without it, a cold model-b could randomly collide with model-a on EP1.
    """
    three_backend_cfg = _make_cfg([EP1, EP2, EP3], max_conn=4)

    async def advertise_everything(ep, *_):
        # Every backend offers both models.
        return {"model-a:latest", "model-b:latest"}

    # EP1 still holds model-a; the other two backends hold nothing.
    resident_by_endpoint = {
        EP1: {"model-a:latest"},
        EP2: set(),
        EP3: set(),
    }

    async def resident_models(ep):
        return resident_by_endpoint[ep]

    config_patch = patch.object(router, "config", three_backend_cfg)
    available_patch = patch.object(
        router.fetch, "available_models", side_effect=advertise_everything
    )
    loaded_patch = patch.object(
        router.fetch, "loaded_models", side_effect=resident_models
    )

    with config_patch, available_patch, loaded_patch:
        # Repeat to rule out a lucky random pick: model-b has to avoid
        # EP1 and land on an empty backend on every single attempt.
        for _attempt in range(50):
            chosen, _unused = await router.choose_endpoint(
                "model-b:latest", reserve=False
            )
            assert chosen in (EP2, EP3)
            assert chosen != EP1
async def test_saturated_picks_least_busy(self):
cfg = _make_cfg([EP1, EP2])
cfg.max_concurrent_connections = 1