From 07751ddd3b70ca58fbcc9b79fcf55282acd32838 Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: Thu, 19 Feb 2026 10:11:53 +0100 Subject: [PATCH] fix: endpoint selection logic again --- router.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/router.py b/router.py index e1d42bd..5676bb9 100644 --- a/router.py +++ b/router.py @@ -1536,11 +1536,11 @@ async def choose_endpoint(model: str) -> str: ] if loaded_and_free: - # Sort by per-model usage in DESCENDING order to ensure model affinity - # Endpoints with higher usage (already handling this model) should be preferred - # until they reach max_concurrent_connections + # Sort by per-model usage in ASCENDING order for load balancing. + # All endpoints in this set already have the model loaded, so there is + # no model-switching cost to avoid — prefer the least-busy endpoint. loaded_and_free.sort( - key=lambda ep: -usage_counts.get(ep, {}).get(model, 0) # Negative for descending order + key=lambda ep: usage_counts.get(ep, {}).get(model, 0) ) # If all endpoints have zero usage for this model, randomize to distribute