From 5ad5bfe66ef25095a21274978a7a142cd1e319c4 Mon Sep 17 00:00:00 2001
From: alpha-nerd-nomyo
Date: Sun, 18 Jan 2026 09:31:53 +0100
Subject: [PATCH] feat: endpoint selection more consistent and understandable

---
 router.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/router.py b/router.py
index 9922c41..0a2ba57 100644
--- a/router.py
+++ b/router.py
@@ -980,8 +980,15 @@ async def choose_endpoint(model: str) -> str:
     ]
 
     if loaded_and_free:
-        ep = min(loaded_and_free, key=current_usage)
-        return ep
+        # Sort by total endpoint usage first (prefer idle endpoints)
+        # Then by per-model usage (balance load for this specific model)
+        loaded_and_free.sort(
+            key=lambda ep: (
+                sum(usage_counts.get(ep, {}).values()),  # Primary: total endpoint usage
+                usage_counts.get(ep, {}).get(model, 0)  # Secondary: per-model usage
+            )
+        )
+        return loaded_and_free[0]
 
     # 4️⃣ Endpoints among the candidates that simply have a free slot
     endpoints_with_free_slot = [
@@ -990,8 +997,14 @@ async def choose_endpoint(model: str) -> str:
     ]
 
     if endpoints_with_free_slot:
-        #return random.choice(endpoints_with_free_slot)
-        endpoints_with_free_slot.sort(key=lambda ep: sum(usage_counts.get(ep, {}).values()))
+        # Sort by total endpoint usage first (prefer idle endpoints)
+        # Then by per-model usage (balance load for this specific model)
+        endpoints_with_free_slot.sort(
+            key=lambda ep: (
+                sum(usage_counts.get(ep, {}).values()),  # Primary: total endpoint usage
+                usage_counts.get(ep, {}).get(model, 0)  # Secondary: per-model usage
+            )
+        )
         return endpoints_with_free_slot[0]
 
     # 5️⃣ All candidate endpoints are saturated – pick one with lowest usages count (will queue)