From 3e3f0dd383b4e6273fbaf7387565521f8f300cb8 Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: Mon, 19 Jan 2026 14:21:08 +0100 Subject: [PATCH] fix: endpoint selection logic --- router.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/router.py b/router.py index 0a2ba57..5bb1eaa 100644 --- a/router.py +++ b/router.py @@ -980,13 +980,11 @@ async def choose_endpoint(model: str) -> str: ] if loaded_and_free: - # Sort by total endpoint usage first (prefer idle endpoints) - # Then by per-model usage (balance load for this specific model) + # Sort by per-model usage in DESCENDING order to ensure model affinity + # Endpoints with higher usage (already handling this model) should be preferred + # until they reach max_concurrent_connections loaded_and_free.sort( - key=lambda ep: ( - sum(usage_counts.get(ep, {}).values()), # Primary: total endpoint usage - usage_counts.get(ep, {}).get(model, 0) # Secondary: per-model usage - ) + key=lambda ep: -usage_counts.get(ep, {}).get(model, 0) # Negative for descending order ) return loaded_and_free[0] @@ -997,12 +995,14 @@ async def choose_endpoint(model: str) -> str: ] if endpoints_with_free_slot: - # Sort by total endpoint usage first (prefer idle endpoints) - # Then by per-model usage (balance load for this specific model) + # Sort by per-model usage (descending) first to ensure model affinity + # Even if the model isn't showing as "loaded" in /api/ps yet (e.g., during initial loading), + # we want to send subsequent requests to the endpoint that already has connections for this model + # Then by total endpoint usage (ascending) to balance idle endpoints endpoints_with_free_slot.sort( key=lambda ep: ( - sum(usage_counts.get(ep, {}).values()), # Primary: total endpoint usage - usage_counts.get(ep, {}).get(model, 0) # Secondary: per-model usage + -usage_counts.get(ep, {}).get(model, 0), # Primary: per-model usage (descending - prefer endpoints with connections) + sum(usage_counts.get(ep, {}).values()) # Secondary: total endpoint usage (ascending - prefer idle endpoints) ) ) return endpoints_with_free_slot[0]