Add files via upload

Improve endpoint selection under high load by moving from the round-robin (rr) to the least-connections (lc) algorithm for connections > max_concurrent_connections.
This commit is contained in:
Alpha Nerd 2025-09-01 09:30:23 +02:00 committed by GitHub
parent 64549b4e1c
commit 9f19350f55
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -71,7 +71,7 @@ async def fetch_available_models(endpoint: str) -> Set[str]:
set is returned.
"""
try:
async with httpx.AsyncClient(timeout=5.0) as client:
async with httpx.AsyncClient(timeout=2.5) as client:
if "/v1" in endpoint:
resp = await client.get(f"{endpoint}/models")
else:
@ -85,8 +85,9 @@ async def fetch_available_models(endpoint: str) -> Set[str]:
else:
models = {m.get("name") for m in data.get("models", []) if m.get("name")}
return models
except Exception:
except Exception as e:
# Treat any error as if the endpoint offers no models
print(e)
return set()
async def fetch_loaded_models(endpoint: str) -> Set[str]:
@ -206,14 +207,19 @@ async def choose_endpoint(model: str) -> str:
loaded_sets = await asyncio.gather(*load_tasks)
async with usage_lock:
# Helper: get current usage count for (endpoint, model)
def current_usage(ep: str) -> int:
return usage_counts.get(ep, {}).get(model, 0)
# 3⃣ Endpoints that have the model loaded *and* a free slot
loaded_and_free = [
ep for ep, models in zip(candidate_endpoints, loaded_sets)
if model in models and usage_counts[ep].get(model, 0) < config.max_concurrent_connections
]
if loaded_and_free:
return random.choice(loaded_and_free)
ep = min(loaded_and_free, key=current_usage)
return ep
# 4⃣ Endpoints among the candidates that simply have a free slot
endpoints_with_free_slot = [
@ -222,10 +228,12 @@ async def choose_endpoint(model: str) -> str:
]
if endpoints_with_free_slot:
return random.choice(endpoints_with_free_slot)
ep = min(endpoints_with_free_slot, key=current_usage)
return ep
# 5⃣ All candidate endpoints are saturated – pick any (will queue)
return random.choice(candidate_endpoints)
ep = min(candidate_endpoints, key=current_usage)
return ep
# -------------------------------------------------------------
# 6. API route – Generate