dev-0.9.x -> main #76

Merged
alpha-nerd merged 15 commits from dev-0.9.x into main 2026-05-15 09:16:11 +02:00
2 changed files with 157 additions and 52 deletions
Showing only changes of commit 27dfc07889 - Show all commits

View file

@ -26,6 +26,16 @@ max_concurrent_connections: 2
# When false (default), equally-idle endpoints are chosen at random. # When false (default), equally-idle endpoints are chosen at random.
# priority_routing: true # priority_routing: true
# Conversation affinity (optional, default: false).
# Routes follow-up requests back to the endpoint that previously served the
# same conversation so the llama.cpp / Ollama prompt cache (KV cache) stays
# warm — first turn does a cold prefill, follow-ups skip it. Soft preference:
# falls back to the standard algorithm when the affine endpoint no longer has
# the model loaded or has no free slot. Conversations are fingerprinted by
# (model, first system + first user turn).
# conversation_affinity: true
# conversation_affinity_ttl: 300 # seconds; matches Ollama's default keep_alive
# Optional router-level API key that gates router/API/web UI access (leave empty to disable) # Optional router-level API key that gates router/API/web UI access (leave empty to disable)
nomyo-router-api-key: "" nomyo-router-api-key: ""

197
router.py
View file

@ -6,7 +6,7 @@ version: 0.7
license: AGPL license: AGPL
""" """
# ------------------------------------------------------------- # -------------------------------------------------------------
import orjson, time, asyncio, yaml, ollama, openai, os, re, aiohttp, ssl, random, base64, io, enhance, secrets, math, socket, httpx import orjson, time, asyncio, yaml, ollama, openai, os, re, aiohttp, ssl, random, base64, io, enhance, secrets, math, socket, httpx, hashlib
try: try:
import truststore; truststore.inject_into_ssl() import truststore; truststore.inject_into_ssl()
except ImportError: except ImportError:
@ -223,6 +223,15 @@ class Config(BaseSettings):
# When True, config order = priority; routes by utilization ratio + config index (WRR) # When True, config order = priority; routes by utilization ratio + config index (WRR)
priority_routing: bool = Field(default=False) priority_routing: bool = Field(default=False)
# Conversation affinity: route the same conversation back to the endpoint that
# previously served it, to keep the llama.cpp / Ollama prompt cache (KV cache) warm.
# Soft preference — falls back to the standard algorithm when the affine endpoint
# is saturated or no longer has the model loaded.
conversation_affinity: bool = Field(default=False)
# TTL (seconds) for affinity entries. Defaults to Ollama's default keep_alive (5 min):
# if the backend has already evicted the model, the KV cache is cold anyway.
conversation_affinity_ttl: int = Field(default=300)
api_keys: Dict[str, str] = Field(default_factory=dict) api_keys: Dict[str, str] = Field(default_factory=dict)
# Optional router-level API key used to gate access to this service and dashboard # Optional router-level API key used to gate access to this service and dashboard
router_api_key: Optional[str] = Field(default=None, env="NOMYO_ROUTER_API_KEY") router_api_key: Optional[str] = Field(default=None, env="NOMYO_ROUTER_API_KEY")
@ -436,6 +445,45 @@ token_usage_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(
usage_lock = asyncio.Lock() # protects access to usage_counts usage_lock = asyncio.Lock() # protects access to usage_counts
token_usage_lock = asyncio.Lock() token_usage_lock = asyncio.Lock()
# Conversation affinity map: fingerprint -> (endpoint, expires_at_monotonic).
# Keeps the same conversation pinned to the endpoint that already has its
# KV-cache prefix warm. Expiry timestamps are time.monotonic() values, so they
# are only meaningful within this process.
_affinity_map: Dict[str, tuple[str, float]] = {}
# Guards every read/write of _affinity_map. Never held together with
# usage_lock, so the two locks cannot be acquired in conflicting order.
_affinity_lock = asyncio.Lock()
# Size threshold that triggers a sweep of *expired* entries when a new entry is
# recorded. NOTE(review): this is a soft limit, not a hard cap — the sweep only
# drops expired entries, so the map can stay above this size while more than
# this many entries are still within their TTL.
_AFFINITY_MAX_ENTRIES = 10000
def _conversation_fingerprint(model: str, messages: Optional[list],
                              prompt: Optional[str]) -> Optional[str]:
    """
    Return a stable fingerprint for the cacheable prefix of a conversation.

    The fingerprint is a SHA-1 hex digest over the model name followed by the
    leading system turn(s) and the first user turn. That prefix determines
    whether the backend's prompt cache is reusable; later turns don't
    influence the routing decision because they extend the same prefix.

    Args:
        model: Model name; an empty value is hashed as ``"_"``.
        messages: Chat-style message list (dicts with ``role``/``content``).
            When non-empty it takes precedence over ``prompt``. ``content``
            may be a string or a list of OpenAI multimodal parts, of which
            only the ``text`` parts are used.
        prompt: Raw completion prompt, used only when ``messages`` is empty.

    Returns:
        A 40-character hex digest, or ``None`` when no usable prefix exists
        (no prompt, or no system/user message with textual content).
        Returning ``None`` here matters: hashing the model name alone would
        give every prefix-less request for that model the same affinity key.
    """
    parts: list[str] = [model or "_"]
    if messages:
        for m in messages:
            if not isinstance(m, dict):
                continue
            role = m.get("role")
            if role not in ("system", "user"):
                continue
            content = m.get("content")
            if isinstance(content, list):  # OpenAI multimodal parts
                # Keep only well-formed text parts; a non-string "text" value
                # from client JSON would otherwise make join() raise TypeError.
                content = "".join(
                    p["text"] for p in content
                    if isinstance(p, dict)
                    and p.get("type") == "text"
                    and isinstance(p.get("text"), str)
                )
            if not isinstance(content, str):
                continue
            parts.append(f"{role}:{content}")
            if role == "user":
                # The first user turn closes the prefix; ignore later turns.
                break
    elif prompt:
        parts.append(f"user:{prompt}")

    # Only the model name collected -> nothing identifies this conversation.
    if len(parts) == 1:
        return None
    return hashlib.sha1("\x1f".join(parts).encode("utf-8", "replace")).hexdigest()
# Database instance # Database instance
db: "TokenDatabase" = None db: "TokenDatabase" = None
@ -1738,7 +1786,8 @@ def get_max_connections(ep: str) -> int:
"max_concurrent_connections", config.max_concurrent_connections "max_concurrent_connections", config.max_concurrent_connections
) )
async def choose_endpoint(model: str, reserve: bool = True) -> tuple[str, str]: async def choose_endpoint(model: str, reserve: bool = True,
affinity_key: Optional[str] = None) -> tuple[str, str]:
""" """
Determine which endpoint to use for the given model while respecting Determine which endpoint to use for the given model while respecting
the `max_concurrent_connections` per endpointmodel pair **and** the `max_concurrent_connections` per endpointmodel pair **and**
@ -1748,6 +1797,10 @@ async def choose_endpoint(model: str, reserve: bool = True) -> tuple[str, str]:
1 Query every endpoint for its advertised models (`/api/tags`). 1 Query every endpoint for its advertised models (`/api/tags`).
2 Build a list of endpoints that contain the requested model. 2 Build a list of endpoints that contain the requested model.
2.5 If conversation affinity is enabled and the caller passes
``affinity_key``, prefer the endpoint that previously served the
same conversation, but only when it still has the model loaded
and a free slot. Otherwise fall through to the standard logic.
3 For those endpoints, find those that have the model loaded 3 For those endpoints, find those that have the model loaded
(`/api/ps`) *and* still have a free slot. (`/api/ps`) *and* still have a free slot.
4 If none are both loaded and free, fall back to any endpoint 4 If none are both loaded and free, fall back to any endpoint
@ -1799,6 +1852,19 @@ async def choose_endpoint(model: str, reserve: bool = True) -> tuple[str, str]:
load_tasks = [fetch.loaded_models(ep) for ep in candidate_endpoints] load_tasks = [fetch.loaded_models(ep) for ep in candidate_endpoints]
loaded_sets = await asyncio.gather(*load_tasks) loaded_sets = await asyncio.gather(*load_tasks)
# Look up a possible affinity hint *before* taking usage_lock. The two
# locks are never held together to avoid lock-ordering issues.
affine_ep: Optional[str] = None
if config.conversation_affinity and affinity_key:
async with _affinity_lock:
entry = _affinity_map.get(affinity_key)
if entry is not None:
ep, expires_at = entry
if expires_at < time.monotonic():
_affinity_map.pop(affinity_key, None)
else:
affine_ep = ep
# Protect all reads/writes of usage_counts with the lock so that selection # Protect all reads/writes of usage_counts with the lock so that selection
# and reservation are atomic — concurrent callers see each other's pending load. # and reservation are atomic — concurrent callers see each other's pending load.
async with usage_lock: async with usage_lock:
@ -1814,59 +1880,75 @@ async def choose_endpoint(model: str, reserve: bool = True) -> tuple[str, str]:
# Priority map: position in all_endpoints list (lower = higher priority) # Priority map: position in all_endpoints list (lower = higher priority)
ep_priority = {ep: i for i, ep in enumerate(all_endpoints)} ep_priority = {ep: i for i, ep in enumerate(all_endpoints)}
# 3⃣ Endpoints that have the model loaded *and* a free slot selected: Optional[str] = None
loaded_and_free = [
ep for ep, models in zip(candidate_endpoints, loaded_sets)
if model in models and tracking_usage(ep) < get_max_connections(ep)
]
if loaded_and_free: # 2⃣.5 Conversation affinity preference — only honour the hint when
if config.priority_routing: # the affine endpoint still advertises the model loaded *and* has a
# WRR: sort by config order first (stable), then by utilization ratio. # free slot. Otherwise fall back to the standard algorithm.
# Stable sort preserves priority for equal-ratio endpoints. if affine_ep:
loaded_and_free.sort(key=lambda ep: ep_priority.get(ep, 999)) ep_loaded = {
loaded_and_free.sort(key=utilization_ratio) ep: set(models)
selected = loaded_and_free[0] for ep, models in zip(candidate_endpoints, loaded_sets)
else: }
# Sort ascending for load balancing — all endpoints here already have the if (affine_ep in candidate_endpoints
# model loaded, so there is no model-switching cost to optimise for. and model in ep_loaded.get(affine_ep, set())
loaded_and_free.sort(key=tracking_usage) and tracking_usage(affine_ep) < get_max_connections(affine_ep)):
# When all candidates are equally idle, randomise to avoid always picking selected = affine_ep
# the first entry in a stable sort.
if all(tracking_usage(ep) == 0 for ep in loaded_and_free): if selected is None:
selected = random.choice(loaded_and_free) # 3⃣ Endpoints that have the model loaded *and* a free slot
else: loaded_and_free = [
selected = loaded_and_free[0] ep for ep, models in zip(candidate_endpoints, loaded_sets)
else: if model in models and tracking_usage(ep) < get_max_connections(ep)
# 4⃣ Endpoints among the candidates that simply have a free slot
endpoints_with_free_slot = [
ep for ep in candidate_endpoints
if tracking_usage(ep) < get_max_connections(ep)
] ]
if endpoints_with_free_slot: if loaded_and_free:
if config.priority_routing: if config.priority_routing:
endpoints_with_free_slot.sort(key=lambda ep: ep_priority.get(ep, 999)) # WRR: sort by config order first (stable), then by utilization ratio.
endpoints_with_free_slot.sort(key=utilization_ratio) # Stable sort preserves priority for equal-ratio endpoints.
selected = endpoints_with_free_slot[0] loaded_and_free.sort(key=lambda ep: ep_priority.get(ep, 999))
loaded_and_free.sort(key=utilization_ratio)
selected = loaded_and_free[0]
else: else:
# Sort by total endpoint load (ascending) to prefer idle endpoints. # Sort ascending for load balancing — all endpoints here already have the
endpoints_with_free_slot.sort( # model loaded, so there is no model-switching cost to optimise for.
key=lambda ep: sum(usage_counts.get(ep, {}).values()) loaded_and_free.sort(key=tracking_usage)
) # When all candidates are equally idle, randomise to avoid always picking
if all(tracking_usage(ep) == 0 for ep in endpoints_with_free_slot): # the first entry in a stable sort.
selected = random.choice(endpoints_with_free_slot) if all(tracking_usage(ep) == 0 for ep in loaded_and_free):
selected = random.choice(loaded_and_free)
else: else:
selected = endpoints_with_free_slot[0] selected = loaded_and_free[0]
else: else:
# 5⃣ All candidate endpoints are saturated pick the least-busy one (will queue) # 4⃣ Endpoints among the candidates that simply have a free slot
if config.priority_routing: endpoints_with_free_slot = [
selected = min( ep for ep in candidate_endpoints
candidate_endpoints, if tracking_usage(ep) < get_max_connections(ep)
key=lambda ep: (utilization_ratio(ep), ep_priority.get(ep, 999)), ]
)
if endpoints_with_free_slot:
if config.priority_routing:
endpoints_with_free_slot.sort(key=lambda ep: ep_priority.get(ep, 999))
endpoints_with_free_slot.sort(key=utilization_ratio)
selected = endpoints_with_free_slot[0]
else:
# Sort by total endpoint load (ascending) to prefer idle endpoints.
endpoints_with_free_slot.sort(
key=lambda ep: sum(usage_counts.get(ep, {}).values())
)
if all(tracking_usage(ep) == 0 for ep in endpoints_with_free_slot):
selected = random.choice(endpoints_with_free_slot)
else:
selected = endpoints_with_free_slot[0]
else: else:
selected = min(candidate_endpoints, key=tracking_usage) # 5⃣ All candidate endpoints are saturated pick the least-busy one (will queue)
if config.priority_routing:
selected = min(
candidate_endpoints,
key=lambda ep: (utilization_ratio(ep), ep_priority.get(ep, 999)),
)
else:
selected = min(candidate_endpoints, key=tracking_usage)
tracking_model = get_tracking_model(selected, model) tracking_model = get_tracking_model(selected, model)
snapshot = None snapshot = None
@ -1875,6 +1957,15 @@ async def choose_endpoint(model: str, reserve: bool = True) -> tuple[str, str]:
snapshot = _capture_snapshot() snapshot = _capture_snapshot()
if snapshot is not None: if snapshot is not None:
await _distribute_snapshot(snapshot) await _distribute_snapshot(snapshot)
# Record / refresh affinity *after* releasing usage_lock.
if reserve and config.conversation_affinity and affinity_key:
expires_at = time.monotonic() + config.conversation_affinity_ttl
async with _affinity_lock:
_affinity_map[affinity_key] = (selected, expires_at)
if len(_affinity_map) > _AFFINITY_MAX_ENTRIES:
now = time.monotonic()
for k in [k for k, v in _affinity_map.items() if v[1] < now]:
_affinity_map.pop(k, None)
return selected, tracking_model return selected, tracking_model
# ------------------------------------------------------------- # -------------------------------------------------------------
@ -1925,7 +2016,8 @@ async def proxy(request: Request):
yield _cached yield _cached
return StreamingResponse(_serve_cached_generate(), media_type="application/json") return StreamingResponse(_serve_cached_generate(), media_type="application/json")
endpoint, tracking_model = await choose_endpoint(model) _affinity_key = _conversation_fingerprint(model, None, prompt)
endpoint, tracking_model = await choose_endpoint(model, affinity_key=_affinity_key)
use_openai = is_openai_compatible(endpoint) use_openai = is_openai_compatible(endpoint)
if use_openai: if use_openai:
if ":latest" in model: if ":latest" in model:
@ -2095,7 +2187,8 @@ async def chat_proxy(request: Request):
opt = True opt = True
else: else:
opt = False opt = False
endpoint, tracking_model = await choose_endpoint(model) _affinity_key = _conversation_fingerprint(model, messages, None)
endpoint, tracking_model = await choose_endpoint(model, affinity_key=_affinity_key)
use_openai = is_openai_compatible(endpoint) use_openai = is_openai_compatible(endpoint)
if use_openai: if use_openai:
if ":latest" in model: if ":latest" in model:
@ -3228,7 +3321,8 @@ async def openai_chat_completions_proxy(request: Request):
return StreamingResponse(_serve_cached_ochat_json(), media_type="application/json") return StreamingResponse(_serve_cached_ochat_json(), media_type="application/json")
# 2. Endpoint logic # 2. Endpoint logic
endpoint, tracking_model = await choose_endpoint(model) _affinity_key = _conversation_fingerprint(model, messages, None)
endpoint, tracking_model = await choose_endpoint(model, affinity_key=_affinity_key)
oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key")) oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
# 3. Helpers and API call — done in handler scope so try/except works reliably # 3. Helpers and API call — done in handler scope so try/except works reliably
async def _normalize_images_in_messages(msgs: list) -> list: async def _normalize_images_in_messages(msgs: list) -> list:
@ -3538,7 +3632,8 @@ async def openai_completions_proxy(request: Request):
return StreamingResponse(_serve_cached_ocompl_json(), media_type="application/json") return StreamingResponse(_serve_cached_ocompl_json(), media_type="application/json")
# 2. Endpoint logic # 2. Endpoint logic
endpoint, tracking_model = await choose_endpoint(model) _affinity_key = _conversation_fingerprint(model, None, prompt)
endpoint, tracking_model = await choose_endpoint(model, affinity_key=_affinity_key)
oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key")) oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
# 3. Async generator that streams completions data and decrements the counter # 3. Async generator that streams completions data and decrements the counter