2026-05-19 11:18:06 +02:00
|
|
|
|
"""Shared mutable router state.
|
|
|
|
|
|
|
|
|
|
|
|
All process-wide caches, locks, in-flight task maps, queues, counters and
|
|
|
|
|
|
buffers used by the router live here. These names are only ever *mutated*
|
|
|
|
|
|
(dict/set updates, lock acquisitions, queue put/get) — never rebound — so
|
|
|
|
|
|
importing them via ``from state import …`` is safe from every module.
|
|
|
|
|
|
|
|
|
|
|
|
Rebound singletons (``config``, ``db``, ``token_worker_task``,
|
|
|
|
|
|
``flush_task``) intentionally stay in router.py so their reassignment on
|
|
|
|
|
|
startup is visible to all callers.
|
|
|
|
|
|
"""
|
|
|
|
|
|
import asyncio
|
|
|
|
|
|
from collections import defaultdict
|
|
|
|
|
|
from typing import Dict, Set
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
|
# In‑memory caches
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
|
# Successful results are cached for 300s
|
|
|
|
|
|
_models_cache: dict[str, tuple[Set[str], float]] = {}
|
|
|
|
|
|
_loaded_models_cache: dict[str, tuple[Set[str], float]] = {}
|
|
|
|
|
|
# Transient errors are cached separately per concern so that a failure
|
|
|
|
|
|
# in one path does not poison the other.
|
|
|
|
|
|
_available_error_cache: dict[str, float] = {}
|
|
|
|
|
|
_loaded_error_cache: dict[str, float] = {}
|
|
|
|
|
|
# Per-(endpoint, model) completion-path failures. A llama-server in router
|
|
|
|
|
|
# mode can keep returning /v1/models 200 OK after its delegated worker for
|
|
|
|
|
|
# a specific model dies — the probe-level caches above will not catch this.
|
|
|
|
|
|
# We record signals observed during actual completion attempts so
|
|
|
|
|
|
# choose_endpoint can avoid the affected (endpoint, model) pair without
|
|
|
|
|
|
# poisoning unrelated models on the same backend.
|
|
|
|
|
|
_completion_error_cache: dict[tuple[str, str], float] = {}
|
|
|
|
|
|
_COMPLETION_ERROR_TTL = 300
|
|
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
|
# Cache locks
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
|
_models_cache_lock = asyncio.Lock()
|
|
|
|
|
|
_loaded_models_cache_lock = asyncio.Lock()
|
|
|
|
|
|
_available_error_cache_lock = asyncio.Lock()
|
|
|
|
|
|
_loaded_error_cache_lock = asyncio.Lock()
|
|
|
|
|
|
_completion_error_cache_lock = asyncio.Lock()
|
|
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
|
# In-flight request tracking (prevents cache stampede)
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
|
_inflight_available_models: dict[str, asyncio.Task] = {}
|
|
|
|
|
|
_inflight_loaded_models: dict[str, asyncio.Task] = {}
|
|
|
|
|
|
_inflight_lock = asyncio.Lock()
|
|
|
|
|
|
_bg_refresh_available: dict[str, asyncio.Task] = {}
|
|
|
|
|
|
_bg_refresh_loaded: dict[str, asyncio.Task] = {}
|
|
|
|
|
|
_bg_refresh_lock = asyncio.Lock()
|
|
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
|
# Queues
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
|
_subscribers: Set[asyncio.Queue] = set()
|
|
|
|
|
|
_subscribers_lock = asyncio.Lock()
|
|
|
|
|
|
token_queue: asyncio.Queue[tuple[str, str, int, int]] = asyncio.Queue()
|
|
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
|
# HTTP client / connector cache
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
|
app_state = {
|
|
|
|
|
|
"session": None,
|
|
|
|
|
|
"connector": None,
|
2026-05-28 09:54:53 +02:00
|
|
|
|
"probe_session": None, # dedicated session for health/introspection probes
|
|
|
|
|
|
"probe_connector": None, # connection pool isolated from proxy traffic
|
2026-05-19 11:18:06 +02:00
|
|
|
|
"socket_sessions": {}, # endpoint -> aiohttp.ClientSession(UnixConnector) for .sock endpoints
|
|
|
|
|
|
"httpx_clients": {}, # endpoint -> httpx.AsyncClient(UDS transport) for .sock endpoints
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-05-19 12:05:51 +02:00
|
|
|
|
# Default outbound HTTP headers attached to every backend request.
|
|
|
|
|
|
default_headers = {
|
|
|
|
|
|
"HTTP-Referer": "https://nomyo.ai",
|
|
|
|
|
|
"Referer": "https://nomyo.ai",
|
|
|
|
|
|
"X-Title": "NOMYO Router",
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-05-19 11:18:06 +02:00
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
|
# Token Count Buffer (for write-behind pattern)
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
|
# Structure: {endpoint: {model: (input_tokens, output_tokens)}}
|
|
|
|
|
|
token_buffer: dict[str, dict[str, tuple[int, int]]] = defaultdict(lambda: defaultdict(lambda: (0, 0)))
|
|
|
|
|
|
# Time series buffer with timestamp
|
|
|
|
|
|
time_series_buffer: list[dict[str, int | str]] = []
|
|
|
|
|
|
# Lock to protect buffer access from race conditions
|
|
|
|
|
|
buffer_lock = asyncio.Lock()
|
|
|
|
|
|
|
|
|
|
|
|
# Configuration for periodic flushing
|
|
|
|
|
|
FLUSH_INTERVAL = 10 # seconds
|
|
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
|
# Per‑endpoint per‑model active connection counters
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
|
usage_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
|
|
|
|
|
|
token_usage_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
|
|
|
|
|
|
usage_lock = asyncio.Lock() # protects access to usage_counts
|
|
|
|
|
|
token_usage_lock = asyncio.Lock()
|
|
|
|
|
|
|
|
|
|
|
|
# Conversation affinity map: fingerprint -> (endpoint, model, expires_at_monotonic).
|
|
|
|
|
|
# Keeps the same conversation pinned to the endpoint that already has its
|
|
|
|
|
|
# KV-cache prefix warm. Model is stored so the dashboard can aggregate live
|
|
|
|
|
|
# entries per (endpoint, model) without recomputing fingerprints.
|
|
|
|
|
|
# Never held together with usage_lock.
|
|
|
|
|
|
_affinity_map: Dict[str, tuple[str, str, float]] = {}
|
|
|
|
|
|
_affinity_lock = asyncio.Lock()
|
|
|
|
|
|
_AFFINITY_MAX_ENTRIES = 10000
|