nomyo-router/state.py

107 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Shared mutable router state.

All process-wide caches, locks, in-flight task maps, queues, counters and
buffers used by the router live here. These names are only ever *mutated*
(dict/set updates, lock acquisitions, queue put/get) — never rebound — so
importing them via ``from state import …`` is safe from every module.

Rebound singletons (``config``, ``db``, ``token_worker_task``,
``flush_task``) intentionally stay in router.py so their reassignment on
startup is visible to all callers.
"""
import asyncio
from collections import defaultdict
from typing import Dict, Set
# ------------------------------------------------------------------
# In-memory caches
# ------------------------------------------------------------------
# Lifetime of a recorded completion-path failure.
# NOTE(review): presumably seconds, like the 300s success TTL - confirm.
_COMPLETION_ERROR_TTL: int = 300

# Successful results are cached for 300s.
_models_cache: dict[str, tuple[set[str], float]] = {}
_loaded_models_cache: dict[str, tuple[set[str], float]] = {}

# Transient errors get one cache per concern, so that a failure on one
# path does not poison the other.
_available_error_cache: dict[str, float] = {}
_loaded_error_cache: dict[str, float] = {}

# Completion-path failures, keyed by (endpoint, model).
# A llama-server in router mode can keep answering /v1/models with 200 OK
# after its delegated worker for one specific model has died, which the
# probe-level caches above will not catch. Signals observed during actual
# completion attempts are recorded here so choose_endpoint can avoid the
# affected (endpoint, model) pair without poisoning unrelated models on
# the same backend.
_completion_error_cache: dict[tuple[str, str], float] = {}
# ------------------------------------------------------------------
# Cache locks
# ------------------------------------------------------------------
# One independent asyncio.Lock per cache; each is named after its cache.
_models_cache_lock: asyncio.Lock = asyncio.Lock()
_loaded_models_cache_lock: asyncio.Lock = asyncio.Lock()
_available_error_cache_lock: asyncio.Lock = asyncio.Lock()
_loaded_error_cache_lock: asyncio.Lock = asyncio.Lock()
_completion_error_cache_lock: asyncio.Lock = asyncio.Lock()
# ------------------------------------------------------------------
# In-flight request tracking (prevents cache stampede)
# ------------------------------------------------------------------
# Tasks currently in flight, one map per kind of model listing.
# NOTE(review): _inflight_lock presumably guards both maps - confirm.
_inflight_lock: asyncio.Lock = asyncio.Lock()
_inflight_available_models: dict[str, asyncio.Task] = {}
_inflight_loaded_models: dict[str, asyncio.Task] = {}

# Background refresh tasks, tracked the same way.
# NOTE(review): _bg_refresh_lock presumably guards both maps - confirm.
_bg_refresh_lock: asyncio.Lock = asyncio.Lock()
_bg_refresh_available: dict[str, asyncio.Task] = {}
_bg_refresh_loaded: dict[str, asyncio.Task] = {}
# ------------------------------------------------------------------
# Queues
# ------------------------------------------------------------------
# Set of subscriber queues plus the lock that accompanies it.
_subscribers_lock: asyncio.Lock = asyncio.Lock()
_subscribers: set[asyncio.Queue] = set()

# Unbounded queue of (str, str, int, int) items.
# NOTE(review): presumably (endpoint, model, input_tokens, output_tokens)
# - confirm against the producer.
token_queue: asyncio.Queue[tuple[str, str, int, int]] = asyncio.Queue()
# ------------------------------------------------------------------
# HTTP client / connector cache
# ------------------------------------------------------------------
# "session" and "connector" start out as None and the two per-endpoint
# maps start out empty.
app_state = {
    "session": None,
    "connector": None,
    # endpoint -> aiohttp.ClientSession(UnixConnector) for .sock endpoints
    "socket_sessions": {},
    # endpoint -> httpx.AsyncClient(UDS transport) for .sock endpoints
    "httpx_clients": {},
}

# Default outbound HTTP headers attached to every backend request.
default_headers: dict[str, str] = {
    "HTTP-Referer": "https://nomyo.ai",
    "Referer": "https://nomyo.ai",
    "X-Title": "NOMYO Router",
}
# ------------------------------------------------------------------
# Token Count Buffer (for write-behind pattern)
# ------------------------------------------------------------------
# Structure: {endpoint: {model: (input_tokens, output_tokens)}}
token_buffer: dict[str, dict[str, tuple[int, int]]] = defaultdict(lambda: defaultdict(lambda: (0, 0)))
# Time series buffer with timestamp
time_series_buffer: list[dict[str, int | str]] = []
# Lock to protect buffer access from race conditions
buffer_lock = asyncio.Lock()
# Configuration for periodic flushing
FLUSH_INTERVAL = 10 # seconds
# ------------------------------------------------------------------
# Per-endpoint per-model active connection counters
# ------------------------------------------------------------------
# Both maps are nested defaultdicts: map[endpoint][model] starts at 0 the
# first time it is looked up.
usage_lock: asyncio.Lock = asyncio.Lock()  # protects access to usage_counts
usage_counts: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))

# NOTE(review): token_usage_lock presumably protects token_usage_counts
# in the same way - confirm.
token_usage_lock: asyncio.Lock = asyncio.Lock()
token_usage_counts: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
# Size limit for the affinity map.
# NOTE(review): presumably enforced by the code that inserts entries - confirm.
_AFFINITY_MAX_ENTRIES: int = 10000

# Conversation affinity map: fingerprint -> (endpoint, model, expires_at_monotonic).
# Pins a conversation to the endpoint whose KV-cache prefix is already warm
# for it. The model is kept in the value so the dashboard can aggregate
# live entries per (endpoint, model) without recomputing fingerprints.
_affinity_map: dict[str, tuple[str, str, float]] = {}

# Never held together with usage_lock.
_affinity_lock: asyncio.Lock = asyncio.Lock()