diff --git a/config.py b/config.py
new file mode 100644
index 0000000..e3b5ee6
--- /dev/null
+++ b/config.py
@@ -0,0 +1,148 @@
+"""Router configuration loader.
+
+Pydantic ``BaseSettings`` model populated from YAML (path resolved via
+``_config_path_from_env``) with ``${VAR}`` expansion, plus env-var overrides
+under the ``NOMYO_ROUTER_`` prefix.
+"""
+import os
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import yaml
+from pydantic import Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+# Matches a string that is exactly one ``${VAR}`` reference and nothing else.
+_ENV_REF_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
+
+# YAML spellings accepted for the router API key, highest priority first.
+_ROUTER_API_KEY_ALIASES = (
+    # canonical field name
+    "router_api_key",
+    # lowercase, hyphen/underscore variants
+    "nomyo-router-api-key",
+    "nomyo_router_api_key",
+    "nomyo-router_api_key",
+    "nomyo_router-api_key",
+    # uppercase env-style variants
+    "NOMYO-ROUTER_API_KEY",
+    "NOMYO_ROUTER_API_KEY",
+)
+
+
+class Config(BaseSettings):
+    """Router settings, read from YAML and ``NOMYO_ROUTER_``-prefixed env vars."""
+
+    # List of Ollama endpoints
+    endpoints: list[str] = Field(
+        default_factory=lambda: [
+            "http://localhost:11434",
+        ]
+    )
+    # List of llama-server endpoints (OpenAI-compatible with /v1/models status info)
+    llama_server_endpoints: List[str] = Field(default_factory=list)
+    # Max concurrent connections per endpoint‑model pair, see OLLAMA_NUM_PARALLEL
+    max_concurrent_connections: int = 1
+    # Per-endpoint overrides: {endpoint_url: {max_concurrent_connections: N}}
+    endpoint_config: Dict[str, Dict] = Field(default_factory=dict)
+    # When True, config order = priority; routes by utilization ratio + config index (WRR)
+    priority_routing: bool = Field(default=False)
+
+    # Conversation affinity: route the same conversation back to the endpoint that
+    # previously served it, to keep the llama.cpp / Ollama prompt cache (KV cache) warm.
+    # Soft preference — falls back to the standard algorithm when the affine endpoint
+    # is saturated or no longer has the model loaded.
+    conversation_affinity: bool = Field(default=False)
+    # TTL (seconds) for affinity entries. Defaults to Ollama's default keep_alive (5 min):
+    # if the backend has already evicted the model, the KV cache is cold anyway.
+    conversation_affinity_ttl: int = Field(default=300)
+
+    api_keys: Dict[str, str] = Field(default_factory=dict)
+    # Optional router-level API key used to gate access to this service and dashboard.
+    # NOMYO_ROUTER_API_KEY is not prefix + field name (NOMYO_ROUTER_ROUTER_API_KEY),
+    # so from_yaml() reads that variable explicitly.
+    router_api_key: Optional[str] = Field(default=None)
+
+    # Database configuration
+    db_path: str = Field(default=os.getenv("NOMYO_ROUTER_DB_PATH", "token_counts.db"))
+
+    # Semantic LLM Cache configuration
+    cache_enabled: bool = Field(default=False)
+    # Backend: "memory" (default, in-process), "sqlite" (persistent), "redis" (distributed)
+    cache_backend: str = Field(default="memory")
+    # Cosine similarity threshold: 1.0 = exact match only, <1.0 = semantic (requires :semantic image)
+    cache_similarity: float = Field(default=1.0)
+    # TTL in seconds; None = cache forever
+    cache_ttl: Optional[int] = Field(default=3600)
+    # SQLite backend: path to cache database file
+    cache_db_path: str = Field(default="llm_cache.db")
+    # Redis backend: connection URL
+    cache_redis_url: str = Field(default="redis://localhost:6379/0")
+    # Weight of BM25-weighted chat-history embedding vs last-user-message embedding
+    # 0.3 = 30% history context signal, 70% question signal
+    cache_history_weight: float = Field(default=0.3)
+
+    # YAML loading is handled manually via Config.from_yaml(); env vars use this prefix.
+    model_config = SettingsConfigDict(env_prefix="NOMYO_ROUTER_")
+
+    @classmethod
+    def _expand_env_refs(cls, obj: Any) -> Any:
+        """Recursively replace `${VAR}` with os.getenv('VAR').
+
+        Only strings that are exactly one ``${VAR}`` reference are replaced; an
+        unset variable becomes ``""``. Dicts and lists are rebuilt, other values
+        are returned unchanged.
+        """
+        if isinstance(obj, dict):
+            return {k: cls._expand_env_refs(v) for k, v in obj.items()}
+        if isinstance(obj, list):
+            return [cls._expand_env_refs(v) for v in obj]
+        if isinstance(obj, str):
+            m = _ENV_REF_RE.fullmatch(obj)
+            if m:
+                return os.getenv(m.group(1), "")
+        return obj
+
+    @classmethod
+    def from_yaml(cls, path: Path) -> "Config":
+        """Load the YAML file at *path* and create the Config instance.
+
+        A missing file is treated as empty configuration. The router API key may
+        be spelled as any entry of ``_ROUTER_API_KEY_ALIASES``; the first
+        non-empty one wins and, failing that, ``NOMYO_ROUTER_API_KEY`` from the
+        environment is used.
+
+        Raises:
+            TypeError: if the top-level YAML value is not a mapping.
+        """
+        try:
+            with path.open("r", encoding="utf-8") as fp:
+                data = yaml.safe_load(fp) or {}
+        except FileNotFoundError:
+            data = {}
+        cleaned = cls._expand_env_refs(data)
+        if not isinstance(cleaned, dict):
+            raise TypeError(
+                f"{path}: top-level YAML value must be a mapping, "
+                f"got {type(cleaned).__name__}"
+            )
+        # Pop every spelling so leftover aliases are not passed on as unknown fields.
+        spelled = [cleaned.pop(alias, None) for alias in _ROUTER_API_KEY_ALIASES]
+        # Empty values count as unset, so the env var also covers the no-YAML case.
+        api_key = next((v for v in spelled if v), None) or os.getenv("NOMYO_ROUTER_API_KEY")
+        if api_key:
+            cleaned["router_api_key"] = api_key
+        return cls(**cleaned)
+
+
+def _config_path_from_env() -> Path:
+    """
+    Resolve the configuration file path. Defaults to `config.yaml`
+    in the current working directory unless NOMYO_ROUTER_CONFIG_PATH
+    is set.
+    """
+    candidate = os.getenv("NOMYO_ROUTER_CONFIG_PATH")
+    if candidate:
+        return Path(candidate).expanduser()
+    return Path("config.yaml")
diff --git a/router.py b/router.py
index c5a0336..825bafc 100644
--- a/router.py
+++ b/router.py
@@ -107,118 +107,7 @@ buffer_lock = asyncio.Lock()
 # Configuration for periodic flushing
 FLUSH_INTERVAL = 10 # seconds
 
-# -------------------------------------------------------------
-# 1. Configuration loader
-# -------------------------------------------------------------
-class Config(BaseSettings):
-    # List of Ollama endpoints
-    endpoints: list[str] = Field(
-        default_factory=lambda: [
-            "http://localhost:11434",
-        ]
-    )
-    # List of llama-server endpoints (OpenAI-compatible with /v1/models status info)
-    llama_server_endpoints: List[str] = Field(default_factory=list)
-    # Max concurrent connections per endpoint‑model pair, see OLLAMA_NUM_PARALLEL
-    max_concurrent_connections: int = 1
-    # Per-endpoint overrides: {endpoint_url: {max_concurrent_connections: N}}
-    endpoint_config: Dict[str, Dict] = Field(default_factory=dict)
-    # When True, config order = priority; routes by utilization ratio + config index (WRR)
-    priority_routing: bool = Field(default=False)
-
-    # Conversation affinity: route the same conversation back to the endpoint that
-    # previously served it, to keep the llama.cpp / Ollama prompt cache (KV cache) warm.
-    # Soft preference — falls back to the standard algorithm when the affine endpoint
-    # is saturated or no longer has the model loaded.
-    conversation_affinity: bool = Field(default=False)
-    # TTL (seconds) for affinity entries. Defaults to Ollama's default keep_alive (5 min):
-    # if the backend has already evicted the model, the KV cache is cold anyway.
-    conversation_affinity_ttl: int = Field(default=300)
-
-    api_keys: Dict[str, str] = Field(default_factory=dict)
-    # Optional router-level API key used to gate access to this service and dashboard
-    router_api_key: Optional[str] = Field(default=None, env="NOMYO_ROUTER_API_KEY")
-
-    # Database configuration
-    db_path: str = Field(default=os.getenv("NOMYO_ROUTER_DB_PATH", "token_counts.db"))
-
-    # Semantic LLM Cache configuration
-    cache_enabled: bool = Field(default=False)
-    # Backend: "memory" (default, in-process), "sqlite" (persistent), "redis" (distributed)
-    cache_backend: str = Field(default="memory")
-    # Cosine similarity threshold: 1.0 = exact match only, <1.0 = semantic (requires :semantic image)
-    cache_similarity: float = Field(default=1.0)
-    # TTL in seconds; None = cache forever
-    cache_ttl: Optional[int] = Field(default=3600)
-    # SQLite backend: path to cache database file
-    cache_db_path: str = Field(default="llm_cache.db")
-    # Redis backend: connection URL
-    cache_redis_url: str = Field(default="redis://localhost:6379/0")
-    # Weight of BM25-weighted chat-history embedding vs last-user-message embedding
-    # 0.3 = 30% history context signal, 70% question signal
-    cache_history_weight: float = Field(default=0.3)
-
-    class Config:
-        # YAML loading is handled manually via Config.from_yaml(); env vars use this prefix.
-        env_prefix = "NOMYO_ROUTER_"
-
-    @classmethod
-    def _expand_env_refs(cls, obj):
-        """Recursively replace `${VAR}` with os.getenv('VAR')."""
-        if isinstance(obj, dict):
-            return {k: cls._expand_env_refs(v) for k, v in obj.items()}
-        if isinstance(obj, list):
-            return [cls._expand_env_refs(v) for v in obj]
-        if isinstance(obj, str):
-            # Only expand if it is exactly ${VAR}
-            m = re.fullmatch(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}", obj)
-            if m:
-                return os.getenv(m.group(1), "")
-        return obj
-
-    @classmethod
-    def from_yaml(cls, path: Path) -> "Config":
-        """Load the YAML file and create the Config instance."""
-        if path.exists():
-            with path.open("r", encoding="utf-8") as fp:
-                data = yaml.safe_load(fp) or {}
-                cleaned = cls._expand_env_refs(data)
-            if isinstance(cleaned, dict):
-                # Accept hyphenated config key and map it to the field name
-                key_aliases = [
-                    # canonical field name
-                    "router_api_key",
-                    # lowercase, hyphen/underscore variants
-                    "nomyo-router-api-key",
-                    "nomyo_router_api_key",
-                    "nomyo-router_api_key",
-                    "nomyo_router-api_key",
-                    # uppercase env-style variants
-                    "NOMYO-ROUTER_API_KEY",
-                    "NOMYO_ROUTER_API_KEY",
-                ]
-                for alias in key_aliases:
-                    if alias in cleaned:
-                        cleaned["router_api_key"] = cleaned.get("router_api_key", cleaned.pop(alias))
-                        break
-                # If not present in YAML (or empty), fall back to env var explicitly
-                if not cleaned.get("router_api_key"):
-                    env_key = os.getenv("NOMYO_ROUTER_API_KEY")
-                    if env_key:
-                        cleaned["router_api_key"] = env_key
-            return cls(**cleaned)
-        return cls()
-
-def _config_path_from_env() -> Path:
-    """
-    Resolve the configuration file path. Defaults to `config.yaml`
-    in the current working directory unless NOMYO_ROUTER_CONFIG_PATH
-    is set.
-    """
-    candidate = os.getenv("NOMYO_ROUTER_CONFIG_PATH")
-    if candidate:
-        return Path(candidate).expanduser()
-    return Path("config.yaml")
+from config import Config, _config_path_from_env
 from ollama._types import TokenLogprob, Logprob
 from db import TokenDatabase
 