# nomyo-router/config.yaml

# config.yaml
# Backend endpoints that requests are routed to. When priority_routing is
# enabled, the order of this list is the tiebreaker between equally loaded
# endpoints (first listed wins).
endpoints:
  - "http://192.168.0.50:11434"
  - "http://192.168.0.51:11434"
  - "http://192.168.0.52:11434"
  - "https://api.openai.com/v1"

# Endpoints listed separately from `endpoints` — presumably llama.cpp
# llama-server instances addressed by their /v1 base URL; confirm against the
# router documentation.
llama_server_endpoints:
  - "http://192.168.0.50:8889/v1"

# Maximum number of concurrent connections *per endpoint-model pair*
# (should equal the backend's OLLAMA_NUM_PARALLEL setting).
# This is the global default; individual endpoints can override it via endpoint_config below.
max_concurrent_connections: 2

# Per-endpoint overrides (optional). Any field not listed falls back to the global default.
# endpoint_config:
#   "http://192.168.0.50:11434":
#     max_concurrent_connections: 3
#   "http://192.168.0.51:11434":
#     max_concurrent_connections: 1

# Priority / WRR routing (optional, default: false).
# When true, requests are routed by utilization ratio (usage/max_concurrent_connections)
# and the config order of endpoints acts as the tiebreaker — the first endpoint listed
# is preferred when two endpoints are equally loaded.
# When false (default), equally-idle endpoints are chosen at random.
# priority_routing: true

# Conversation affinity (optional, default: false).
# Pins a conversation to the endpoint that served its first turn so the
# llama.cpp / Ollama prompt cache (KV cache) stays warm — first turn pays
# the cold prefill, every follow-up turn reuses the same prefix.
#
# Fingerprint = sha1(model + leading system messages + first user turn).
# Same chat → same fingerprint on every follow-up turn → same pin, TTL
# refreshed on each reuse. Soft preference: if the pinned endpoint no
# longer has the model loaded or has no free slot, the standard algorithm
# takes over (no failure, just a cache miss).
#
# Heads-up: most chat UIs (Open WebUI, LibreChat, …) fire side requests for
# title / tag / follow-up generation. Those have their own first turn and
# therefore their own pin, so a single visible "chat" may show several dots
# in the dashboard's Affinity column. That is correct — each pin matches a
# real warm KV prefix on the backend. See doc/configuration.md for details.
# Explicitly enabled here (the default is false).
conversation_affinity: true
# Seconds of inactivity before a pin expires; the timer is bumped on every
# reuse. 300 matches Ollama's default keep_alive.
conversation_affinity_ttl: 300

# Optional router-level API key that gates access to the router, its API and
# the web UI. Leave it empty ("") to disable the check.
# NOTE(review): unlike the snake_case keys elsewhere in this file, this key is
# kebab-case — presumably the exact name the router expects; confirm before
# renaming.
nomyo-router-api-key: ""

# API keys for the configured endpoints, keyed by endpoint URL.
# Each key must match the URL exactly as it is written in the endpoints block
# (the llama_server_endpoints URL is keyed here the same way).
# To keep a secret out of this file, set an environment variable such as
# OPENAI_KEY and reference it as "${OPENAI_KEY}", as in the OpenAI entry below.
api_keys:
  "http://192.168.0.50:11434": "ollama"
  "http://192.168.0.51:11434": "ollama"
  "http://192.168.0.52:11434": "ollama"
  "https://api.openai.com/v1": "${OPENAI_KEY}"
  "http://192.168.0.50:8889/v1": "llama"

# -------------------------------------------------------------
# Semantic LLM Cache (optional — disabled by default)
# Caches LLM responses to cut costs and latency on repeated or
# semantically similar prompts.
# Cached routes: /api/chat  /api/generate  /v1/chat/completions  /v1/completions
# MOE requests (moe-* model prefix) always bypass the cache.
# -------------------------------------------------------------
# cache_enabled: true

# Backend — where cached responses are stored:
#   memory  → in-process LRU (lost on restart, not shared across replicas) [default]
#   sqlite  → persistent file-based   (single instance, survives restart)
#   redis   → distributed             (shared across replicas, requires Redis)
# cache_backend: memory

# Cosine similarity threshold for a cache hit:
#   1.0  → exact match only  (works on any image variant)
#   <1.0 → semantic matching (requires the :semantic Docker image tag)
# cache_similarity: 0.9

# Response TTL in seconds. Remove the key or set to null to cache forever.
# cache_ttl: 3600

# SQLite backend: path to the cache database file
# cache_db_path: llm_cache.db

# Redis backend: connection URL
# cache_redis_url: redis://localhost:6379/0

# Weight given to the BM25-weighted chat-history embedding relative to the
# last-user-message embedding.
# 0.3 = 30% history-context signal, 70% question (last user message) signal.
# Only relevant when cache_similarity < 1.0.
# cache_history_weight: 0.3