# config.yaml
endpoints:
- http://192.168.0.50:11434
- http://192.168.0.51:11434
- http://192.168.0.52:11434
- https://api.openai.com/v1
llama_server_endpoints:
- http://192.168.0.50:8889/v1
# Maximum concurrent connections *per endpoint/model pair* (equivalent to OLLAMA_NUM_PARALLEL)
# This is the global default; individual endpoints can override it via endpoint_config below.
max_concurrent_connections: 2
# Per-endpoint overrides (optional). Any field not listed falls back to the global default.
# endpoint_config:
#   "http://192.168.0.50:11434":
#     max_concurrent_connections: 3
#   "http://192.168.0.51:11434":
#     max_concurrent_connections: 1
# Priority / WRR routing (optional, default: false).
# When true, requests are routed by utilization ratio (usage/max_concurrent_connections)
# and the config order of endpoints acts as the tiebreaker — the first endpoint listed
# is preferred when two endpoints are equally loaded.
# When false (default), equally-idle endpoints are chosen at random.
# priority_routing: true
# Optional router-level API key that gates router/API/web UI access (leave empty to disable)
nomyo-router-api-key: ""
# API keys for remote endpoints
# Set an environment variable like OPENAI_KEY
# The endpoint URLs below must exactly match the entries in the endpoints block
api_keys:
  "http://192.168.0.50:11434": "ollama"
  "http://192.168.0.51:11434": "ollama"
  "http://192.168.0.52:11434": "ollama"
  "https://api.openai.com/v1": "${OPENAI_KEY}"
  "http://192.168.0.50:8889/v1": "llama"
# -------------------------------------------------------------
# Semantic LLM Cache (optional — disabled by default)
# Caches LLM responses to cut costs and latency on repeated or
# semantically similar prompts.
# Cached routes: /api/chat /api/generate /v1/chat/completions /v1/completions
# MOE requests (moe-* model prefix) always bypass the cache.
# -------------------------------------------------------------
# cache_enabled: true
# Backend — where cached responses are stored:
# memory → in-process LRU (lost on restart, not shared across replicas) [default]
# sqlite → persistent file-based (single instance, survives restart)
# redis → distributed (shared across replicas, requires Redis)
# cache_backend: memory
# Cosine similarity threshold for a cache hit:
# 1.0 → exact match only (works on any image variant)
# <1.0 → semantic matching (requires the :semantic Docker image tag)
# cache_similarity: 0.9
# Response TTL in seconds. Remove the key or set to null to cache forever.
# cache_ttl: 3600
# SQLite backend: path to the cache database file
# cache_db_path: llm_cache.db
# Redis backend: connection URL
# cache_redis_url: redis://localhost:6379/0
# Weight of the BM25-weighted chat-history embedding vs last-user-message embedding.
# 0.3 = 30% history context signal, 70% question signal.
# Only relevant when cache_similarity < 1.0.
# cache_history_weight: 0.3