---
# config.yaml

# Ollama endpoints: local + remote
endpoints:
  - http://localhost:11434
  - http://192.168.0.51:11434
  - http://192.168.0.52:11434
  # External OpenAI-compatible endpoints (will NOT be queried for /api/ps, /api/ps_details)
  - https://api.openai.com/v1

# llama-server endpoints (OpenAI-compatible with /v1/models status info).
# These endpoints will be queried for /api/tags, /api/ps, /api/ps_details
# and included in the model selection pool for inference routing.
llama_server_endpoints:
  - http://localhost:8080/v1
  - http://192.168.0.33:8081/v1

# Maximum concurrent connections *per endpoint-model pair* (equals OLLAMA_NUM_PARALLEL)
max_concurrent_connections: 2

# Optional router-level API key that gates router/API/web UI access (leave empty to disable)
nomyo-router-api-key: ""

# API keys for remote endpoints.
# Set an environment variable like OPENAI_KEY for the ${OPENAI_KEY} reference below.
# Keys here must match the URLs in the endpoints / llama_server_endpoints blocks exactly.
api_keys:
  # NOTE(review): "http://192.168.0.50:11434" does not appear under `endpoints`
  # (which lists localhost, .51, .52) — confirm whether this key should be
  # "http://localhost:11434" or whether .50 is missing from `endpoints`.
  "http://192.168.0.50:11434": "ollama"
  "http://192.168.0.51:11434": "ollama"
  "http://192.168.0.52:11434": "ollama"
  "https://api.openai.com/v1": "${OPENAI_KEY}"
  # Optional API key for llama-server - depends on llama_server config
  "http://localhost:8080/v1": "llama-server"
  "http://192.168.0.33:8081/v1": "llama-server"

# -------------------------------------------------------------
# Semantic LLM Cache (optional — disabled by default)
# Caches LLM responses to cut costs and latency on repeated or
# semantically similar prompts.
# Cached routes: /api/chat /api/generate /v1/chat/completions /v1/completions
# MOE requests (moe-* model prefix) always bypass the cache.
# -------------------------------------------------------------

# Enable/disable the semantic cache.
# cache_enabled: false

# Backend — where cached responses are stored:
#   memory → in-process LRU (lost on restart, not shared across replicas) [default]
#   sqlite → persistent file-based (single instance, survives restart)
#   redis  → distributed (shared across replicas, requires Redis)
# cache_backend: memory

# Cosine similarity threshold for a cache hit:
#   1.0  → exact match only (works on any image variant)
#   <1.0 → semantic matching (requires the :semantic Docker image tag)
# cache_similarity: 1.0

# Response TTL in seconds. Remove the key or set to null to cache forever.
# cache_ttl: 3600

# SQLite backend: path to the cache database file
# cache_db_path: llm_cache.db

# Redis backend: connection URL
# cache_redis_url: redis://localhost:6379/0

# Weight of the BM25-weighted chat-history embedding vs the last-user-message embedding.
# 0.3 = 30% history context signal, 70% question signal.
# Only relevant when cache_similarity < 1.0.
# cache_history_weight: 0.3