# config.yaml

endpoints:
  - http://192.168.0.50:11434
  - http://192.168.0.51:11434
  - http://192.168.0.52:11434
  - https://api.openai.com/v1

llama_server_endpoints:
  - http://192.168.0.50:8889/v1

# Maximum concurrent connections *per endpoint-model pair* (equivalent to OLLAMA_NUM_PARALLEL)
max_concurrent_connections: 2

# Optional router-level API key that gates router/API/web UI access (leave empty to disable)
nomyo-router-api-key: ""

# API keys for remote endpoints.
# Set an environment variable such as OPENAI_KEY and reference it with ${...}.
# Endpoint URLs must match the entries in the endpoints block exactly.
api_keys:
  "http://192.168.0.50:11434": "ollama"
  "http://192.168.0.51:11434": "ollama"
  "http://192.168.0.52:11434": "ollama"
  "https://api.openai.com/v1": "${OPENAI_KEY}"
  "http://192.168.0.50:8889/v1": "llama"

# -------------------------------------------------------------
# Semantic LLM Cache (optional — disabled by default)
# Caches LLM responses to cut costs and latency on repeated or
# semantically similar prompts.
# Cached routes: /api/chat /api/generate /v1/chat/completions /v1/completions
# MOE requests (moe-* model prefix) always bypass the cache.
# -------------------------------------------------------------
# cache_enabled: true

# Backend — where cached responses are stored:
#   memory → in-process LRU (lost on restart, not shared across replicas) [default]
#   sqlite → persistent file-based (single instance, survives restart)
#   redis  → distributed (shared across replicas, requires Redis)
# cache_backend: memory

# Cosine similarity threshold for a cache hit:
#   1.0  → exact match only (works on any image variant)
#   <1.0 → semantic matching (requires the :semantic Docker image tag)
# cache_similarity: 0.9

# Response TTL in seconds. Remove the key or set it to null to cache forever.
# cache_ttl: 3600

# SQLite backend: path to the cache database file
# cache_db_path: llm_cache.db

# Redis backend: connection URL
# cache_redis_url: redis://localhost:6379/0

# Weight of the BM25-weighted chat-history embedding vs the last-user-message embedding.
# 0.3 = 30% history-context signal, 70% question signal.
# Only relevant when cache_similarity < 1.0.
# cache_history_weight: 0.3