feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible
This commit is contained in:
parent
adbdad0c37
commit
27dfc07889
2 changed files with 157 additions and 52 deletions
10
config.yaml
10
config.yaml
|
|
@@ -26,6 +26,16 @@ max_concurrent_connections: 2
|
|||
# When false (default), equally-idle endpoints are chosen at random.
|
||||
# priority_routing: true
|
||||
|
||||
# Conversation affinity (optional, default: false).
|
||||
# Routes follow-up requests back to the endpoint that previously served the
|
||||
# same conversation so the llama.cpp / Ollama prompt cache (KV cache) stays
|
||||
# warm — the first turn does a cold prefill; follow-ups skip it. Soft preference:
|
||||
# falls back to the standard algorithm when the affine endpoint no longer has
|
||||
# the model loaded or has no free slot. Conversations are fingerprinted by
|
||||
# (model, first system message + first user message).
|
||||
# conversation_affinity: true
|
||||
# conversation_affinity_ttl: 300  # seconds (5 minutes); matches Ollama's default keep_alive
|
||||
|
||||
# Optional router-level API key that gates router/API/web UI access (leave empty to disable)
|
||||
nomyo-router-api-key: ""
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue