feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

2026-05-12 18:33:47 +02:00 · 2026-05-12 18:33:47 +02:00 · 27dfc07889
commit 27dfc07889
parent adbdad0c37
2 changed files with 157 additions and 52 deletions
--- a/config.yaml
+++ b/config.yaml
@@ -26,6 +26,16 @@ max_concurrent_connections: 2
 # When false (default), equally-idle endpoints are chosen at random.
 # priority_routing: true

+# Conversation affinity (optional, default: false).
+# Routes follow-up requests back to the endpoint that previously served the
+# same conversation so the llama.cpp / Ollama prompt cache (KV cache) stays
+# warm — the first turn does a cold prefill, follow-ups skip it. This is a
+# soft preference: routing falls back to the standard algorithm when that
+# endpoint no longer has the model loaded or has no free slot. Conversations
+# are fingerprinted by (model, first system message + first user turn).
+# conversation_affinity: true
+# conversation_affinity_ttl: 300   # seconds; matches Ollama's default keep_alive
+
 # Optional router-level API key that gates router/API/web UI access (leave empty to disable)
 nomyo-router-api-key: ""