feat: visualization of conversation affinity in dashboard

2026-05-13 13:38:37 +02:00 · 2026-05-13 13:38:37 +02:00 · aa7ec6354a
commit aa7ec6354a
parent 4acbaeb29c
5 changed files with 306 additions and 19 deletions
--- a/config.yaml
+++ b/config.yaml
@@ -27,14 +27,24 @@ max_concurrent_connections: 2
 # priority_routing: true

 # Conversation affinity (optional, default: false).
-# Routes follow-up requests back to the endpoint that previously served the
-# same conversation so the llama.cpp / Ollama prompt cache (KV cache) stays
-# warm — first turn does a cold prefill, follow-ups skip it. Soft preference:
-# falls back to the standard algorithm when the affine endpoint no longer has
-# the model loaded or has no free slot. Conversations are fingerprinted by
-# (model, first system + first user turn).
-# conversation_affinity: true
-# conversation_affinity_ttl: 300   # seconds; matches Ollama's default keep_alive
+# Pins a conversation to the endpoint that served its first turn so the
+# llama.cpp / Ollama prompt cache (KV cache) stays warm — first turn pays
+# the cold prefill; every follow-up turn reuses the same prefix.
+#
+# Fingerprint = sha1(model + leading system messages + first user turn).
+# Same chat → same fingerprint on every follow-up turn → same pin, TTL
+# refreshed on each reuse. Soft preference: if the pinned endpoint no
+# longer has the model loaded or has no free slot, the standard algorithm
+# takes over (no failure, just a cache miss).
+#
+# Heads-up: most chat UIs (Open WebUI, LibreChat, …) fire side requests for
+# title / tag / follow-up generation. Those have their own first turn and
+# therefore their own pin, so a single visible "chat" may show several dots
+# in the dashboard's Affinity column. That is correct — each pin matches a
+# real warm KV prefix on the backend. See doc/configuration.md for details.
+conversation_affinity: true
+conversation_affinity_ttl: 300   # seconds of inactivity before a pin expires;
+                                 # reset on every reuse. Matches Ollama's default keep_alive.

 # Optional router-level API key that gates router/API/web UI access (leave empty to disable)
 nomyo-router-api-key: ""