feat: visualization of conversation affinity in dashboard
This commit is contained in:
parent
4acbaeb29c
commit
aa7ec6354a
5 changed files with 306 additions and 19 deletions
26
config.yaml
26
config.yaml
|
|
@@ -27,14 +27,24 @@ max_concurrent_connections: 2
|
|||
# priority_routing: true
|
||||
|
||||
# Conversation affinity (optional, default: false).
|
||||
# Routes follow-up requests back to the endpoint that previously served the
|
||||
# same conversation so the llama.cpp / Ollama prompt cache (KV cache) stays
|
||||
# warm — first turn does a cold prefill, follow-ups skip it. Soft preference:
|
||||
# falls back to the standard algorithm when the affine endpoint no longer has
|
||||
# the model loaded or has no free slot. Conversations are fingerprinted by
|
||||
# (model, first system + first user turn).
|
||||
# conversation_affinity: true
|
||||
# conversation_affinity_ttl: 300 # seconds; matches Ollama's default keep_alive
|
||||
# Pins a conversation to the endpoint that served its first turn so the
|
||||
# llama.cpp / Ollama prompt cache (KV cache) stays warm — first turn pays
|
||||
# the cold prefill, every follow-up turn reuses the same prefix.
|
||||
#
|
||||
# Fingerprint = sha1(model + leading system messages + first user turn).
|
||||
# Same chat → same fingerprint on every follow-up turn → same pin, TTL
|
||||
# refreshed on each reuse. Soft preference: if the pinned endpoint no
|
||||
# longer has the model loaded or has no free slot, the standard algorithm
|
||||
# takes over (no failure, just a cache miss).
|
||||
#
|
||||
# Heads-up: most chat UIs (Open WebUI, LibreChat, …) fire side requests for
|
||||
# title / tag / follow-up generation. Those have their own first turn and
|
||||
# therefore their own pin, so a single visible "chat" may show several dots
|
||||
# in the dashboard's Affinity column. That is correct — each pin matches a
|
||||
# real warm KV prefix on the backend. See doc/configuration.md for details.
|
||||
conversation_affinity: true
|
||||
conversation_affinity_ttl: 300 # seconds of inactivity before a pin expires;
|
||||
# refreshed on every reuse. Matches Ollama's default keep_alive.
|
||||
|
||||
# Optional router-level API key that gates router/API/web UI access (leave empty to disable)
|
||||
nomyo-router-api-key: ""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue