Redis-backed session cache for cross-replica model affinity (#879)
Some checks failed
CI / pre-commit (push) Has been cancelled
CI / plano-tools-tests (push) Has been cancelled
CI / native-smoke-test (push) Has been cancelled
CI / docker-build (push) Has been cancelled
CI / validate-config (push) Has been cancelled
Publish docker image (latest) / build-arm64 (push) Has been cancelled
Publish docker image (latest) / build-amd64 (push) Has been cancelled
Build and Deploy Documentation / build (push) Has been cancelled
CI / security-scan (push) Has been cancelled
CI / test-prompt-gateway (push) Has been cancelled
CI / test-model-alias-routing (push) Has been cancelled
CI / test-responses-api-with-state (push) Has been cancelled
CI / e2e-plano-tests (3.10) (push) Has been cancelled
CI / e2e-plano-tests (3.11) (push) Has been cancelled
CI / e2e-plano-tests (3.12) (push) Has been cancelled
CI / e2e-plano-tests (3.13) (push) Has been cancelled
CI / e2e-plano-tests (3.14) (push) Has been cancelled
CI / e2e-demo-preference (push) Has been cancelled
CI / e2e-demo-currency (push) Has been cancelled
Publish docker image (latest) / create-manifest (push) Has been cancelled

* add pluggable session cache with Redis backend

* add Redis session affinity demos (Docker Compose and Kubernetes)

* address PR review feedback on session cache

* document Redis session cache backend for model affinity

* sync rendered config reference with session_cache addition

* add tenant-scoped Redis session cache keys and remove dead log_affinity_hit

- Add tenant_header to SessionCacheConfig; when set, cache keys are scoped
  as plano:affinity:{tenant_id}:{session_id} for multi-tenant isolation
- Thread tenant_id through RouterService, routing_service, and llm handlers
- Use Cow<'_, str> in session_key to avoid allocation when no tenant is set
- Remove unused log_affinity_hit (logging was already inlined at call sites)

* remove session_affinity_redis and session_affinity_redis_k8s demos
This commit is contained in:
Musa 2026-04-13 19:30:47 -07:00 committed by GitHub
parent 128059e7c1
commit 980faef6be
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 1538 additions and 729 deletions

View file

@ -0,0 +1,82 @@
use std::{
num::NonZeroUsize,
sync::Arc,
time::{Duration, Instant},
};
use async_trait::async_trait;
use lru::LruCache;
use tokio::sync::Mutex;
use tracing::info;
use super::{CachedRoute, SessionCache};
// Each entry stores the cached route plus its insertion time and TTL so
// expiry can be checked on read and by the periodic sweeper.
type CacheStore = Mutex<LruCache<String, (CachedRoute, Instant, Duration)>>;

/// In-memory LRU session cache with per-entry TTLs.
///
/// Capacity-bounded by the LRU policy; TTL-expired entries are additionally
/// removed by a background sweep task spawned in [`MemorySessionCache::new`].
pub struct MemorySessionCache {
    // Arc so the background eviction task can share the store with the cache.
    store: Arc<CacheStore>,
}
impl MemorySessionCache {
    /// Create a cache holding at most `max_entries` sessions.
    ///
    /// A `max_entries` of zero falls back to a default capacity of 10,000.
    /// Spawns a background task that sweeps TTL-expired entries every five
    /// minutes. The task holds only a `Weak` reference to the store, so it
    /// terminates (and the store is freed) once the cache itself is dropped,
    /// instead of keeping both alive forever.
    ///
    /// # Panics
    /// Panics if called outside a Tokio runtime context (`tokio::spawn`).
    pub fn new(max_entries: usize) -> Self {
        let capacity = NonZeroUsize::new(max_entries)
            .unwrap_or_else(|| NonZeroUsize::new(10_000).expect("10_000 is non-zero"));
        let store = Arc::new(Mutex::new(LruCache::new(capacity)));
        // Give the sweeper a Weak handle: a strong Arc here would leak the
        // store and run the task forever after the cache is dropped.
        let weak_store = Arc::downgrade(&store);
        tokio::spawn(async move {
            let mut interval = tokio::time::interval(Duration::from_secs(300));
            loop {
                interval.tick().await;
                // All strong handles gone => cache was dropped; stop sweeping.
                let Some(store) = weak_store.upgrade() else { break };
                Self::evict_expired(&store).await;
            }
        });
        Self { store }
    }

    /// Remove every entry whose TTL has elapsed, logging how many were dropped.
    async fn evict_expired(store: &CacheStore) {
        let mut cache = store.lock().await;
        // Collect expired keys first: the cache cannot be mutated mid-iteration.
        let expired: Vec<String> = cache
            .iter()
            .filter(|(_, (_, inserted_at, ttl))| inserted_at.elapsed() >= *ttl)
            .map(|(k, _)| k.clone())
            .collect();
        let removed = expired.len();
        for key in &expired {
            cache.pop(key.as_str());
        }
        if removed > 0 {
            info!(
                removed = removed,
                remaining = cache.len(),
                "cleaned up expired session cache entries"
            );
        }
    }
}
#[async_trait]
impl SessionCache for MemorySessionCache {
    /// Return the cached route for `key` if present and not TTL-expired.
    ///
    /// An expired entry is evicted eagerly here. The previous behavior left
    /// it resident until the periodic sweep and — because `LruCache::get`
    /// promotes the entry to most-recently-used — could push fresh entries
    /// out of the LRU in favor of dead ones.
    async fn get(&self, key: &str) -> Option<CachedRoute> {
        let mut cache = self.store.lock().await;
        if let Some((route, inserted_at, ttl)) = cache.get(key) {
            if inserted_at.elapsed() < *ttl {
                return Some(route.clone());
            }
        }
        // Reached only when the key is absent or expired; pop is a no-op for
        // absent keys, so this just drops the stale entry when there is one.
        cache.pop(key);
        None
    }

    /// Store `route` under `key`, stamped with the current instant and `ttl`.
    async fn put(&self, key: &str, route: CachedRoute, ttl: Duration) {
        self.store
            .lock()
            .await
            .put(key.to_string(), (route, Instant::now(), ttl));
    }

    /// Drop any cached route for `key`.
    async fn remove(&self, key: &str) {
        self.store.lock().await.pop(key);
    }
}

View file

@ -0,0 +1,70 @@
use std::sync::Arc;
use async_trait::async_trait;
use common::configuration::Configuration;
use std::time::Duration;
use tracing::{debug, info};
pub mod memory;
pub mod redis;
/// A previously made routing decision, cached per session so follow-up
/// requests in the same session land on the same model.
///
/// Serde derives allow the value to round-trip through external backends
/// (e.g. stored as JSON in Redis).
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
pub struct CachedRoute {
    // Model the session was routed to.
    pub model_name: String,
    // Route identifier, when the decision carried one.
    pub route_name: Option<String>,
}
/// Pluggable storage backend for session → model affinity.
///
/// Implementations must be `Send + Sync` so a single instance can be shared
/// across request handlers behind an `Arc<dyn SessionCache>`.
#[async_trait]
pub trait SessionCache: Send + Sync {
    /// Look up a cached routing decision by key.
    async fn get(&self, key: &str) -> Option<CachedRoute>;
    /// Store a routing decision in the session cache with the given TTL.
    async fn put(&self, key: &str, route: CachedRoute, ttl: Duration);
    /// Remove a cached routing decision by key.
    async fn remove(&self, key: &str);
}
/// Initialize the session cache backend from config.
/// Defaults to the in-memory backend when no `session_cache` block is configured.
pub async fn init_session_cache(
config: &Configuration,
) -> Result<Arc<dyn SessionCache>, Box<dyn std::error::Error + Send + Sync>> {
use common::configuration::SessionCacheType;
let session_max_entries = config.routing.as_ref().and_then(|r| r.session_max_entries);
const DEFAULT_SESSION_MAX_ENTRIES: usize = 10_000;
const MAX_SESSION_MAX_ENTRIES: usize = 10_000;
let max_entries = session_max_entries
.unwrap_or(DEFAULT_SESSION_MAX_ENTRIES)
.min(MAX_SESSION_MAX_ENTRIES);
let cache_config = config
.routing
.as_ref()
.and_then(|r| r.session_cache.as_ref());
let cache_type = cache_config
.map(|c| &c.cache_type)
.unwrap_or(&SessionCacheType::Memory);
match cache_type {
SessionCacheType::Memory => {
info!(storage_type = "memory", "initialized session cache");
Ok(Arc::new(memory::MemorySessionCache::new(max_entries)))
}
SessionCacheType::Redis => {
let url = cache_config
.and_then(|c| c.url.as_ref())
.ok_or("session_cache.url is required when type is redis")?;
debug!(storage_type = "redis", url = %url, "initializing session cache");
let cache = redis::RedisSessionCache::new(url)
.await
.map_err(|e| format!("failed to connect to Redis session cache: {e}"))?;
Ok(Arc::new(cache))
}
}
}

View file

@ -0,0 +1,48 @@
use std::time::Duration;
use async_trait::async_trait;
use redis::aio::MultiplexedConnection;
use redis::AsyncCommands;
use super::{CachedRoute, SessionCache};
// Namespace prefix for every session-affinity key written to Redis.
const KEY_PREFIX: &str = "plano:affinity:";

/// Redis-backed session cache.
///
/// Holds a `MultiplexedConnection`, which each operation clones; per the
/// redis-rs design this shares one underlying connection rather than opening
/// a new one — NOTE(review): relies on redis-rs multiplexing semantics.
pub struct RedisSessionCache {
    conn: MultiplexedConnection,
}
impl RedisSessionCache {
    /// Open a client for `url` and establish the multiplexed async
    /// connection, surfacing any connection error to the caller.
    pub async fn new(url: &str) -> Result<Self, redis::RedisError> {
        let conn = redis::Client::open(url)?
            .get_multiplexed_async_connection()
            .await?;
        Ok(Self { conn })
    }

    /// Namespace a caller-supplied key under the shared affinity prefix.
    fn make_key(key: &str) -> String {
        let mut namespaced = String::with_capacity(KEY_PREFIX.len() + key.len());
        namespaced.push_str(KEY_PREFIX);
        namespaced.push_str(key);
        namespaced
    }
}
#[async_trait]
impl SessionCache for RedisSessionCache {
async fn get(&self, key: &str) -> Option<CachedRoute> {
let mut conn = self.conn.clone();
let value: Option<String> = conn.get(Self::make_key(key)).await.ok()?;
value.and_then(|v| serde_json::from_str(&v).ok())
}
async fn put(&self, key: &str, route: CachedRoute, ttl: Duration) {
let mut conn = self.conn.clone();
let Ok(json) = serde_json::to_string(&route) else {
return;
};
let ttl_secs = ttl.as_secs().max(1);
let _: Result<(), _> = conn.set_ex(Self::make_key(key), json, ttl_secs).await;
}
async fn remove(&self, key: &str) {
let mut conn = self.conn.clone();
let _: Result<(), _> = conn.del(Self::make_key(key)).await;
}
}