diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml index ead7e2bf..fc1a697e 100644 --- a/config/plano_config_schema.yaml +++ b/config/plano_config_schema.yaml @@ -437,6 +437,7 @@ properties: session_max_entries: type: integer minimum: 1 + maximum: 10000 description: Maximum number of session-pinned routing cache entries. Default 10000. additionalProperties: false state_storage: diff --git a/crates/brightstaff/src/handlers/llm/mod.rs b/crates/brightstaff/src/handlers/llm/mod.rs index 6f36f955..ac460420 100644 --- a/crates/brightstaff/src/handlers/llm/mod.rs +++ b/crates/brightstaff/src/handlers/llm/mod.rs @@ -1,6 +1,8 @@ use bytes::Bytes; use common::configuration::{FilterPipeline, ModelAlias}; -use common::consts::{ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER, SESSION_ID_HEADER}; +use common::consts::{ +    ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER, ROUTING_SESSION_ID_HEADER, +}; use common::llm_providers::LlmProviders; use hermesllm::apis::openai::Message; use hermesllm::apis::openai_responses::InputParam; @@ -96,7 +98,7 @@ async fn llm_chat_inner( // Session pinning: extract session ID and check cache before routing let session_id: Option<String> = request_headers - .get(SESSION_ID_HEADER) + .get(ROUTING_SESSION_ID_HEADER) .and_then(|h| h.to_str().ok()) .map(|s| s.to_string()); let pinned_model: Option<String> = if let Some(ref sid) = session_id { diff --git a/crates/brightstaff/src/handlers/routing_service.rs b/crates/brightstaff/src/handlers/routing_service.rs index 38f1ba10..5c498519 100644 --- a/crates/brightstaff/src/handlers/routing_service.rs +++ b/crates/brightstaff/src/handlers/routing_service.rs @@ -1,6 +1,6 @@ use bytes::Bytes; use common::configuration::{SpanAttributes, TopLevelRoutingPreference}; -use common::consts::{REQUEST_ID_HEADER, SESSION_ID_HEADER}; +use common::consts::{REQUEST_ID_HEADER, ROUTING_SESSION_ID_HEADER}; use common::errors::BrightStaffError; use hermesllm::clients::SupportedAPIsFromClient; use 
hermesllm::ProviderRequestType; @@ -72,7 +72,7 @@ pub async fn routing_decision( .unwrap_or_else(|| uuid::Uuid::new_v4().to_string()); let session_id: Option<String> = request_headers - .get(SESSION_ID_HEADER) + .get(ROUTING_SESSION_ID_HEADER) .and_then(|h| h.to_str().ok()) .map(|s| s.to_string()); diff --git a/crates/brightstaff/src/router/llm.rs b/crates/brightstaff/src/router/llm.rs index d7a25d4c..5a208c6e 100644 --- a/crates/brightstaff/src/router/llm.rs +++ b/crates/brightstaff/src/router/llm.rs @@ -20,6 +20,7 @@ use crate::router::router_model_v1; const DEFAULT_SESSION_TTL_SECONDS: u64 = 600; const DEFAULT_SESSION_MAX_ENTRIES: usize = 10_000; +const MAX_SESSION_MAX_ENTRIES: usize = 10_000; #[derive(Clone, Debug)] pub struct CachedRoute { @@ -92,7 +93,9 @@ impl RouterService { let session_ttl = Duration::from_secs(session_ttl_seconds.unwrap_or(DEFAULT_SESSION_TTL_SECONDS)); - let session_max_entries = session_max_entries.unwrap_or(DEFAULT_SESSION_MAX_ENTRIES); + let session_max_entries = session_max_entries + .unwrap_or(DEFAULT_SESSION_MAX_ENTRIES) + .min(MAX_SESSION_MAX_ENTRIES); RouterService { router_url, diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs index 35a4417c..ec863ecb 100644 --- a/crates/common/src/configuration.rs +++ b/crates/common/src/configuration.rs @@ -9,7 +9,7 @@ use crate::api::open_ai::{ #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Routing { - pub model_provider: Option<String>, + pub llm_provider: Option<String>, pub model: Option<String>, pub session_ttl_seconds: Option<u64>, pub session_max_entries: Option<usize>, diff --git a/crates/common/src/consts.rs b/crates/common/src/consts.rs index 179c66ac..03074c4a 100644 --- a/crates/common/src/consts.rs +++ b/crates/common/src/consts.rs @@ -22,7 +22,7 @@ pub const X_ARCH_TOOL_CALL: &str = "x-arch-tool-call-message"; pub const X_ARCH_FC_MODEL_RESPONSE: &str = "x-arch-fc-model-response"; pub const ARCH_FC_MODEL_NAME: &str = "Arch-Function"; pub const REQUEST_ID_HEADER: &str = 
"x-request-id"; -pub const SESSION_ID_HEADER: &str = "x-session-id"; +pub const ROUTING_SESSION_ID_HEADER: &str = "x-routing-session-id"; pub const ENVOY_ORIGINAL_PATH_HEADER: &str = "x-envoy-original-path"; pub const TRACE_PARENT_HEADER: &str = "traceparent"; pub const ARCH_INTERNAL_CLUSTER_NAME: &str = "arch_internal"; diff --git a/demos/llm_routing/model_routing_service/README.md b/demos/llm_routing/model_routing_service/README.md index 0778d4ad..bc7b14c4 100644 --- a/demos/llm_routing/model_routing_service/README.md +++ b/demos/llm_routing/model_routing_service/README.md @@ -108,13 +108,13 @@ The response contains the model list — your client should try `models[0]` firs ## Session Pinning -Send an `X-Session-Id` header to pin the routing decision for a session. Once a model is selected, all subsequent requests with the same session ID return the same model without re-running routing. +Send an `X-Routing-Session-Id` header to pin the routing decision for a session. Once a model is selected, all subsequent requests with the same session ID return the same model without re-running routing. ```bash # First call — runs routing, caches result curl http://localhost:12000/routing/v1/chat/completions \ -H "Content-Type: application/json" \ - -H "X-Session-Id: my-session-123" \ + -H "X-Routing-Session-Id: my-session-123" \ -d '{ "model": "gpt-4o-mini", "messages": [{"role": "user", "content": "Write a Python function for binary search"}] @@ -136,7 +136,7 @@ Response (first call): # Second call — same session, returns cached result curl http://localhost:12000/routing/v1/chat/completions \ -H "Content-Type: application/json" \ - -H "X-Session-Id: my-session-123" \ + -H "X-Routing-Session-Id: my-session-123" \ -d '{ "model": "gpt-4o-mini", "messages": [{"role": "user", "content": "Now explain merge sort"}] @@ -161,7 +161,7 @@ routing: session_max_entries: 10000 # default: 10000 ``` -Without the `X-Session-Id` header, routing runs fresh every time (no breaking change). 
+Without the `X-Routing-Session-Id` header, routing runs fresh every time (no breaking change). ## Kubernetes Deployment (Self-hosted Arch-Router on GPU) diff --git a/demos/llm_routing/model_routing_service/demo.sh b/demos/llm_routing/model_routing_service/demo.sh index 305231a5..97d3032d 100755 --- a/demos/llm_routing/model_routing_service/demo.sh +++ b/demos/llm_routing/model_routing_service/demo.sh @@ -114,7 +114,7 @@ echo "--- 7. Session pinning - first call (fresh routing decision) ---" echo "" curl -s "$PLANO_URL/routing/v1/chat/completions" \ -H "Content-Type: application/json" \ - -H "X-Session-Id: demo-session-001" \ + -H "X-Routing-Session-Id: demo-session-001" \ -d '{ "model": "gpt-4o-mini", "messages": [ @@ -129,7 +129,7 @@ echo " Notice: same model returned with \"pinned\": true, routing was skipped echo "" curl -s "$PLANO_URL/routing/v1/chat/completions" \ -H "Content-Type: application/json" \ - -H "X-Session-Id: demo-session-001" \ + -H "X-Routing-Session-Id: demo-session-001" \ -d '{ "model": "gpt-4o-mini", "messages": [ @@ -143,7 +143,7 @@ echo "--- 9. Different session gets its own fresh routing ---" echo "" curl -s "$PLANO_URL/routing/v1/chat/completions" \ -H "Content-Type: application/json" \ - -H "X-Session-Id: demo-session-002" \ + -H "X-Routing-Session-Id: demo-session-002" \ -d '{ "model": "gpt-4o-mini", "messages": [ diff --git a/demos/llm_routing/session_pinning/README.md b/demos/llm_routing/session_pinning/README.md index a84d440e..6ea6db02 100644 --- a/demos/llm_routing/session_pinning/README.md +++ b/demos/llm_routing/session_pinning/README.md @@ -1,15 +1,15 @@ # Session Pinning Demo -> Consistent model selection for agentic loops using `X-Session-Id`. +> Consistent model selection for agentic loops using `X-Routing-Session-Id`. ## Why Session Pinning? When an agent runs in a loop — research → analyse → implement → evaluate → summarise — each step hits Plano's router independently. 
Because prompts vary in intent, the router may select **different models** for each step, fragmenting context mid-session. -**Session pinning** solves this: send an `X-Session-Id` header and the first request runs routing as usual, caching the decision. Every subsequent request with the same session ID returns the **same model**, without re-running the router. +**Session pinning** solves this: send an `X-Routing-Session-Id` header and the first request runs routing as usual, caching the decision. Every subsequent request with the same session ID returns the **same model**, without re-running the router. ``` -Without pinning With pinning (X-Session-Id) +Without pinning With pinning (X-Routing-Session-Id) ───────────────── ────────────────────────── Step 1 → claude-sonnet (code_gen) Step 1 → claude-sonnet ← routed Step 2 → gpt-4o (reasoning) Step 2 → claude-sonnet ← pinned ✓ @@ -56,8 +56,8 @@ accumulated message history. Steps alternate between `code_generation` and The demo runs the loop **twice** against `/v1/chat/completions` using the [OpenAI SDK](https://github.com/openai/openai-python): -1. **Without pinning** — no `X-Session-Id`; models alternate per step -2. **With pinning** — `X-Session-Id` header included; model is pinned from step 1 +1. **Without pinning** — no `X-Routing-Session-Id`; models alternate per step +2. **With pinning** — `X-Routing-Session-Id` header included; model is pinned from step 1 Each step makes real LLM calls. Step 5's report explicitly references findings from earlier steps, demonstrating why coherent context requires a consistent model. 
@@ -85,7 +85,7 @@ from earlier steps, demonstrating why coherent context requires a consistent mod ✗ Without pinning: model switched 4 time(s) — gpt-4o, claude-sonnet-4-20250514 - Run 2: WITH Session Pinning (X-Session-Id: a1b2c3d4…) + Run 2: WITH Session Pinning (X-Routing-Session-Id: a1b2c3d4…) ───────────────────────────────────────────────────────────────────── step 1 [claude-sonnet-4-20250514] List requirements "Critical requirements: 1. ACID transactions for order integrity…" @@ -112,12 +112,12 @@ from earlier steps, demonstrating why coherent context requires a consistent mod ### How It Works -Session pinning is implemented in brightstaff. When `X-Session-Id` is present: +Session pinning is implemented in brightstaff. When `X-Routing-Session-Id` is present: 1. **First request** — routing runs normally, result is cached keyed by session ID 2. **Subsequent requests** — cache hit skips routing and returns the cached model instantly -The `X-Session-Id` header is forwarded transparently; no changes to your OpenAI +The `X-Routing-Session-Id` header is forwarded transparently; no changes to your OpenAI SDK calls beyond adding the header. ```python @@ -130,7 +130,7 @@ session_id = str(uuid.uuid4()) response = client.chat.completions.create( model="gpt-4o-mini", messages=[{"role": "user", "content": prompt}], - extra_headers={"X-Session-Id": session_id}, # pin the session + extra_headers={"X-Routing-Session-Id": session_id}, # pin the session ) ``` @@ -146,7 +146,7 @@ routing: session_max_entries: 10000 # Max cached sessions before LRU eviction ``` -Without the `X-Session-Id` header, routing runs fresh every time — no breaking +Without the `X-Routing-Session-Id` header, routing runs fresh every time — no breaking change to existing clients. 
--- diff --git a/demos/llm_routing/session_pinning/agent.py b/demos/llm_routing/session_pinning/agent.py index ffb553d3..2d51085f 100644 --- a/demos/llm_routing/session_pinning/agent.py +++ b/demos/llm_routing/session_pinning/agent.py @@ -11,9 +11,10 @@ each with its own tool-calling loop. The tasks deliberately alternate between code_generation and complex_reasoning intents so Plano's preference-based router selects different models for each task. -If the client sends X-Session-Id, the agent forwards it on every outbound -call to Plano. The first task pins the model; all subsequent tasks skip the -router and reuse it — keeping the whole session on one consistent model. +If the client sends X-Routing-Session-Id, the agent forwards it on every +outbound call to Plano. The first task pins the model; all subsequent tasks +skip the router and reuse it — keeping the whole session on one consistent +model. Run standalone: uv run agent.py @@ -309,12 +310,12 @@ async def run_task( Each task is an independent conversation so the router sees only this task's intent — not the accumulated context of previous tasks. - Session pinning via X-Session-Id pins the model from the first task - onward, so all tasks stay on the same model. + Session pinning via X-Routing-Session-Id pins the model from the first + task onward, so all tasks stay on the same model. Returns (answer, first_model_used). 
""" - headers = {"X-Session-Id": session_id} if session_id else {} + headers = {"X-Routing-Session-Id": session_id} if session_id else {} messages: list[ChatCompletionMessageParam] = [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": prompt}, @@ -391,7 +392,7 @@ app = FastAPI(title="Research Agent", version="1.0.0") @app.post("/v1/chat/completions") async def chat(request: Request) -> JSONResponse: body = await request.json() - session_id: str | None = request.headers.get("x-session-id") + session_id: str | None = request.headers.get("x-routing-session-id") log.info("request session_id=%s", session_id or "none") diff --git a/demos/llm_routing/session_pinning/demo.py b/demos/llm_routing/session_pinning/demo.py index fdf7634b..034898e7 100644 --- a/demos/llm_routing/session_pinning/demo.py +++ b/demos/llm_routing/session_pinning/demo.py @@ -52,7 +52,7 @@ async def wait_for_agent(timeout: int = 30) -> bool: async def ask_agent(query: str, session_id: str | None = None) -> dict: headers: dict[str, str] = {} if session_id: - headers["X-Session-Id"] = session_id + headers["X-Routing-Session-Id"] = session_id async with httpx.AsyncClient(timeout=120.0) as client: r = await client.post( @@ -113,7 +113,7 @@ async def main() -> None: print(" ╚══════════════════════════════════════════════════════════════╝") print() print(f" Agent : {AGENT_URL}") - print(f" Query : \"{QUERY[:72]}…\"") + print(f' Query : "{QUERY[:72]}…"') print() print(" The agent uses a tool-calling loop (get_db_benchmarks,") print(" get_case_studies, check_feature_support) to research the") @@ -149,7 +149,7 @@ async def main() -> None: # ── Run 2 ──────────────────────────────────────────────────────────── print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") - print(f" Run 2: WITH Session Pinning (X-Session-Id: {sid[:8]}…)") + print(f" Run 2: WITH Session Pinning (X-Routing-Session-Id: {sid[:8]}…)") print(" 
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") print() print(" LLM turns inside the agent loop:")