rename x-session-id to x-routing-session-id and fix routing config field name

This commit is contained in:
Adil Hafeez 2026-04-08 12:31:57 -07:00
parent f699cfb059
commit 5789694d2f
11 changed files with 41 additions and 34 deletions

View file

@ -437,6 +437,7 @@ properties:
session_max_entries:
type: integer
minimum: 1
maximum: 10000
description: Maximum number of session-pinned routing cache entries. Default 10000.
additionalProperties: false
state_storage:

View file

@ -1,6 +1,8 @@
use bytes::Bytes;
use common::configuration::{FilterPipeline, ModelAlias};
use common::consts::{ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER, SESSION_ID_HEADER};
use common::consts::{
ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER, ROUTING_SESSION_ID_HEADER,
};
use common::llm_providers::LlmProviders;
use hermesllm::apis::openai::Message;
use hermesllm::apis::openai_responses::InputParam;
@ -96,7 +98,7 @@ async fn llm_chat_inner(
// Session pinning: extract session ID and check cache before routing
let session_id: Option<String> = request_headers
.get(SESSION_ID_HEADER)
.get(ROUTING_SESSION_ID_HEADER)
.and_then(|h| h.to_str().ok())
.map(|s| s.to_string());
let pinned_model: Option<String> = if let Some(ref sid) = session_id {

View file

@ -1,6 +1,6 @@
use bytes::Bytes;
use common::configuration::{SpanAttributes, TopLevelRoutingPreference};
use common::consts::{REQUEST_ID_HEADER, SESSION_ID_HEADER};
use common::consts::{REQUEST_ID_HEADER, ROUTING_SESSION_ID_HEADER};
use common::errors::BrightStaffError;
use hermesllm::clients::SupportedAPIsFromClient;
use hermesllm::ProviderRequestType;
@ -72,7 +72,7 @@ pub async fn routing_decision(
.unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
let session_id: Option<String> = request_headers
.get(SESSION_ID_HEADER)
.get(ROUTING_SESSION_ID_HEADER)
.and_then(|h| h.to_str().ok())
.map(|s| s.to_string());

View file

@ -20,6 +20,7 @@ use crate::router::router_model_v1;
const DEFAULT_SESSION_TTL_SECONDS: u64 = 600;
const DEFAULT_SESSION_MAX_ENTRIES: usize = 10_000;
const MAX_SESSION_MAX_ENTRIES: usize = 10_000;
#[derive(Clone, Debug)]
pub struct CachedRoute {
@ -92,7 +93,9 @@ impl RouterService {
let session_ttl =
Duration::from_secs(session_ttl_seconds.unwrap_or(DEFAULT_SESSION_TTL_SECONDS));
let session_max_entries = session_max_entries.unwrap_or(DEFAULT_SESSION_MAX_ENTRIES);
let session_max_entries = session_max_entries
.unwrap_or(DEFAULT_SESSION_MAX_ENTRIES)
.min(MAX_SESSION_MAX_ENTRIES);
RouterService {
router_url,

View file

@ -9,7 +9,7 @@ use crate::api::open_ai::{
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Routing {
pub model_provider: Option<String>,
pub llm_provider: Option<String>,
pub model: Option<String>,
pub session_ttl_seconds: Option<u64>,
pub session_max_entries: Option<usize>,

View file

@ -22,7 +22,7 @@ pub const X_ARCH_TOOL_CALL: &str = "x-arch-tool-call-message";
pub const X_ARCH_FC_MODEL_RESPONSE: &str = "x-arch-fc-model-response";
pub const ARCH_FC_MODEL_NAME: &str = "Arch-Function";
pub const REQUEST_ID_HEADER: &str = "x-request-id";
pub const SESSION_ID_HEADER: &str = "x-session-id";
pub const ROUTING_SESSION_ID_HEADER: &str = "x-routing-session-id";
pub const ENVOY_ORIGINAL_PATH_HEADER: &str = "x-envoy-original-path";
pub const TRACE_PARENT_HEADER: &str = "traceparent";
pub const ARCH_INTERNAL_CLUSTER_NAME: &str = "arch_internal";

View file

@ -108,13 +108,13 @@ The response contains the model list — your client should try `models[0]` firs
## Session Pinning
Send an `X-Session-Id` header to pin the routing decision for a session. Once a model is selected, all subsequent requests with the same session ID return the same model without re-running routing.
Send an `X-Routing-Session-Id` header to pin the routing decision for a session. Once a model is selected, all subsequent requests with the same session ID return the same model without re-running routing.
```bash
# First call — runs routing, caches result
curl http://localhost:12000/routing/v1/chat/completions \
-H "Content-Type: application/json" \
-H "X-Session-Id: my-session-123" \
-H "X-Routing-Session-Id: my-session-123" \
-d '{
"model": "gpt-4o-mini",
"messages": [{"role": "user", "content": "Write a Python function for binary search"}]
@ -136,7 +136,7 @@ Response (first call):
# Second call — same session, returns cached result
curl http://localhost:12000/routing/v1/chat/completions \
-H "Content-Type: application/json" \
-H "X-Session-Id: my-session-123" \
-H "X-Routing-Session-Id: my-session-123" \
-d '{
"model": "gpt-4o-mini",
"messages": [{"role": "user", "content": "Now explain merge sort"}]
@ -161,7 +161,7 @@ routing:
session_max_entries: 10000 # default: 10000
```
Without the `X-Session-Id` header, routing runs fresh every time (no breaking change).
Without the `X-Routing-Session-Id` header, routing runs fresh every time (no breaking change).
## Kubernetes Deployment (Self-hosted Arch-Router on GPU)

View file

@ -114,7 +114,7 @@ echo "--- 7. Session pinning - first call (fresh routing decision) ---"
echo ""
curl -s "$PLANO_URL/routing/v1/chat/completions" \
-H "Content-Type: application/json" \
-H "X-Session-Id: demo-session-001" \
-H "X-Routing-Session-Id: demo-session-001" \
-d '{
"model": "gpt-4o-mini",
"messages": [
@ -129,7 +129,7 @@ echo " Notice: same model returned with \"pinned\": true, routing was skipped
echo ""
curl -s "$PLANO_URL/routing/v1/chat/completions" \
-H "Content-Type: application/json" \
-H "X-Session-Id: demo-session-001" \
-H "X-Routing-Session-Id: demo-session-001" \
-d '{
"model": "gpt-4o-mini",
"messages": [
@ -143,7 +143,7 @@ echo "--- 9. Different session gets its own fresh routing ---"
echo ""
curl -s "$PLANO_URL/routing/v1/chat/completions" \
-H "Content-Type: application/json" \
-H "X-Session-Id: demo-session-002" \
-H "X-Routing-Session-Id: demo-session-002" \
-d '{
"model": "gpt-4o-mini",
"messages": [

View file

@ -1,15 +1,15 @@
# Session Pinning Demo
> Consistent model selection for agentic loops using `X-Session-Id`.
> Consistent model selection for agentic loops using `X-Routing-Session-Id`.
## Why Session Pinning?
When an agent runs in a loop — research → analyse → implement → evaluate → summarise — each step hits Plano's router independently. Because prompts vary in intent, the router may select **different models** for each step, fragmenting context mid-session.
**Session pinning** solves this: send an `X-Session-Id` header and the first request runs routing as usual, caching the decision. Every subsequent request with the same session ID returns the **same model**, without re-running the router.
**Session pinning** solves this: send an `X-Routing-Session-Id` header and the first request runs routing as usual, caching the decision. Every subsequent request with the same session ID returns the **same model**, without re-running the router.
```
Without pinning With pinning (X-Session-Id)
Without pinning With pinning (X-Routing-Session-Id)
───────────────── ──────────────────────────
Step 1 → claude-sonnet (code_gen) Step 1 → claude-sonnet ← routed
Step 2 → gpt-4o (reasoning) Step 2 → claude-sonnet ← pinned ✓
@ -56,8 +56,8 @@ accumulated message history. Steps alternate between `code_generation` and
The demo runs the loop **twice** against `/v1/chat/completions` using the
[OpenAI SDK](https://github.com/openai/openai-python):
1. **Without pinning** — no `X-Session-Id`; models alternate per step
2. **With pinning** — `X-Session-Id` header included; model is pinned from step 1
1. **Without pinning** — no `X-Routing-Session-Id`; models alternate per step
2. **With pinning** — `X-Routing-Session-Id` header included; model is pinned from step 1
Each step makes real LLM calls. Step 5's report explicitly references findings
from earlier steps, demonstrating why coherent context requires a consistent model.
@ -85,7 +85,7 @@ from earlier steps, demonstrating why coherent context requires a consistent mod
✗ Without pinning: model switched 4 time(s) — gpt-4o, claude-sonnet-4-20250514
Run 2: WITH Session Pinning (X-Session-Id: a1b2c3d4…)
Run 2: WITH Session Pinning (X-Routing-Session-Id: a1b2c3d4…)
─────────────────────────────────────────────────────────────────────
step 1 [claude-sonnet-4-20250514] List requirements
"Critical requirements: 1. ACID transactions for order integrity…"
@ -112,12 +112,12 @@ from earlier steps, demonstrating why coherent context requires a consistent mod
### How It Works
Session pinning is implemented in brightstaff. When `X-Session-Id` is present:
Session pinning is implemented in brightstaff. When `X-Routing-Session-Id` is present:
1. **First request** — routing runs normally, result is cached keyed by session ID
2. **Subsequent requests** — cache hit skips routing and returns the cached model instantly
The `X-Session-Id` header is forwarded transparently; no changes to your OpenAI
The `X-Routing-Session-Id` header is forwarded transparently; no changes to your OpenAI
SDK calls beyond adding the header.
```python
@ -130,7 +130,7 @@ session_id = str(uuid.uuid4())
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{"role": "user", "content": prompt}],
extra_headers={"X-Session-Id": session_id}, # pin the session
extra_headers={"X-Routing-Session-Id": session_id}, # pin the session
)
```
@ -146,7 +146,7 @@ routing:
session_max_entries: 10000 # Max cached sessions before LRU eviction
```
Without the `X-Session-Id` header, routing runs fresh every time — no breaking
Without the `X-Routing-Session-Id` header, routing runs fresh every time — no breaking
change to existing clients.
---

View file

@ -11,9 +11,10 @@ each with its own tool-calling loop. The tasks deliberately alternate between
code_generation and complex_reasoning intents so Plano's preference-based
router selects different models for each task.
If the client sends X-Session-Id, the agent forwards it on every outbound
call to Plano. The first task pins the model; all subsequent tasks skip the
router and reuse it — keeping the whole session on one consistent model.
If the client sends X-Routing-Session-Id, the agent forwards it on every
outbound call to Plano. The first task pins the model; all subsequent tasks
skip the router and reuse it — keeping the whole session on one consistent
model.
Run standalone:
uv run agent.py
@ -309,12 +310,12 @@ async def run_task(
Each task is an independent conversation so the router sees only
this task's intent — not the accumulated context of previous tasks.
Session pinning via X-Session-Id pins the model from the first task
onward, so all tasks stay on the same model.
Session pinning via X-Routing-Session-Id pins the model from the first
task onward, so all tasks stay on the same model.
Returns (answer, first_model_used).
"""
headers = {"X-Session-Id": session_id} if session_id else {}
headers = {"X-Routing-Session-Id": session_id} if session_id else {}
messages: list[ChatCompletionMessageParam] = [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": prompt},
@ -391,7 +392,7 @@ app = FastAPI(title="Research Agent", version="1.0.0")
@app.post("/v1/chat/completions")
async def chat(request: Request) -> JSONResponse:
body = await request.json()
session_id: str | None = request.headers.get("x-session-id")
session_id: str | None = request.headers.get("x-routing-session-id")
log.info("request session_id=%s", session_id or "none")

View file

@ -52,7 +52,7 @@ async def wait_for_agent(timeout: int = 30) -> bool:
async def ask_agent(query: str, session_id: str | None = None) -> dict:
headers: dict[str, str] = {}
if session_id:
headers["X-Session-Id"] = session_id
headers["X-Routing-Session-Id"] = session_id
async with httpx.AsyncClient(timeout=120.0) as client:
r = await client.post(
@ -113,7 +113,7 @@ async def main() -> None:
print(" ╚══════════════════════════════════════════════════════════════╝")
print()
print(f" Agent : {AGENT_URL}")
print(f" Query : \"{QUERY[:72]}\"")
print(f' Query : "{QUERY[:72]}"')
print()
print(" The agent uses a tool-calling loop (get_db_benchmarks,")
print(" get_case_studies, check_feature_support) to research the")
@ -149,7 +149,7 @@ async def main() -> None:
# ── Run 2 ────────────────────────────────────────────────────────────
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print(f" Run 2: WITH Session Pinning (X-Session-Id: {sid[:8]}…)")
print(f" Run 2: WITH Session Pinning (X-Routing-Session-Id: {sid[:8]}…)")
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print()
print(" LLM turns inside the agent loop:")