rename session pinning to model affinity with x-model-affinity header

This commit is contained in:
Adil Hafeez 2026-04-08 15:23:53 -07:00
parent 5789694d2f
commit da9792c2dd
14 changed files with 468 additions and 371 deletions

View file

@@ -1,8 +1,6 @@
 use bytes::Bytes;
 use common::configuration::{FilterPipeline, ModelAlias};
-use common::consts::{
-    ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER, ROUTING_SESSION_ID_HEADER,
-};
+use common::consts::{ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER, MODEL_AFFINITY_HEADER};
 use common::llm_providers::LlmProviders;
 use hermesllm::apis::openai::Message;
 use hermesllm::apis::openai_responses::InputParam;
@@ -98,7 +96,7 @@ async fn llm_chat_inner(
     // Session pinning: extract session ID and check cache before routing
     let session_id: Option<String> = request_headers
-        .get(ROUTING_SESSION_ID_HEADER)
+        .get(MODEL_AFFINITY_HEADER)
         .and_then(|h| h.to_str().ok())
         .map(|s| s.to_string());
     let pinned_model: Option<String> = if let Some(ref sid) = session_id {

View file

@@ -1,6 +1,6 @@
 use bytes::Bytes;
 use common::configuration::{SpanAttributes, TopLevelRoutingPreference};
-use common::consts::{REQUEST_ID_HEADER, ROUTING_SESSION_ID_HEADER};
+use common::consts::{MODEL_AFFINITY_HEADER, REQUEST_ID_HEADER};
 use common::errors::BrightStaffError;
 use hermesllm::clients::SupportedAPIsFromClient;
 use hermesllm::ProviderRequestType;
@@ -72,7 +72,7 @@ pub async fn routing_decision(
         .unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
     let session_id: Option<String> = request_headers
-        .get(ROUTING_SESSION_ID_HEADER)
+        .get(MODEL_AFFINITY_HEADER)
         .and_then(|h| h.to_str().ok())
         .map(|s| s.to_string());

View file

@@ -22,7 +22,7 @@ pub const X_ARCH_TOOL_CALL: &str = "x-arch-tool-call-message";
 pub const X_ARCH_FC_MODEL_RESPONSE: &str = "x-arch-fc-model-response";
 pub const ARCH_FC_MODEL_NAME: &str = "Arch-Function";
 pub const REQUEST_ID_HEADER: &str = "x-request-id";
-pub const ROUTING_SESSION_ID_HEADER: &str = "x-routing-session-id";
+pub const MODEL_AFFINITY_HEADER: &str = "x-model-affinity";
 pub const ENVOY_ORIGINAL_PATH_HEADER: &str = "x-envoy-original-path";
 pub const TRACE_PARENT_HEADER: &str = "traceparent";
 pub const ARCH_INTERNAL_CLUSTER_NAME: &str = "arch_internal";

View file

@@ -0,0 +1,135 @@
# Model Affinity Demo
> Consistent model selection for agentic loops using `X-Model-Affinity`.
## Why Model Affinity?
When an agent runs in a loop — calling tools, reasoning about results, calling more tools — each LLM request hits Plano's router independently. Because prompts vary in intent (tool selection looks like code generation, reasoning about results looks like complex analysis), the router may select **different models** for each turn, fragmenting context mid-session.
**Model affinity** solves this: send an `X-Model-Affinity` header and the first request runs routing as usual, caching the decision. Every subsequent request with the same affinity ID returns the **same model**, without re-running the router.
```
Without affinity With affinity (X-Model-Affinity)
──────────────── ───────────────────────────────
Turn 1 → claude-sonnet (tool calls) Turn 1 → claude-sonnet ← routed
Turn 2 → gpt-4o (reasoning) Turn 2 → claude-sonnet ← pinned ✓
Turn 3 → claude-sonnet (tool calls) Turn 3 → claude-sonnet ← pinned ✓
Turn 4 → gpt-4o (reasoning) Turn 4 → claude-sonnet ← pinned ✓
Turn 5 → claude-sonnet (final answer) Turn 5 → claude-sonnet ← pinned ✓
↑ model switches every turn ↑ one model, start to finish
```
---
## Quick Start
```bash
# 1. Set API keys
export OPENAI_API_KEY=<your-key>
export ANTHROPIC_API_KEY=<your-key>
# 2. Start Plano
cd demos/llm_routing/model_affinity
planoai up config.yaml
# 3. Run the demo (uv manages dependencies automatically)
./demo.sh # or: uv run demo.py
```
---
## What the Demo Does
A **database selection agent** investigates whether to use PostgreSQL or MongoDB
for an e-commerce platform. It runs a real tool-calling loop: the LLM decides
which tools to call, receives simulated results, and continues until it has
enough data to recommend a database.
Available tools:
- `get_db_benchmarks` — fetch performance data for a workload type
- `get_case_studies` — retrieve real-world e-commerce case studies
- `check_feature_support` — check if a database supports a specific feature
The demo runs the **same agent loop twice**:
1. **Without affinity** — no `X-Model-Affinity`; models may switch between turns
2. **With affinity** — `X-Model-Affinity` header included; model is pinned from turn 1
Each turn is a separate `POST /v1/chat/completions` request to Plano using the
[OpenAI SDK](https://github.com/openai/openai-python). The demo prints the
model used on each turn so you can see the difference.
### Expected Output
```
Run 1: WITHOUT Model Affinity
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
turn 1 [claude-sonnet-4-20250514 ] get_db_benchmarks, get_db_benchmarks
turn 2 [gpt-4o ] get_case_studies, get_case_studies ← switched
turn 3 [claude-sonnet-4-20250514 ] check_feature_support ← switched
turn 4 [gpt-4o ] final answer ← switched
✗ Without affinity: model switched 3 time(s)
Run 2: WITH Model Affinity (X-Model-Affinity: a1b2c3d4…)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
turn 1 [claude-sonnet-4-20250514 ] get_db_benchmarks, get_db_benchmarks
turn 2 [claude-sonnet-4-20250514 ] get_case_studies, get_case_studies
turn 3 [claude-sonnet-4-20250514 ] check_feature_support
turn 4 [claude-sonnet-4-20250514 ] final answer
✓ With affinity: claude-sonnet-4-20250514 for all 4 turns
```
### How It Works
Model affinity is implemented in brightstaff. When `X-Model-Affinity` is present:
1. **First request** — routing runs normally, result is cached keyed by the affinity ID
2. **Subsequent requests** — cache hit skips routing and returns the cached model instantly
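Conceptually, the cache is a small TTL-bounded map from affinity ID to model name. Here is a minimal Python sketch of the idea (the real logic lives in brightstaff's Rust code, so the class and method names below are illustrative, not the actual API):
```python
import time

class AffinityCache:
    """Illustrative TTL cache mapping affinity ID -> pinned model name."""

    def __init__(self, ttl_seconds: float = 600, max_entries: int = 10_000) -> None:
        self.ttl = ttl_seconds
        self.max_entries = max_entries
        self._entries: dict[str, tuple[str, float]] = {}  # id -> (model, expires_at)

    def get(self, affinity_id: str) -> str | None:
        entry = self._entries.get(affinity_id)
        if entry is None:
            return None
        model, expires_at = entry
        if time.monotonic() > expires_at:
            # TTL expired: the next request with this ID re-runs routing
            del self._entries[affinity_id]
            return None
        return model

    def put(self, affinity_id: str, model: str) -> None:
        if len(self._entries) >= self.max_entries:
            # Evict the oldest entry once the cache is full
            self._entries.pop(next(iter(self._entries)))
        self._entries[affinity_id] = (model, time.monotonic() + self.ttl)
```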
The `X-Model-Affinity` header is forwarded transparently; no changes are needed to your
OpenAI SDK calls beyond adding the header.
```python
from openai import OpenAI
import uuid
client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY")
affinity_id = str(uuid.uuid4())
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{"role": "user", "content": prompt}],
extra_headers={"X-Model-Affinity": affinity_id},
)
```
---
## Configuration
Model affinity is configurable in `config.yaml`:
```yaml
routing:
  session_ttl_seconds: 600     # How long affinity lasts (default: 10 min)
  session_max_entries: 10000   # Max cached sessions before LRU eviction (default: 10000)
```
Without the `X-Model-Affinity` header, routing runs fresh every time — no breaking
change to existing clients.
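In practice, scope the affinity ID to a conversation: reuse one ID for every turn of that conversation, and mint a new ID for each new conversation so it gets its own fresh routing decision. A sketch (same client setup as above):
```python
import uuid
from openai import OpenAI

client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY")

# One affinity ID per conversation: turn 1 routes, later turns reuse the model.
headers = {"X-Model-Affinity": str(uuid.uuid4())}

for prompt in ["Compare PostgreSQL and MongoDB", "Now write the orders schema"]:
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        extra_headers=headers,  # same ID on every turn of this conversation
    )
    print(resp.model)  # stays constant while the affinity entry is live

# A new conversation gets a new ID, and therefore a fresh routing decision.
```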
---
## Advanced: Agent Server Demo
The `agent.py` file is a FastAPI-based agent server that demonstrates a more
complex pattern: an external agent service that forwards `X-Model-Affinity`
on all outbound calls to Plano. Use `start_agents.sh` to run it.
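The essential move is plain header forwarding: read `X-Model-Affinity` off the inbound request and attach it to every outbound Plano call. A trimmed sketch of that pattern (see `agent.py` for the full tool-calling version):
```python
from fastapi import FastAPI, Request
from openai import AsyncOpenAI

app = FastAPI()
plano = AsyncOpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY")

@app.post("/v1/chat/completions")
async def chat(request: Request) -> dict:
    body = await request.json()
    # Forward the caller's affinity ID so every turn of the agent's
    # internal loop lands on the same pinned model.
    affinity_id = request.headers.get("x-model-affinity")
    headers = {"X-Model-Affinity": affinity_id} if affinity_id else {}
    resp = await plano.chat.completions.create(
        model="gpt-4o-mini",
        messages=body["messages"],
        extra_headers=headers,
    )
    return resp.model_dump()
```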
## See Also
- [Model Routing Service Demo](../model_routing_service/) — curl-based examples of the routing endpoint

View file

@@ -11,10 +11,9 @@ each with its own tool-calling loop. The tasks deliberately alternate between
 code_generation and complex_reasoning intents so Plano's preference-based
 router selects different models for each task.
-If the client sends X-Routing-Session-Id, the agent forwards it on every
-outbound call to Plano. The first task pins the model; all subsequent tasks
-skip the router and reuse it, keeping the whole session on one consistent
-model.
+If the client sends X-Model-Affinity, the agent forwards it on every outbound
+call to Plano. The first task pins the model; all subsequent tasks skip the
+router and reuse it, keeping the whole session on one consistent model.
 Run standalone:
     uv run agent.py
@@ -310,12 +309,12 @@ async def run_task(
     Each task is an independent conversation so the router sees only
     this task's intent — not the accumulated context of previous tasks.
-    Session pinning via X-Routing-Session-Id pins the model from the first
-    task onward, so all tasks stay on the same model.
+    Model affinity via X-Model-Affinity pins the model from the first task
+    onward, so all tasks stay on the same model.
     Returns (answer, first_model_used).
     """
-    headers = {"X-Routing-Session-Id": session_id} if session_id else {}
+    headers = {"X-Model-Affinity": session_id} if session_id else {}
     messages: list[ChatCompletionMessageParam] = [
         {"role": "system", "content": SYSTEM_PROMPT},
         {"role": "user", "content": prompt},
@@ -392,7 +391,7 @@ app = FastAPI(title="Research Agent", version="1.0.0")
 @app.post("/v1/chat/completions")
 async def chat(request: Request) -> JSONResponse:
     body = await request.json()
-    session_id: str | None = request.headers.get("x-routing-session-id")
+    session_id: str | None = request.headers.get("x-model-affinity")
     log.info("request session_id=%s", session_id or "none")

View file

@@ -0,0 +1,307 @@
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = ["openai>=1.0.0"]
# ///
"""
Model Affinity Demo — Agentic Tool-Calling Loop
Runs the same agentic loop twice through Plano:
1. Without model affinity — the router may pick different models per turn
2. With model affinity — all turns use the model selected on turn 1
Each loop is a real tool-calling agent: the LLM decides which tools to call,
we provide simulated results, and the LLM continues until it has enough
information to produce a final answer. Each turn is a separate request to
Plano, so the router classifies intent independently every time.
Usage:
planoai up config.yaml # start Plano
uv run demo.py # run this demo
"""
import asyncio
import json
import os
import uuid
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionMessageParam
PLANO_URL = os.environ.get("PLANO_URL", "http://localhost:12000")
SYSTEM_PROMPT = (
"You are a database selection analyst. Use the provided tools to gather "
"benchmark data and case studies, then recommend PostgreSQL or MongoDB "
"for a high-traffic e-commerce backend. Be concise."
)
USER_QUERY = (
"Should we use PostgreSQL or MongoDB for our e-commerce platform? "
"We need strong consistency for orders but flexible schemas for products. "
"Use the tools to research both options, then give a recommendation."
)
TOOLS = [
{
"type": "function",
"function": {
"name": "get_db_benchmarks",
"description": "Fetch performance benchmarks for a database under a given workload.",
"parameters": {
"type": "object",
"properties": {
"database": {
"type": "string",
"enum": ["postgresql", "mongodb"],
},
"workload": {
"type": "string",
"enum": ["read_heavy", "write_heavy", "mixed"],
},
},
"required": ["database", "workload"],
},
},
},
{
"type": "function",
"function": {
"name": "get_case_studies",
"description": "Retrieve real-world e-commerce case studies for a database.",
"parameters": {
"type": "object",
"properties": {
"database": {
"type": "string",
"enum": ["postgresql", "mongodb"],
},
},
"required": ["database"],
},
},
},
{
"type": "function",
"function": {
"name": "check_feature_support",
"description": "Check if a database supports a specific feature.",
"parameters": {
"type": "object",
"properties": {
"database": {
"type": "string",
"enum": ["postgresql", "mongodb"],
},
"feature": {"type": "string"},
},
"required": ["database", "feature"],
},
},
},
]
# Simulated tool responses
_BENCHMARKS = {
("postgresql", "mixed"): {
"read_qps": 42000,
"write_qps": 21000,
"p99_ms": 6,
"notes": "Solid all-round; MVCC keeps reads non-blocking",
},
("mongodb", "mixed"): {
"read_qps": 60000,
"write_qps": 50000,
"p99_ms": 3,
"notes": "Flexible schema accelerates feature iteration",
},
}
_CASE_STUDIES = {
"postgresql": [
{"company": "Shopify", "notes": "Moved orders back to Postgres for ACID"},
{
"company": "Zalando",
"notes": "Postgres + Citus for sharded order processing",
},
],
"mongodb": [
{"company": "eBay", "notes": "Product catalogue — flexible attribute schemas"},
{"company": "Alibaba", "notes": "Session/cart data — high write throughput"},
],
}
_FEATURES = {
("postgresql", "acid transactions"): {"supported": True, "notes": "Full ACID"},
("mongodb", "acid transactions"): {
"supported": True,
"notes": "Multi-doc ACID since v4.0",
},
("postgresql", "horizontal sharding"): {
"supported": True,
"notes": "Via Citus extension",
},
("mongodb", "horizontal sharding"): {
"supported": True,
"notes": "Native auto-balancing",
},
}
def dispatch_tool(name: str, args: dict) -> str:
if name == "get_db_benchmarks":
key = (args["database"], args["workload"])
return json.dumps(_BENCHMARKS.get(key, {"error": f"no data for {key}"}))
if name == "get_case_studies":
return json.dumps(_CASE_STUDIES.get(args["database"], {"error": "unknown db"}))
if name == "check_feature_support":
key = (args["database"], args["feature"].lower())
for k, v in _FEATURES.items():
if k[0] == key[0] and k[1] in key[1]:
return json.dumps(v)
return json.dumps({"error": f"no data for {key}"})
return json.dumps({"error": f"unknown tool {name}"})
# ---------------------------------------------------------------------------
# Agentic loop — runs tool calls until the LLM produces a final answer
# ---------------------------------------------------------------------------
async def run_agent_loop(
affinity_id: str | None = None,
max_turns: int = 10,
) -> tuple[str, list[dict]]:
"""
Run a tool-calling agent loop against Plano.
Returns (final_answer, trace) where trace is a list of
{"turn": int, "model": str, "tool_calls": [...]} dicts.
"""
client = AsyncOpenAI(base_url=f"{PLANO_URL}/v1", api_key="EMPTY")
headers = {"X-Model-Affinity": affinity_id} if affinity_id else None
messages: list[ChatCompletionMessageParam] = [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": USER_QUERY},
]
trace: list[dict] = []
for turn in range(1, max_turns + 1):
resp = await client.chat.completions.create(
model="gpt-4o-mini",
messages=messages,
tools=TOOLS,
tool_choice="auto",
max_completion_tokens=800,
extra_headers=headers,
)
choice = resp.choices[0]
turn_info: dict = {"turn": turn, "model": resp.model}
if choice.finish_reason == "tool_calls" and choice.message.tool_calls:
tool_names = [tc.function.name for tc in choice.message.tool_calls]
turn_info["tool_calls"] = tool_names
trace.append(turn_info)
messages.append(choice.message)
for tc in choice.message.tool_calls:
args = json.loads(tc.function.arguments or "{}")
result = dispatch_tool(tc.function.name, args)
messages.append(
{"role": "tool", "content": result, "tool_call_id": tc.id}
)
else:
turn_info["tool_calls"] = []
trace.append(turn_info)
return (choice.message.content or "").strip(), trace
return "(max turns reached)", trace
# ---------------------------------------------------------------------------
# Display helpers
# ---------------------------------------------------------------------------
def short_model(model: str) -> str:
return model.split("/")[-1] if "/" in model else model
def print_trace(trace: list[dict]) -> None:
for t in trace:
model = short_model(t["model"])
tools = ", ".join(t["tool_calls"]) if t["tool_calls"] else "final answer"
print(f" turn {t['turn']} [{model:<30}] {tools}")
def print_summary(label: str, trace: list[dict]) -> None:
models = [t["model"] for t in trace]
unique = set(models)
if len(unique) == 1:
print(
f"{label}: {short_model(next(iter(unique)))} "
f"for all {len(models)} turns"
)
else:
switches = sum(1 for a, b in zip(models, models[1:]) if a != b)
names = ", ".join(sorted(short_model(m) for m in unique))
print(f"{label}: model switched {switches} time(s) — {names}")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
async def main() -> None:
print()
print(" ╔══════════════════════════════════════════════════════════╗")
print(" ║ Model Affinity Demo — Agentic Loop ║")
print(" ╚══════════════════════════════════════════════════════════╝")
print()
print(f" Plano : {PLANO_URL}")
print(f' Query : "{USER_QUERY[:65]}"')
print()
print(" The agent calls tools (get_db_benchmarks, get_case_studies,")
print(" check_feature_support) across multiple turns. Each turn is")
print(" a separate request to Plano — the router classifies intent")
print(" independently, so different turns may get different models.")
print()
# --- Run 1: without affinity ---
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print(" Run 1: WITHOUT Model Affinity")
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print()
answer1, trace1 = await run_agent_loop(affinity_id=None)
print_trace(trace1)
print()
print_summary("Without affinity", trace1)
print()
# --- Run 2: with affinity ---
aid = str(uuid.uuid4())
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print(f" Run 2: WITH Model Affinity (X-Model-Affinity: {aid[:8]}…)")
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print()
answer2, trace2 = await run_agent_loop(affinity_id=aid)
print_trace(trace2)
print()
print_summary("With affinity ", trace2)
print()
# --- Final answer ---
print(" ══ Agent recommendation (affinity session) ════════════════")
print()
for line in answer2.splitlines():
print(f" {line}")
print()
print(" ═══════════════════════════════════════════════════════════")
print()
if __name__ == "__main__":
asyncio.run(main())

View file

@@ -0,0 +1,7 @@
#!/bin/bash
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Run the demo directly against Plano (no agent server needed)
uv run "$SCRIPT_DIR/demo.py"

View file

@@ -108,13 +108,13 @@ The response contains the model list — your client should try `models[0]` first
 ## Session Pinning
-Send an `X-Routing-Session-Id` header to pin the routing decision for a session. Once a model is selected, all subsequent requests with the same session ID return the same model without re-running routing.
+Send an `X-Model-Affinity` header to pin the routing decision for a session. Once a model is selected, all subsequent requests with the same session ID return the same model without re-running routing.
 ```bash
 # First call — runs routing, caches result
 curl http://localhost:12000/routing/v1/chat/completions \
   -H "Content-Type: application/json" \
-  -H "X-Routing-Session-Id: my-session-123" \
+  -H "X-Model-Affinity: my-session-123" \
   -d '{
     "model": "gpt-4o-mini",
     "messages": [{"role": "user", "content": "Write a Python function for binary search"}]
@@ -136,7 +136,7 @@ Response (first call):
 # Second call — same session, returns cached result
 curl http://localhost:12000/routing/v1/chat/completions \
   -H "Content-Type: application/json" \
-  -H "X-Routing-Session-Id: my-session-123" \
+  -H "X-Model-Affinity: my-session-123" \
   -d '{
     "model": "gpt-4o-mini",
     "messages": [{"role": "user", "content": "Now explain merge sort"}]
@@ -161,7 +161,7 @@ routing:
   session_max_entries: 10000  # default: 10000
 ```
-Without the `X-Routing-Session-Id` header, routing runs fresh every time (no breaking change).
+Without the `X-Model-Affinity` header, routing runs fresh every time (no breaking change).
 ## Kubernetes Deployment (Self-hosted Arch-Router on GPU)

View file

@@ -114,7 +114,7 @@ echo "--- 7. Session pinning - first call (fresh routing decision) ---"
 echo ""
 curl -s "$PLANO_URL/routing/v1/chat/completions" \
   -H "Content-Type: application/json" \
-  -H "X-Routing-Session-Id: demo-session-001" \
+  -H "X-Model-Affinity: demo-session-001" \
   -d '{
     "model": "gpt-4o-mini",
     "messages": [
@@ -129,7 +129,7 @@ echo " Notice: same model returned with \"pinned\": true, routing was skipped
 echo ""
 curl -s "$PLANO_URL/routing/v1/chat/completions" \
   -H "Content-Type: application/json" \
-  -H "X-Routing-Session-Id: demo-session-001" \
+  -H "X-Model-Affinity: demo-session-001" \
   -d '{
     "model": "gpt-4o-mini",
     "messages": [
@@ -143,7 +143,7 @@ echo "--- 9. Different session gets its own fresh routing ---"
 echo ""
 curl -s "$PLANO_URL/routing/v1/chat/completions" \
   -H "Content-Type: application/json" \
-  -H "X-Routing-Session-Id: demo-session-002" \
+  -H "X-Model-Affinity: demo-session-002" \
   -d '{
     "model": "gpt-4o-mini",
     "messages": [

View file

@@ -1,156 +0,0 @@
# Session Pinning Demo
> Consistent model selection for agentic loops using `X-Routing-Session-Id`.
## Why Session Pinning?
When an agent runs in a loop — research → analyse → implement → evaluate → summarise — each step hits Plano's router independently. Because prompts vary in intent, the router may select **different models** for each step, fragmenting context mid-session.
**Session pinning** solves this: send an `X-Routing-Session-Id` header and the first request runs routing as usual, caching the decision. Every subsequent request with the same session ID returns the **same model**, without re-running the router.
```
Without pinning With pinning (X-Routing-Session-Id)
───────────────── ──────────────────────────
Step 1 → claude-sonnet (code_gen) Step 1 → claude-sonnet ← routed
Step 2 → gpt-4o (reasoning) Step 2 → claude-sonnet ← pinned ✓
Step 3 → claude-sonnet (code_gen) Step 3 → claude-sonnet ← pinned ✓
Step 4 → gpt-4o (reasoning) Step 4 → claude-sonnet ← pinned ✓
Step 5 → claude-sonnet (code_gen) Step 5 → claude-sonnet ← pinned ✓
↑ model switches every step ↑ one model, start to finish
```
---
## Quick Start
```bash
# 1. Set API keys
export OPENAI_API_KEY=<your-key>
export ANTHROPIC_API_KEY=<your-key>
# 2. Start Plano
cd demos/llm_routing/session_pinning
planoai up config.yaml
# 3. Run the demo (uv manages dependencies automatically)
./demo.sh # or: uv run demo.py
```
---
## What the Demo Does
A **Database Research Agent** investigates whether to use PostgreSQL or MongoDB
for an e-commerce platform. It runs 5 steps, each building on prior findings via
accumulated message history. Steps alternate between `code_generation` and
`complex_reasoning` intents so Plano routes to different models without pinning.
| Step | Task | Intent |
|:----:|------|--------|
| 1 | List technical requirements | code_generation → claude-sonnet |
| 2 | Compare PostgreSQL vs MongoDB | complex_reasoning → gpt-4o |
| 3 | Write schema (CREATE TABLE) | code_generation → claude-sonnet |
| 4 | Assess scalability trade-offs | complex_reasoning → gpt-4o |
| 5 | Write final recommendation report | code_generation → claude-sonnet |
The demo runs the loop **twice** against `/v1/chat/completions` using the
[OpenAI SDK](https://github.com/openai/openai-python):
1. **Without pinning** — no `X-Routing-Session-Id`; models alternate per step
2. **With pinning** — `X-Routing-Session-Id` header included; model is pinned from step 1
Each step makes real LLM calls. Step 5's report explicitly references findings
from earlier steps, demonstrating why coherent context requires a consistent model.
### Expected Output
```
Run 1: WITHOUT Session Pinning
─────────────────────────────────────────────────────────────────────
step 1 [claude-sonnet-4-20250514] List requirements
"Critical requirements: 1. ACID transactions for order integrity…"
step 2 [gpt-4o ] Compare databases ← switched
"PostgreSQL excels at joins and ACID guarantees…"
step 3 [claude-sonnet-4-20250514] Write schema ← switched
"CREATE TABLE orders (\n id SERIAL PRIMARY KEY…"
step 4 [gpt-4o ] Assess scalability ← switched
"At high write volume, PostgreSQL row-level locking…"
step 5 [claude-sonnet-4-20250514] Write report ← switched
"RECOMMENDATION: PostgreSQL is the right choice…"
✗ Without pinning: model switched 4 time(s) — gpt-4o, claude-sonnet-4-20250514
Run 2: WITH Session Pinning (X-Routing-Session-Id: a1b2c3d4…)
─────────────────────────────────────────────────────────────────────
step 1 [claude-sonnet-4-20250514] List requirements
"Critical requirements: 1. ACID transactions for order integrity…"
step 2 [claude-sonnet-4-20250514] Compare databases
"Building on the requirements I just outlined: PostgreSQL…"
step 3 [claude-sonnet-4-20250514] Write schema
"Following the comparison above, here is the PostgreSQL schema…"
step 4 [claude-sonnet-4-20250514] Assess scalability
"Given the schema I designed, PostgreSQL's row-level locking…"
step 5 [claude-sonnet-4-20250514] Write report
"RECOMMENDATION: Based on my analysis of requirements, comparison…"
✓ With pinning: claude-sonnet-4-20250514 held for all 5 steps
══ Final Report (pinned session) ═════════════════════════════════════
RECOMMENDATION: Based on my analysis of requirements, the head-to-head
comparison, the schema I designed, and the scalability trade-offs…
══════════════════════════════════════════════════════════════════════
```
### How It Works
Session pinning is implemented in brightstaff. When `X-Routing-Session-Id` is present:
1. **First request** — routing runs normally, result is cached keyed by session ID
2. **Subsequent requests** — cache hit skips routing and returns the cached model instantly
The `X-Routing-Session-Id` header is forwarded transparently; no changes to your OpenAI
SDK calls beyond adding the header.
```python
import uuid
from openai import OpenAI
client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY")
session_id = str(uuid.uuid4())
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{"role": "user", "content": prompt}],
extra_headers={"X-Routing-Session-Id": session_id}, # pin the session
)
```
---
## Configuration
Session pinning is configurable in `config.yaml`:
```yaml
routing:
session_ttl_seconds: 600 # How long a pinned session lasts (default: 10 min)
session_max_entries: 10000 # Max cached sessions before LRU eviction
```
Without the `X-Routing-Session-Id` header, routing runs fresh every time — no breaking
change to existing clients.
---
## See Also
- [Model Routing Service Demo](../model_routing_service/) — curl-based examples of the routing endpoint

View file

@@ -1,174 +0,0 @@
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = ["httpx>=0.27"]
# ///
"""
Session Pinning Demo — Research Agent client
Sends the same query to the Research Agent twice — once without a session ID
and once with one — and compares the routing trace to show how session pinning
keeps the model consistent across the LLM's tool-calling loop.
Requires the agent to already be running (start it with ./start_agents.sh).
Usage:
uv run demo.py
AGENT_URL=http://localhost:8000 uv run demo.py
"""
import asyncio
import os
import uuid
import httpx
AGENT_URL = os.environ.get("AGENT_URL", "http://localhost:8000")
QUERY = (
"Should we use PostgreSQL or MongoDB for a high-traffic e-commerce backend "
"that needs strong consistency for orders but flexible schemas for products?"
)
# ---------------------------------------------------------------------------
# Client helpers
# ---------------------------------------------------------------------------
async def wait_for_agent(timeout: int = 30) -> bool:
async with httpx.AsyncClient() as client:
for _ in range(timeout * 2):
try:
r = await client.get(f"{AGENT_URL}/health", timeout=1.0)
if r.status_code == 200:
return True
except Exception:
pass
await asyncio.sleep(0.5)
return False
async def ask_agent(query: str, session_id: str | None = None) -> dict:
headers: dict[str, str] = {}
if session_id:
headers["X-Routing-Session-Id"] = session_id
async with httpx.AsyncClient(timeout=120.0) as client:
r = await client.post(
f"{AGENT_URL}/v1/chat/completions",
headers=headers,
json={"messages": [{"role": "user", "content": query}]},
)
r.raise_for_status()
return r.json()
# ---------------------------------------------------------------------------
# Display helpers
# ---------------------------------------------------------------------------
def _short(model: str) -> str:
return model.split("/")[-1] if "/" in model else model
def _print_trace(result: dict) -> None:
trace = result.get("routing_trace", [])
if not trace:
print(" (no trace)")
return
prev: str | None = None
for t in trace:
short = _short(t["model"])
switch = " ← switched" if (prev and t["model"] != prev) else ""
prev = t["model"]
print(f" {t['task']:<26} [{short}]{switch}")
def _print_summary(label: str, result: dict) -> None:
models = [t["model"] for t in result.get("routing_trace", [])]
if not models:
print(f" ? {label}: no routing data")
return
unique = set(models)
if len(unique) == 1:
print(f"{label}: {_short(next(iter(unique)))} for all {len(models)} turns")
else:
switched = sum(1 for a, b in zip(models, models[1:]) if a != b)
names = ", ".join(sorted(_short(m) for m in unique))
print(f"{label}: model switched {switched} time(s) — {names}")
# ---------------------------------------------------------------------------
# Demo
# ---------------------------------------------------------------------------
async def main() -> None:
print()
print(" ╔══════════════════════════════════════════════════════════════╗")
print(" ║ Session Pinning Demo — Research Agent ║")
print(" ╚══════════════════════════════════════════════════════════════╝")
print()
print(f" Agent : {AGENT_URL}")
print(f' Query : "{QUERY[:72]}"')
print()
print(" The agent uses a tool-calling loop (get_db_benchmarks,")
print(" get_case_studies, check_feature_support) to research the")
print(" question. Each LLM turn hits Plano's preference-based router.")
print()
print(f" Waiting for agent at {AGENT_URL}", end=" ", flush=True)
if not await wait_for_agent():
print("FAILED — agent did not respond within 30 s")
return
print("ready.")
print()
sid = str(uuid.uuid4())
print(" Sending queries (running concurrently)…")
print()
without, with_pin = await asyncio.gather(
ask_agent(QUERY, session_id=None),
ask_agent(QUERY, session_id=sid),
)
# ── Run 1 ────────────────────────────────────────────────────────────
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print(" Run 1: WITHOUT Session Pinning")
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print()
print(" LLM turns inside the agent loop:")
print()
_print_trace(without)
print()
_print_summary("Without pinning", without)
print()
# ── Run 2 ────────────────────────────────────────────────────────────
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print(f" Run 2: WITH Session Pinning (X-Routing-Session-Id: {sid[:8]}…)")
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print()
print(" LLM turns inside the agent loop:")
print()
_print_trace(with_pin)
print()
_print_summary("With pinning ", with_pin)
print()
# ── Final answer ─────────────────────────────────────────────────────
answer = with_pin["choices"][0]["message"]["content"]
print(" ══ Agent recommendation (pinned session) ═════════════════════")
print()
for line in answer.splitlines():
print(f" {line}")
print()
print(" ══════════════════════════════════════════════════════════════")
print()
if __name__ == "__main__":
asyncio.run(main())

View file

@@ -1,19 +0,0 @@
#!/bin/bash
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
export PLANO_URL="${PLANO_URL:-http://localhost:12000}"
export AGENT_PORT="${AGENT_PORT:-8000}"
export AGENT_URL="http://localhost:$AGENT_PORT"
cleanup() {
[ -n "$AGENT_PID" ] && kill "$AGENT_PID" 2>/dev/null
}
trap cleanup EXIT INT TERM
# Start the agent in the background
"$SCRIPT_DIR/start_agents.sh" &
AGENT_PID=$!
# Run the demo client
uv run "$SCRIPT_DIR/demo.py"