rename session pinning to model affinity with x-model-affinity header

This commit is contained in:
Adil Hafeez 2026-04-08 15:23:53 -07:00
parent 5789694d2f
commit da9792c2dd
14 changed files with 468 additions and 371 deletions

View file

@@ -1,8 +1,6 @@
 use bytes::Bytes;
 use common::configuration::{FilterPipeline, ModelAlias};
-use common::consts::{
-    ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER, ROUTING_SESSION_ID_HEADER,
-};
+use common::consts::{ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER, MODEL_AFFINITY_HEADER};
 use common::llm_providers::LlmProviders;
 use hermesllm::apis::openai::Message;
 use hermesllm::apis::openai_responses::InputParam;
@@ -98,7 +96,7 @@ async fn llm_chat_inner(
     // Session pinning: extract session ID and check cache before routing
     let session_id: Option<String> = request_headers
-        .get(ROUTING_SESSION_ID_HEADER)
+        .get(MODEL_AFFINITY_HEADER)
         .and_then(|h| h.to_str().ok())
         .map(|s| s.to_string());
     let pinned_model: Option<String> = if let Some(ref sid) = session_id {

View file

@@ -1,6 +1,6 @@
 use bytes::Bytes;
 use common::configuration::{SpanAttributes, TopLevelRoutingPreference};
-use common::consts::{REQUEST_ID_HEADER, ROUTING_SESSION_ID_HEADER};
+use common::consts::{MODEL_AFFINITY_HEADER, REQUEST_ID_HEADER};
 use common::errors::BrightStaffError;
 use hermesllm::clients::SupportedAPIsFromClient;
 use hermesllm::ProviderRequestType;
@@ -72,7 +72,7 @@ pub async fn routing_decision(
         .unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
     let session_id: Option<String> = request_headers
-        .get(ROUTING_SESSION_ID_HEADER)
+        .get(MODEL_AFFINITY_HEADER)
         .and_then(|h| h.to_str().ok())
         .map(|s| s.to_string());

View file

@@ -22,7 +22,7 @@ pub const X_ARCH_TOOL_CALL: &str = "x-arch-tool-call-message";
 pub const X_ARCH_FC_MODEL_RESPONSE: &str = "x-arch-fc-model-response";
 pub const ARCH_FC_MODEL_NAME: &str = "Arch-Function";
 pub const REQUEST_ID_HEADER: &str = "x-request-id";
-pub const ROUTING_SESSION_ID_HEADER: &str = "x-routing-session-id";
+pub const MODEL_AFFINITY_HEADER: &str = "x-model-affinity";
 pub const ENVOY_ORIGINAL_PATH_HEADER: &str = "x-envoy-original-path";
 pub const TRACE_PARENT_HEADER: &str = "traceparent";
 pub const ARCH_INTERNAL_CLUSTER_NAME: &str = "arch_internal";

View file

@@ -0,0 +1,135 @@
# Model Affinity Demo
> Consistent model selection for agentic loops using `X-Model-Affinity`.
## Why Model Affinity?
When an agent runs in a loop — calling tools, reasoning about results, calling more tools — each LLM request hits Plano's router independently. Because prompts vary in intent (tool selection looks like code generation, reasoning about results looks like complex analysis), the router may select **different models** for each turn, fragmenting context mid-session.
**Model affinity** solves this: send an `X-Model-Affinity` header and the first request runs routing as usual, caching the decision. Every subsequent request with the same affinity ID returns the **same model**, without re-running the router.
```
Without affinity With affinity (X-Model-Affinity)
──────────────── ───────────────────────────────
Turn 1 → claude-sonnet (tool calls) Turn 1 → claude-sonnet ← routed
Turn 2 → gpt-4o (reasoning) Turn 2 → claude-sonnet ← pinned ✓
Turn 3 → claude-sonnet (tool calls) Turn 3 → claude-sonnet ← pinned ✓
Turn 4 → gpt-4o (reasoning) Turn 4 → claude-sonnet ← pinned ✓
Turn 5 → claude-sonnet (final answer) Turn 5 → claude-sonnet ← pinned ✓
↑ model switches every turn ↑ one model, start to finish
```
---
## Quick Start
```bash
# 1. Set API keys
export OPENAI_API_KEY=<your-key>
export ANTHROPIC_API_KEY=<your-key>
# 2. Start Plano
cd demos/llm_routing/model_affinity
planoai up config.yaml
# 3. Run the demo (uv manages dependencies automatically)
./demo.sh # or: uv run demo.py
```
---
## What the Demo Does
A **database selection agent** investigates whether to use PostgreSQL or MongoDB
for an e-commerce platform. It runs a real tool-calling loop: the LLM decides
which tools to call, receives simulated results, and continues until it has
enough data to recommend a database.
Available tools:
- `get_db_benchmarks` — fetch performance data for a workload type
- `get_case_studies` — retrieve real-world e-commerce case studies
- `check_feature_support` — check if a database supports a specific feature
The demo runs the **same agent loop twice**:
1. **Without affinity** — no `X-Model-Affinity`; models may switch between turns
2. **With affinity** — `X-Model-Affinity` header included; model is pinned from turn 1
Each turn is a separate `POST /v1/chat/completions` request to Plano using the
[OpenAI SDK](https://github.com/openai/openai-python). The demo prints the
model used on each turn so you can see the difference.
### Expected Output
```
Run 1: WITHOUT Model Affinity
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
turn 1 [claude-sonnet-4-20250514 ] get_db_benchmarks, get_db_benchmarks
turn 2 [gpt-4o ] get_case_studies, get_case_studies ← switched
turn 3 [claude-sonnet-4-20250514 ] check_feature_support ← switched
turn 4 [gpt-4o ] final answer ← switched
✗ Without affinity: model switched 3 time(s)
Run 2: WITH Model Affinity (X-Model-Affinity: a1b2c3d4…)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
turn 1 [claude-sonnet-4-20250514 ] get_db_benchmarks, get_db_benchmarks
turn 2 [claude-sonnet-4-20250514 ] get_case_studies, get_case_studies
turn 3 [claude-sonnet-4-20250514 ] check_feature_support
turn 4 [claude-sonnet-4-20250514 ] final answer
✓ With affinity: claude-sonnet-4-20250514 for all 4 turns
```
### How It Works
Model affinity is implemented in brightstaff. When `X-Model-Affinity` is present:
1. **First request** — routing runs normally, result is cached keyed by the affinity ID
2. **Subsequent requests** — cache hit skips routing and returns the cached model instantly
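Conceptually, the cache is a small TTL-bounded map from affinity ID to model name. Here is a minimal Python sketch of the idea (the real logic lives in brightstaff's Rust code, so the class and method names below are illustrative, not the actual API):
```python
import time

class AffinityCache:
    """Illustrative TTL cache mapping affinity ID -> pinned model name."""

    def __init__(self, ttl_seconds: float = 600, max_entries: int = 10_000) -> None:
        self.ttl = ttl_seconds
        self.max_entries = max_entries
        self._entries: dict[str, tuple[str, float]] = {}  # id -> (model, expires_at)

    def get(self, affinity_id: str) -> str | None:
        entry = self._entries.get(affinity_id)
        if entry is None:
            return None
        model, expires_at = entry
        if time.monotonic() > expires_at:
            # TTL expired: the next request with this ID re-runs routing
            del self._entries[affinity_id]
            return None
        return model

    def put(self, affinity_id: str, model: str) -> None:
        if len(self._entries) >= self.max_entries:
            # Evict the oldest entry once the cache is full
            self._entries.pop(next(iter(self._entries)))
        self._entries[affinity_id] = (model, time.monotonic() + self.ttl)
```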
The `X-Model-Affinity` header is forwarded transparently; no changes are needed to your
OpenAI SDK calls beyond adding the header.
```python
from openai import OpenAI
import uuid
client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY")
affinity_id = str(uuid.uuid4())
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{"role": "user", "content": prompt}],
extra_headers={"X-Model-Affinity": affinity_id},
)
```
---
## Configuration
Model affinity is configurable in `config.yaml`:
```yaml
routing:
  session_ttl_seconds: 600     # How long affinity lasts (default: 10 min)
  session_max_entries: 10000   # Max cached sessions before LRU eviction (default: 10000)
```
Without the `X-Model-Affinity` header, routing runs fresh every time — no breaking
change to existing clients.
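In practice, scope the affinity ID to a conversation: reuse one ID for every turn of that conversation, and mint a new ID for each new conversation so it gets its own fresh routing decision. A sketch (same client setup as above):
```python
import uuid
from openai import OpenAI

client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY")

# One affinity ID per conversation: turn 1 routes, later turns reuse the model.
headers = {"X-Model-Affinity": str(uuid.uuid4())}

for prompt in ["Compare PostgreSQL and MongoDB", "Now write the orders schema"]:
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        extra_headers=headers,  # same ID on every turn of this conversation
    )
    print(resp.model)  # stays constant while the affinity entry is live

# A new conversation gets a new ID, and therefore a fresh routing decision.
```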
---
## Advanced: Agent Server Demo
The `agent.py` file is a FastAPI-based agent server that demonstrates a more
complex pattern: an external agent service that forwards `X-Model-Affinity`
on all outbound calls to Plano. Use `start_agents.sh` to run it.
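The essential move is plain header forwarding: read `X-Model-Affinity` off the inbound request and attach it to every outbound Plano call. A trimmed sketch of that pattern (see `agent.py` for the full tool-calling version):
```python
from fastapi import FastAPI, Request
from openai import AsyncOpenAI

app = FastAPI()
plano = AsyncOpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY")

@app.post("/v1/chat/completions")
async def chat(request: Request) -> dict:
    body = await request.json()
    # Forward the caller's affinity ID so every turn of the agent's
    # internal loop lands on the same pinned model.
    affinity_id = request.headers.get("x-model-affinity")
    headers = {"X-Model-Affinity": affinity_id} if affinity_id else {}
    resp = await plano.chat.completions.create(
        model="gpt-4o-mini",
        messages=body["messages"],
        extra_headers=headers,
    )
    return resp.model_dump()
```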
## See Also
- [Model Routing Service Demo](../model_routing_service/) — curl-based examples of the routing endpoint

View file

@@ -11,10 +11,9 @@ each with its own tool-calling loop. The tasks deliberately alternate between
 code_generation and complex_reasoning intents so Plano's preference-based
 router selects different models for each task.
-If the client sends X-Routing-Session-Id, the agent forwards it on every
-outbound call to Plano. The first task pins the model; all subsequent tasks
-skip the router and reuse it, keeping the whole session on one consistent
-model.
+If the client sends X-Model-Affinity, the agent forwards it on every outbound
+call to Plano. The first task pins the model; all subsequent tasks skip the
+router and reuse it, keeping the whole session on one consistent model.
 Run standalone:
     uv run agent.py
@@ -310,12 +309,12 @@ async def run_task(
     Each task is an independent conversation so the router sees only
     this task's intent — not the accumulated context of previous tasks.
-    Session pinning via X-Routing-Session-Id pins the model from the first
-    task onward, so all tasks stay on the same model.
+    Model affinity via X-Model-Affinity pins the model from the first task
+    onward, so all tasks stay on the same model.
     Returns (answer, first_model_used).
     """
-    headers = {"X-Routing-Session-Id": session_id} if session_id else {}
+    headers = {"X-Model-Affinity": session_id} if session_id else {}
     messages: list[ChatCompletionMessageParam] = [
         {"role": "system", "content": SYSTEM_PROMPT},
         {"role": "user", "content": prompt},
@@ -392,7 +391,7 @@ app = FastAPI(title="Research Agent", version="1.0.0")
 @app.post("/v1/chat/completions")
 async def chat(request: Request) -> JSONResponse:
     body = await request.json()
-    session_id: str | None = request.headers.get("x-routing-session-id")
+    session_id: str | None = request.headers.get("x-model-affinity")
     log.info("request session_id=%s", session_id or "none")

View file

@@ -0,0 +1,307 @@
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = ["openai>=1.0.0"]
# ///
"""
Model Affinity Demo — Agentic Tool-Calling Loop
Runs the same agentic loop twice through Plano:
1. Without model affinity — the router may pick different models per turn
2. With model affinity — all turns use the model selected on turn 1
Each loop is a real tool-calling agent: the LLM decides which tools to call,
we provide simulated results, and the LLM continues until it has enough
information to produce a final answer. Each turn is a separate request to
Plano, so the router classifies intent independently every time.
Usage:
planoai up config.yaml # start Plano
uv run demo.py # run this demo
"""
import asyncio
import json
import os
import uuid
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionMessageParam
PLANO_URL = os.environ.get("PLANO_URL", "http://localhost:12000")
SYSTEM_PROMPT = (
"You are a database selection analyst. Use the provided tools to gather "
"benchmark data and case studies, then recommend PostgreSQL or MongoDB "
"for a high-traffic e-commerce backend. Be concise."
)
USER_QUERY = (
"Should we use PostgreSQL or MongoDB for our e-commerce platform? "
"We need strong consistency for orders but flexible schemas for products. "
"Use the tools to research both options, then give a recommendation."
)
TOOLS = [
{
"type": "function",
"function": {
"name": "get_db_benchmarks",
"description": "Fetch performance benchmarks for a database under a given workload.",
"parameters": {
"type": "object",
"properties": {
"database": {
"type": "string",
"enum": ["postgresql", "mongodb"],
},
"workload": {
"type": "string",
"enum": ["read_heavy", "write_heavy", "mixed"],
},
},
"required": ["database", "workload"],
},
},
},
{
"type": "function",
"function": {
"name": "get_case_studies",
"description": "Retrieve real-world e-commerce case studies for a database.",
"parameters": {
"type": "object",
"properties": {
"database": {
"type": "string",
"enum": ["postgresql", "mongodb"],
},
},
"required": ["database"],
},
},
},
{
"type": "function",
"function": {
"name": "check_feature_support",
"description": "Check if a database supports a specific feature.",
"parameters": {
"type": "object",
"properties": {
"database": {
"type": "string",
"enum": ["postgresql", "mongodb"],
},
"feature": {"type": "string"},
},
"required": ["database", "feature"],
},
},
},
]
# Simulated tool responses
_BENCHMARKS = {
("postgresql", "mixed"): {
"read_qps": 42000,
"write_qps": 21000,
"p99_ms": 6,
"notes": "Solid all-round; MVCC keeps reads non-blocking",
},
("mongodb", "mixed"): {
"read_qps": 60000,
"write_qps": 50000,
"p99_ms": 3,
"notes": "Flexible schema accelerates feature iteration",
},
}
_CASE_STUDIES = {
"postgresql": [
{"company": "Shopify", "notes": "Moved orders back to Postgres for ACID"},
{
"company": "Zalando",
"notes": "Postgres + Citus for sharded order processing",
},
],
"mongodb": [
{"company": "eBay", "notes": "Product catalogue — flexible attribute schemas"},
{"company": "Alibaba", "notes": "Session/cart data — high write throughput"},
],
}
_FEATURES = {
("postgresql", "acid transactions"): {"supported": True, "notes": "Full ACID"},
("mongodb", "acid transactions"): {
"supported": True,
"notes": "Multi-doc ACID since v4.0",
},
("postgresql", "horizontal sharding"): {
"supported": True,
"notes": "Via Citus extension",
},
("mongodb", "horizontal sharding"): {
"supported": True,
"notes": "Native auto-balancing",
},
}
def dispatch_tool(name: str, args: dict) -> str:
if name == "get_db_benchmarks":
key = (args["database"], args["workload"])
return json.dumps(_BENCHMARKS.get(key, {"error": f"no data for {key}"}))
if name == "get_case_studies":
return json.dumps(_CASE_STUDIES.get(args["database"], {"error": "unknown db"}))
if name == "check_feature_support":
key = (args["database"], args["feature"].lower())
for k, v in _FEATURES.items():
if k[0] == key[0] and k[1] in key[1]:
return json.dumps(v)
return json.dumps({"error": f"no data for {key}"})
return json.dumps({"error": f"unknown tool {name}"})
# ---------------------------------------------------------------------------
# Agentic loop — runs tool calls until the LLM produces a final answer
# ---------------------------------------------------------------------------
async def run_agent_loop(
affinity_id: str | None = None,
max_turns: int = 10,
) -> tuple[str, list[dict]]:
"""
Run a tool-calling agent loop against Plano.
Returns (final_answer, trace) where trace is a list of
{"turn": int, "model": str, "tool_calls": [...]} dicts.
"""
client = AsyncOpenAI(base_url=f"{PLANO_URL}/v1", api_key="EMPTY")
headers = {"X-Model-Affinity": affinity_id} if affinity_id else None
messages: list[ChatCompletionMessageParam] = [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": USER_QUERY},
]
trace: list[dict] = []
for turn in range(1, max_turns + 1):
resp = await client.chat.completions.create(
model="gpt-4o-mini",
messages=messages,
tools=TOOLS,
tool_choice="auto",
max_completion_tokens=800,
extra_headers=headers,
)
choice = resp.choices[0]
turn_info: dict = {"turn": turn, "model": resp.model}
if choice.finish_reason == "tool_calls" and choice.message.tool_calls:
tool_names = [tc.function.name for tc in choice.message.tool_calls]
turn_info["tool_calls"] = tool_names
trace.append(turn_info)
messages.append(choice.message)
for tc in choice.message.tool_calls:
args = json.loads(tc.function.arguments or "{}")
result = dispatch_tool(tc.function.name, args)
messages.append(
{"role": "tool", "content": result, "tool_call_id": tc.id}
)
else:
turn_info["tool_calls"] = []
trace.append(turn_info)
return (choice.message.content or "").strip(), trace
return "(max turns reached)", trace
# ---------------------------------------------------------------------------
# Display helpers
# ---------------------------------------------------------------------------
def short_model(model: str) -> str:
return model.split("/")[-1] if "/" in model else model
def print_trace(trace: list[dict]) -> None:
for t in trace:
model = short_model(t["model"])
tools = ", ".join(t["tool_calls"]) if t["tool_calls"] else "final answer"
print(f" turn {t['turn']} [{model:<30}] {tools}")
def print_summary(label: str, trace: list[dict]) -> None:
models = [t["model"] for t in trace]
unique = set(models)
if len(unique) == 1:
print(
f"{label}: {short_model(next(iter(unique)))} "
f"for all {len(models)} turns"
)
else:
switches = sum(1 for a, b in zip(models, models[1:]) if a != b)
names = ", ".join(sorted(short_model(m) for m in unique))
print(f"{label}: model switched {switches} time(s) — {names}")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
async def main() -> None:
print()
print(" ╔══════════════════════════════════════════════════════════╗")
print(" ║ Model Affinity Demo — Agentic Loop ║")
print(" ╚══════════════════════════════════════════════════════════╝")
print()
print(f" Plano : {PLANO_URL}")
print(f' Query : "{USER_QUERY[:65]}"')
print()
print(" The agent calls tools (get_db_benchmarks, get_case_studies,")
print(" check_feature_support) across multiple turns. Each turn is")
print(" a separate request to Plano — the router classifies intent")
print(" independently, so different turns may get different models.")
print()
# --- Run 1: without affinity ---
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print(" Run 1: WITHOUT Model Affinity")
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print()
answer1, trace1 = await run_agent_loop(affinity_id=None)
print_trace(trace1)
print()
print_summary("Without affinity", trace1)
print()
# --- Run 2: with affinity ---
aid = str(uuid.uuid4())
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print(f" Run 2: WITH Model Affinity (X-Model-Affinity: {aid[:8]}…)")
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print()
answer2, trace2 = await run_agent_loop(affinity_id=aid)
print_trace(trace2)
print()
print_summary("With affinity ", trace2)
print()
# --- Final answer ---
print(" ══ Agent recommendation (affinity session) ════════════════")
print()
for line in answer2.splitlines():
print(f" {line}")
print()
print(" ═══════════════════════════════════════════════════════════")
print()
if __name__ == "__main__":
asyncio.run(main())

View file

@@ -0,0 +1,7 @@
#!/bin/bash
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Run the demo directly against Plano (no agent server needed)
uv run "$SCRIPT_DIR/demo.py"

View file

@@ -108,13 +108,13 @@ The response contains the model list — your client should try `models[0]` first
 ## Session Pinning
-Send an `X-Routing-Session-Id` header to pin the routing decision for a session. Once a model is selected, all subsequent requests with the same session ID return the same model without re-running routing.
+Send an `X-Model-Affinity` header to pin the routing decision for a session. Once a model is selected, all subsequent requests with the same session ID return the same model without re-running routing.
 ```bash
 # First call — runs routing, caches result
 curl http://localhost:12000/routing/v1/chat/completions \
   -H "Content-Type: application/json" \
-  -H "X-Routing-Session-Id: my-session-123" \
+  -H "X-Model-Affinity: my-session-123" \
   -d '{
     "model": "gpt-4o-mini",
     "messages": [{"role": "user", "content": "Write a Python function for binary search"}]
@@ -136,7 +136,7 @@ Response (first call):
 # Second call — same session, returns cached result
 curl http://localhost:12000/routing/v1/chat/completions \
   -H "Content-Type: application/json" \
-  -H "X-Routing-Session-Id: my-session-123" \
+  -H "X-Model-Affinity: my-session-123" \
   -d '{
     "model": "gpt-4o-mini",
     "messages": [{"role": "user", "content": "Now explain merge sort"}]
@@ -161,7 +161,7 @@ routing:
   session_max_entries: 10000  # default: 10000
 ```
-Without the `X-Routing-Session-Id` header, routing runs fresh every time (no breaking change).
+Without the `X-Model-Affinity` header, routing runs fresh every time (no breaking change).
 ## Kubernetes Deployment (Self-hosted Arch-Router on GPU)

View file

@@ -114,7 +114,7 @@ echo "--- 7. Session pinning - first call (fresh routing decision) ---"
 echo ""
 curl -s "$PLANO_URL/routing/v1/chat/completions" \
   -H "Content-Type: application/json" \
-  -H "X-Routing-Session-Id: demo-session-001" \
+  -H "X-Model-Affinity: demo-session-001" \
   -d '{
     "model": "gpt-4o-mini",
     "messages": [
@@ -129,7 +129,7 @@ echo " Notice: same model returned with \"pinned\": true, routing was skipped
 echo ""
 curl -s "$PLANO_URL/routing/v1/chat/completions" \
   -H "Content-Type: application/json" \
-  -H "X-Routing-Session-Id: demo-session-001" \
+  -H "X-Model-Affinity: demo-session-001" \
   -d '{
     "model": "gpt-4o-mini",
     "messages": [
@@ -143,7 +143,7 @@ echo "--- 9. Different session gets its own fresh routing ---"
 echo ""
 curl -s "$PLANO_URL/routing/v1/chat/completions" \
   -H "Content-Type: application/json" \
-  -H "X-Routing-Session-Id: demo-session-002" \
+  -H "X-Model-Affinity: demo-session-002" \
   -d '{
     "model": "gpt-4o-mini",
     "messages": [

View file

@@ -1,156 +0,0 @@
# Session Pinning Demo
> Consistent model selection for agentic loops using `X-Routing-Session-Id`.
## Why Session Pinning?
When an agent runs in a loop — research → analyse → implement → evaluate → summarise — each step hits Plano's router independently. Because prompts vary in intent, the router may select **different models** for each step, fragmenting context mid-session.
**Session pinning** solves this: send an `X-Routing-Session-Id` header and the first request runs routing as usual, caching the decision. Every subsequent request with the same session ID returns the **same model**, without re-running the router.
```
Without pinning With pinning (X-Routing-Session-Id)
───────────────── ──────────────────────────
Step 1 → claude-sonnet (code_gen) Step 1 → claude-sonnet ← routed
Step 2 → gpt-4o (reasoning) Step 2 → claude-sonnet ← pinned ✓
Step 3 → claude-sonnet (code_gen) Step 3 → claude-sonnet ← pinned ✓
Step 4 → gpt-4o (reasoning) Step 4 → claude-sonnet ← pinned ✓
Step 5 → claude-sonnet (code_gen) Step 5 → claude-sonnet ← pinned ✓
↑ model switches every step ↑ one model, start to finish
```
---
## Quick Start
```bash
# 1. Set API keys
export OPENAI_API_KEY=<your-key>
export ANTHROPIC_API_KEY=<your-key>
# 2. Start Plano
cd demos/llm_routing/session_pinning
planoai up config.yaml
# 3. Run the demo (uv manages dependencies automatically)
./demo.sh # or: uv run demo.py
```
---
## What the Demo Does
A **Database Research Agent** investigates whether to use PostgreSQL or MongoDB
for an e-commerce platform. It runs 5 steps, each building on prior findings via
accumulated message history. Steps alternate between `code_generation` and
`complex_reasoning` intents so Plano routes to different models without pinning.
| Step | Task | Intent |
|:----:|------|--------|
| 1 | List technical requirements | code_generation → claude-sonnet |
| 2 | Compare PostgreSQL vs MongoDB | complex_reasoning → gpt-4o |
| 3 | Write schema (CREATE TABLE) | code_generation → claude-sonnet |
| 4 | Assess scalability trade-offs | complex_reasoning → gpt-4o |
| 5 | Write final recommendation report | code_generation → claude-sonnet |
The demo runs the loop **twice** against `/v1/chat/completions` using the
[OpenAI SDK](https://github.com/openai/openai-python):
1. **Without pinning** — no `X-Routing-Session-Id`; models alternate per step
2. **With pinning** — `X-Routing-Session-Id` header included; model is pinned from step 1
Each step makes real LLM calls. Step 5's report explicitly references findings
from earlier steps, demonstrating why coherent context requires a consistent model.
### Expected Output
```
Run 1: WITHOUT Session Pinning
─────────────────────────────────────────────────────────────────────
step 1 [claude-sonnet-4-20250514] List requirements
"Critical requirements: 1. ACID transactions for order integrity…"
step 2 [gpt-4o ] Compare databases ← switched
"PostgreSQL excels at joins and ACID guarantees…"
step 3 [claude-sonnet-4-20250514] Write schema ← switched
"CREATE TABLE orders (\n id SERIAL PRIMARY KEY…"
step 4 [gpt-4o ] Assess scalability ← switched
"At high write volume, PostgreSQL row-level locking…"
step 5 [claude-sonnet-4-20250514] Write report ← switched
"RECOMMENDATION: PostgreSQL is the right choice…"
✗ Without pinning: model switched 4 time(s) — gpt-4o, claude-sonnet-4-20250514
Run 2: WITH Session Pinning (X-Routing-Session-Id: a1b2c3d4…)
─────────────────────────────────────────────────────────────────────
step 1 [claude-sonnet-4-20250514] List requirements
"Critical requirements: 1. ACID transactions for order integrity…"
step 2 [claude-sonnet-4-20250514] Compare databases
"Building on the requirements I just outlined: PostgreSQL…"
step 3 [claude-sonnet-4-20250514] Write schema
"Following the comparison above, here is the PostgreSQL schema…"
step 4 [claude-sonnet-4-20250514] Assess scalability
"Given the schema I designed, PostgreSQL's row-level locking…"
step 5 [claude-sonnet-4-20250514] Write report
"RECOMMENDATION: Based on my analysis of requirements, comparison…"
✓ With pinning: claude-sonnet-4-20250514 held for all 5 steps
══ Final Report (pinned session) ═════════════════════════════════════
RECOMMENDATION: Based on my analysis of requirements, the head-to-head
comparison, the schema I designed, and the scalability trade-offs…
══════════════════════════════════════════════════════════════════════
```
### How It Works
Session pinning is implemented in brightstaff. When `X-Routing-Session-Id` is present:
1. **First request** — routing runs normally, result is cached keyed by session ID
2. **Subsequent requests** — cache hit skips routing and returns the cached model instantly
The `X-Routing-Session-Id` header is forwarded transparently; no changes to your OpenAI
SDK calls beyond adding the header.
```python
import uuid
from openai import OpenAI
client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY")
session_id = str(uuid.uuid4())
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{"role": "user", "content": prompt}],
extra_headers={"X-Routing-Session-Id": session_id}, # pin the session
)
```
---
## Configuration
Session pinning is configurable in `config.yaml`:
```yaml
routing:
session_ttl_seconds: 600 # How long a pinned session lasts (default: 10 min)
session_max_entries: 10000 # Max cached sessions before LRU eviction
```
Without the `X-Routing-Session-Id` header, routing runs fresh every time — no breaking
change to existing clients.
---
## See Also
- [Model Routing Service Demo](../model_routing_service/) — curl-based examples of the routing endpoint

View file

@@ -1,174 +0,0 @@
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = ["httpx>=0.27"]
# ///
"""
Session Pinning Demo — Research Agent client
Sends the same query to the Research Agent twice — once without a session ID
and once with one — and compares the routing trace to show how session pinning
keeps the model consistent across the LLM's tool-calling loop.
Requires the agent to already be running (start it with ./start_agents.sh).
Usage:
uv run demo.py
AGENT_URL=http://localhost:8000 uv run demo.py
"""
import asyncio
import os
import uuid
import httpx
AGENT_URL = os.environ.get("AGENT_URL", "http://localhost:8000")
QUERY = (
"Should we use PostgreSQL or MongoDB for a high-traffic e-commerce backend "
"that needs strong consistency for orders but flexible schemas for products?"
)
# ---------------------------------------------------------------------------
# Client helpers
# ---------------------------------------------------------------------------
async def wait_for_agent(timeout: int = 30) -> bool:
async with httpx.AsyncClient() as client:
for _ in range(timeout * 2):
try:
r = await client.get(f"{AGENT_URL}/health", timeout=1.0)
if r.status_code == 200:
return True
except Exception:
pass
await asyncio.sleep(0.5)
return False
async def ask_agent(query: str, session_id: str | None = None) -> dict:
headers: dict[str, str] = {}
if session_id:
headers["X-Routing-Session-Id"] = session_id
async with httpx.AsyncClient(timeout=120.0) as client:
r = await client.post(
f"{AGENT_URL}/v1/chat/completions",
headers=headers,
json={"messages": [{"role": "user", "content": query}]},
)
r.raise_for_status()
return r.json()
# ---------------------------------------------------------------------------
# Display helpers
# ---------------------------------------------------------------------------
def _short(model: str) -> str:
return model.split("/")[-1] if "/" in model else model
def _print_trace(result: dict) -> None:
trace = result.get("routing_trace", [])
if not trace:
print(" (no trace)")
return
prev: str | None = None
for t in trace:
short = _short(t["model"])
switch = " ← switched" if (prev and t["model"] != prev) else ""
prev = t["model"]
print(f" {t['task']:<26} [{short}]{switch}")
def _print_summary(label: str, result: dict) -> None:
models = [t["model"] for t in result.get("routing_trace", [])]
if not models:
print(f" ? {label}: no routing data")
return
unique = set(models)
if len(unique) == 1:
print(f"{label}: {_short(next(iter(unique)))} for all {len(models)} turns")
else:
switched = sum(1 for a, b in zip(models, models[1:]) if a != b)
names = ", ".join(sorted(_short(m) for m in unique))
print(f"{label}: model switched {switched} time(s) — {names}")
# ---------------------------------------------------------------------------
# Demo
# ---------------------------------------------------------------------------
async def main() -> None:
print()
print(" ╔══════════════════════════════════════════════════════════════╗")
print(" ║ Session Pinning Demo — Research Agent ║")
print(" ╚══════════════════════════════════════════════════════════════╝")
print()
print(f" Agent : {AGENT_URL}")
print(f' Query : "{QUERY[:72]}"')
print()
print(" The agent uses a tool-calling loop (get_db_benchmarks,")
print(" get_case_studies, check_feature_support) to research the")
print(" question. Each LLM turn hits Plano's preference-based router.")
print()
print(f" Waiting for agent at {AGENT_URL}", end=" ", flush=True)
if not await wait_for_agent():
print("FAILED — agent did not respond within 30 s")
return
print("ready.")
print()
sid = str(uuid.uuid4())
print(" Sending queries (running concurrently)…")
print()
without, with_pin = await asyncio.gather(
ask_agent(QUERY, session_id=None),
ask_agent(QUERY, session_id=sid),
)
# ── Run 1 ────────────────────────────────────────────────────────────
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print(" Run 1: WITHOUT Session Pinning")
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print()
print(" LLM turns inside the agent loop:")
print()
_print_trace(without)
print()
_print_summary("Without pinning", without)
print()
# ── Run 2 ────────────────────────────────────────────────────────────
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print(f" Run 2: WITH Session Pinning (X-Routing-Session-Id: {sid[:8]}…)")
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print()
print(" LLM turns inside the agent loop:")
print()
_print_trace(with_pin)
print()
_print_summary("With pinning ", with_pin)
print()
# ── Final answer ─────────────────────────────────────────────────────
answer = with_pin["choices"][0]["message"]["content"]
print(" ══ Agent recommendation (pinned session) ═════════════════════")
print()
for line in answer.splitlines():
print(f" {line}")
print()
print(" ══════════════════════════════════════════════════════════════")
print()
if __name__ == "__main__":
asyncio.run(main())

View file

@@ -1,19 +0,0 @@
#!/bin/bash
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
export PLANO_URL="${PLANO_URL:-http://localhost:12000}"
export AGENT_PORT="${AGENT_PORT:-8000}"
export AGENT_URL="http://localhost:$AGENT_PORT"
cleanup() {
[ -n "$AGENT_PID" ] && kill "$AGENT_PID" 2>/dev/null
}
trap cleanup EXIT INT TERM
# Start the agent in the background
"$SCRIPT_DIR/start_agents.sh" &
AGENT_PID=$!
# Run the demo client
uv run "$SCRIPT_DIR/demo.py"