diff --git a/crates/brightstaff/src/handlers/llm/mod.rs b/crates/brightstaff/src/handlers/llm/mod.rs index ac460420..80455cfb 100644 --- a/crates/brightstaff/src/handlers/llm/mod.rs +++ b/crates/brightstaff/src/handlers/llm/mod.rs @@ -1,8 +1,6 @@ use bytes::Bytes; use common::configuration::{FilterPipeline, ModelAlias}; -use common::consts::{ - ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER, ROUTING_SESSION_ID_HEADER, -}; +use common::consts::{ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER, MODEL_AFFINITY_HEADER}; use common::llm_providers::LlmProviders; use hermesllm::apis::openai::Message; use hermesllm::apis::openai_responses::InputParam; @@ -98,7 +96,7 @@ async fn llm_chat_inner( // Session pinning: extract session ID and check cache before routing let session_id: Option = request_headers - .get(ROUTING_SESSION_ID_HEADER) + .get(MODEL_AFFINITY_HEADER) .and_then(|h| h.to_str().ok()) .map(|s| s.to_string()); let pinned_model: Option = if let Some(ref sid) = session_id { diff --git a/crates/brightstaff/src/handlers/routing_service.rs b/crates/brightstaff/src/handlers/routing_service.rs index 5c498519..d09afe21 100644 --- a/crates/brightstaff/src/handlers/routing_service.rs +++ b/crates/brightstaff/src/handlers/routing_service.rs @@ -1,6 +1,6 @@ use bytes::Bytes; use common::configuration::{SpanAttributes, TopLevelRoutingPreference}; -use common::consts::{REQUEST_ID_HEADER, ROUTING_SESSION_ID_HEADER}; +use common::consts::{MODEL_AFFINITY_HEADER, REQUEST_ID_HEADER}; use common::errors::BrightStaffError; use hermesllm::clients::SupportedAPIsFromClient; use hermesllm::ProviderRequestType; @@ -72,7 +72,7 @@ pub async fn routing_decision( .unwrap_or_else(|| uuid::Uuid::new_v4().to_string()); let session_id: Option = request_headers - .get(ROUTING_SESSION_ID_HEADER) + .get(MODEL_AFFINITY_HEADER) .and_then(|h| h.to_str().ok()) .map(|s| s.to_string()); diff --git a/crates/common/src/consts.rs b/crates/common/src/consts.rs index 03074c4a..c99639ad 100644 --- a/crates/common/src/consts.rs +++ b/crates/common/src/consts.rs @@ -22,7 +22,7 @@ pub const X_ARCH_TOOL_CALL: &str = "x-arch-tool-call-message"; pub const X_ARCH_FC_MODEL_RESPONSE: &str = "x-arch-fc-model-response"; pub const ARCH_FC_MODEL_NAME: &str = "Arch-Function"; pub const REQUEST_ID_HEADER: &str = "x-request-id"; -pub const ROUTING_SESSION_ID_HEADER: &str = "x-routing-session-id"; +pub const MODEL_AFFINITY_HEADER: &str = "x-model-affinity"; pub const ENVOY_ORIGINAL_PATH_HEADER: &str = "x-envoy-original-path"; pub const TRACE_PARENT_HEADER: &str = "traceparent"; pub const ARCH_INTERNAL_CLUSTER_NAME: &str = "arch_internal"; diff --git a/demos/llm_routing/model_affinity/README.md b/demos/llm_routing/model_affinity/README.md new file mode 100644 index 00000000..1a1524e9 --- /dev/null +++ b/demos/llm_routing/model_affinity/README.md @@ -0,0 +1,135 @@ +# Model Affinity Demo + +> Consistent model selection for agentic loops using `X-Model-Affinity`. + +## Why Model Affinity? + +When an agent runs in a loop — calling tools, reasoning about results, calling more tools — each LLM request hits Plano's router independently. Because prompts vary in intent (tool selection looks like code generation, reasoning about results looks like complex analysis), the router may select **different models** for each turn, fragmenting context mid-session. + +**Model affinity** solves this: send an `X-Model-Affinity` header and the first request runs routing as usual, caching the decision. Every subsequent request with the same affinity ID returns the **same model**, without re-running the router. + +``` +Without affinity With affinity (X-Model-Affinity) +──────────────── ─────────────────────────────── +Turn 1 → claude-sonnet (tool calls) Turn 1 → claude-sonnet ← routed +Turn 2 → gpt-4o (reasoning) Turn 2 → claude-sonnet ← pinned ✓ +Turn 3 → claude-sonnet (tool calls) Turn 3 → claude-sonnet ← pinned ✓ +Turn 4 → gpt-4o (reasoning) Turn 4 → claude-sonnet ← pinned ✓ +Turn 5 → claude-sonnet (final answer) Turn 5 → claude-sonnet ← pinned ✓ + ↑ model switches every turn ↑ one model, start to finish +``` + +--- + +## Quick Start + +```bash +# 1. Set API keys +export OPENAI_API_KEY= +export ANTHROPIC_API_KEY= + +# 2. Start Plano +cd demos/llm_routing/model_affinity +planoai up config.yaml + +# 3. Run the demo (uv manages dependencies automatically) +./demo.sh # or: uv run demo.py +``` + +--- + +## What the Demo Does + +A **database selection agent** investigates whether to use PostgreSQL or MongoDB +for an e-commerce platform. It runs a real tool-calling loop: the LLM decides +which tools to call, receives simulated results, and continues until it has +enough data to recommend a database. + +Available tools: +- `get_db_benchmarks` — fetch performance data for a workload type +- `get_case_studies` — retrieve real-world e-commerce case studies +- `check_feature_support` — check if a database supports a specific feature + +The demo runs the **same agent loop twice**: + +1. **Without affinity** — no `X-Model-Affinity`; models may switch between turns +2. **With affinity** — `X-Model-Affinity` header included; model is pinned from turn 1 + +Each turn is a separate `POST /v1/chat/completions` request to Plano using the +[OpenAI SDK](https://github.com/openai/openai-python). The demo prints the +model used on each turn so you can see the difference. + +### Expected Output + +``` + Run 1: WITHOUT Model Affinity + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + turn 1 [claude-sonnet-4-20250514 ] get_db_benchmarks, get_db_benchmarks + turn 2 [gpt-4o ] get_case_studies, get_case_studies ← switched + turn 3 [claude-sonnet-4-20250514 ] check_feature_support ← switched + turn 4 [gpt-4o ] final answer ← switched + + ✗ Without affinity: model switched 3 time(s) + + + Run 2: WITH Model Affinity (X-Model-Affinity: a1b2c3d4…) + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + turn 1 [claude-sonnet-4-20250514 ] get_db_benchmarks, get_db_benchmarks + turn 2 [claude-sonnet-4-20250514 ] get_case_studies, get_case_studies + turn 3 [claude-sonnet-4-20250514 ] check_feature_support + turn 4 [claude-sonnet-4-20250514 ] final answer + + ✓ With affinity: claude-sonnet-4-20250514 for all 4 turns +``` + +### How It Works + +Model affinity is implemented in brightstaff. When `X-Model-Affinity` is present: + +1. **First request** — routing runs normally, result is cached keyed by the affinity ID +2. **Subsequent requests** — cache hit skips routing and returns the cached model instantly + +The `X-Model-Affinity` header is forwarded transparently; no changes to your OpenAI +SDK calls beyond adding the header. + +```python +from openai import OpenAI +import uuid + +client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY") + +affinity_id = str(uuid.uuid4()) + +response = client.chat.completions.create( + model="gpt-4o-mini", + messages=[{"role": "user", "content": prompt}], + extra_headers={"X-Model-Affinity": affinity_id}, +) +``` + +--- + +## Configuration + +Model affinity is configurable in `config.yaml`: + +```yaml +routing: + session_ttl_seconds: 600 # How long affinity lasts (default: 10 min) + session_max_entries: 10000 # Max cached sessions (upper limit: 10000) +``` + +Without the `X-Model-Affinity` header, routing runs fresh every time — no breaking +change to existing clients. + +--- + +## Advanced: Agent Server Demo + +The `agent.py` file is a FastAPI-based agent server that demonstrates a more +complex pattern: an external agent service that forwards `X-Model-Affinity` +on all outbound calls to Plano. Use `start_agents.sh` to run it. + +## See Also + +- [Model Routing Service Demo](../model_routing_service/) — curl-based examples of the routing endpoint diff --git a/demos/llm_routing/session_pinning/agent.py b/demos/llm_routing/model_affinity/agent.py similarity index 96% rename from demos/llm_routing/session_pinning/agent.py rename to demos/llm_routing/model_affinity/agent.py index 2d51085f..b51bd28a 100644 --- a/demos/llm_routing/session_pinning/agent.py +++ b/demos/llm_routing/model_affinity/agent.py @@ -11,10 +11,9 @@ each with its own tool-calling loop. The tasks deliberately alternate between code_generation and complex_reasoning intents so Plano's preference-based router selects different models for each task. -If the client sends X-Routing-Session-Id, the agent forwards it on every -outbound call to Plano. The first task pins the model; all subsequent tasks -skip the router and reuse it — keeping the whole session on one consistent -model. +If the client sends X-Model-Affinity, the agent forwards it on every outbound +call to Plano. The first task pins the model; all subsequent tasks skip the +router and reuse it — keeping the whole session on one consistent model. Run standalone: uv run agent.py @@ -310,12 +309,12 @@ async def run_task( Each task is an independent conversation so the router sees only this task's intent — not the accumulated context of previous tasks. - Session pinning via X-Routing-Session-Id pins the model from the first - task onward, so all tasks stay on the same model. + Model affinity via X-Model-Affinity pins the model from the first task + onward, so all tasks stay on the same model. Returns (answer, first_model_used). """ - headers = {"X-Routing-Session-Id": session_id} if session_id else {} + headers = {"X-Model-Affinity": session_id} if session_id else {} messages: list[ChatCompletionMessageParam] = [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": prompt}, @@ -392,7 +391,7 @@ app = FastAPI(title="Research Agent", version="1.0.0") @app.post("/v1/chat/completions") async def chat(request: Request) -> JSONResponse: body = await request.json() - session_id: str | None = request.headers.get("x-routing-session-id") + session_id: str | None = request.headers.get("x-model-affinity") log.info("request session_id=%s", session_id or "none") diff --git a/demos/llm_routing/session_pinning/config.yaml b/demos/llm_routing/model_affinity/config.yaml similarity index 100% rename from demos/llm_routing/session_pinning/config.yaml rename to demos/llm_routing/model_affinity/config.yaml diff --git a/demos/llm_routing/model_affinity/demo.py b/demos/llm_routing/model_affinity/demo.py new file mode 100644 index 00000000..f01a5a31 --- /dev/null +++ b/demos/llm_routing/model_affinity/demo.py @@ -0,0 +1,307 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.12" +# dependencies = ["openai>=1.0.0"] +# /// +""" +Model Affinity Demo — Agentic Tool-Calling Loop + +Runs the same agentic loop twice through Plano: + 1. Without model affinity — the router may pick different models per turn + 2. With model affinity — all turns use the model selected on turn 1 + +Each loop is a real tool-calling agent: the LLM decides which tools to call, +we provide simulated results, and the LLM continues until it has enough +information to produce a final answer. Each turn is a separate request to +Plano, so the router classifies intent independently every time. + +Usage: + planoai up config.yaml # start Plano + uv run demo.py # run this demo +""" + +import asyncio +import json +import os +import uuid + +from openai import AsyncOpenAI +from openai.types.chat import ChatCompletionMessageParam + +PLANO_URL = os.environ.get("PLANO_URL", "http://localhost:12000") + +SYSTEM_PROMPT = ( + "You are a database selection analyst. Use the provided tools to gather " + "benchmark data and case studies, then recommend PostgreSQL or MongoDB " + "for a high-traffic e-commerce backend. Be concise." +) + +USER_QUERY = ( + "Should we use PostgreSQL or MongoDB for our e-commerce platform? " + "We need strong consistency for orders but flexible schemas for products. " + "Use the tools to research both options, then give a recommendation." +) + +TOOLS = [ + { + "type": "function", + "function": { + "name": "get_db_benchmarks", + "description": "Fetch performance benchmarks for a database under a given workload.", + "parameters": { + "type": "object", + "properties": { + "database": { + "type": "string", + "enum": ["postgresql", "mongodb"], + }, + "workload": { + "type": "string", + "enum": ["read_heavy", "write_heavy", "mixed"], + }, + }, + "required": ["database", "workload"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_case_studies", + "description": "Retrieve real-world e-commerce case studies for a database.", + "parameters": { + "type": "object", + "properties": { + "database": { + "type": "string", + "enum": ["postgresql", "mongodb"], + }, + }, + "required": ["database"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "check_feature_support", + "description": "Check if a database supports a specific feature.", + "parameters": { + "type": "object", + "properties": { + "database": { + "type": "string", + "enum": ["postgresql", "mongodb"], + }, + "feature": {"type": "string"}, + }, + "required": ["database", "feature"], + }, + }, + }, +] + +# Simulated tool responses +_BENCHMARKS = { + ("postgresql", "mixed"): { + "read_qps": 42000, + "write_qps": 21000, + "p99_ms": 6, + "notes": "Solid all-round; MVCC keeps reads non-blocking", + }, + ("mongodb", "mixed"): { + "read_qps": 60000, + "write_qps": 50000, + "p99_ms": 3, + "notes": "Flexible schema accelerates feature iteration", + }, +} + +_CASE_STUDIES = { + "postgresql": [ + {"company": "Shopify", "notes": "Moved orders back to Postgres for ACID"}, + { + "company": "Zalando", + "notes": "Postgres + Citus for sharded order processing", + }, + ], + "mongodb": [ + {"company": "eBay", "notes": "Product catalogue — flexible attribute schemas"}, + {"company": "Alibaba", "notes": "Session/cart data — high write throughput"}, + ], +} + +_FEATURES = { + ("postgresql", "acid transactions"): {"supported": True, "notes": "Full ACID"}, + ("mongodb", "acid transactions"): { + "supported": True, + "notes": "Multi-doc ACID since v4.0", + }, + ("postgresql", "horizontal sharding"): { + "supported": True, + "notes": "Via Citus extension", + }, + ("mongodb", "horizontal sharding"): { + "supported": True, + "notes": "Native auto-balancing", + }, +} + + +def dispatch_tool(name: str, args: dict) -> str: + if name == "get_db_benchmarks": + key = (args["database"], args["workload"]) + return json.dumps(_BENCHMARKS.get(key, {"error": f"no data for {key}"})) + if name == "get_case_studies": + return json.dumps(_CASE_STUDIES.get(args["database"], {"error": "unknown db"})) + if name == "check_feature_support": + key = (args["database"], args["feature"].lower()) + for k, v in _FEATURES.items(): + if k[0] == key[0] and k[1] in key[1]: + return json.dumps(v) + return json.dumps({"error": f"no data for {key}"}) + return json.dumps({"error": f"unknown tool {name}"}) + + +# --------------------------------------------------------------------------- +# Agentic loop — runs tool calls until the LLM produces a final answer +# --------------------------------------------------------------------------- + + +async def run_agent_loop( + affinity_id: str | None = None, + max_turns: int = 10, +) -> tuple[str, list[dict]]: + """ + Run a tool-calling agent loop against Plano. + + Returns (final_answer, trace) where trace is a list of + {"turn": int, "model": str, "tool_calls": [...]} dicts. + """ + client = AsyncOpenAI(base_url=f"{PLANO_URL}/v1", api_key="EMPTY") + headers = {"X-Model-Affinity": affinity_id} if affinity_id else None + + messages: list[ChatCompletionMessageParam] = [ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": USER_QUERY}, + ] + trace: list[dict] = [] + + for turn in range(1, max_turns + 1): + resp = await client.chat.completions.create( + model="gpt-4o-mini", + messages=messages, + tools=TOOLS, + tool_choice="auto", + max_completion_tokens=800, + extra_headers=headers, + ) + + choice = resp.choices[0] + turn_info: dict = {"turn": turn, "model": resp.model} + + if choice.finish_reason == "tool_calls" and choice.message.tool_calls: + tool_names = [tc.function.name for tc in choice.message.tool_calls] + turn_info["tool_calls"] = tool_names + trace.append(turn_info) + + messages.append(choice.message) + for tc in choice.message.tool_calls: + args = json.loads(tc.function.arguments or "{}") + result = dispatch_tool(tc.function.name, args) + messages.append( + {"role": "tool", "content": result, "tool_call_id": tc.id} + ) + else: + turn_info["tool_calls"] = [] + trace.append(turn_info) + return (choice.message.content or "").strip(), trace + + return "(max turns reached)", trace + + +# --------------------------------------------------------------------------- +# Display helpers +# --------------------------------------------------------------------------- + + +def short_model(model: str) -> str: + return model.split("/")[-1] if "/" in model else model + + +def print_trace(trace: list[dict]) -> None: + for t in trace: + model = short_model(t["model"]) + tools = ", ".join(t["tool_calls"]) if t["tool_calls"] else "final answer" + print(f" turn {t['turn']} [{model:<30}] {tools}") + + +def print_summary(label: str, trace: list[dict]) -> None: + models = [t["model"] for t in trace] + unique = set(models) + if len(unique) == 1: + print( + f" ✓ {label}: {short_model(next(iter(unique)))} " + f"for all {len(models)} turns" + ) + else: + switches = sum(1 for a, b in zip(models, models[1:]) if a != b) + names = ", ".join(sorted(short_model(m) for m in unique)) + print(f" ✗ {label}: model switched {switches} time(s) — {names}") + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +async def main() -> None: + print() + print(" ╔══════════════════════════════════════════════════════════╗") + print(" ║ Model Affinity Demo — Agentic Loop ║") + print(" ╚══════════════════════════════════════════════════════════╝") + print() + print(f" Plano : {PLANO_URL}") + print(f' Query : "{USER_QUERY[:65]}…"') + print() + print(" The agent calls tools (get_db_benchmarks, get_case_studies,") + print(" check_feature_support) across multiple turns. Each turn is") + print(" a separate request to Plano — the router classifies intent") + print(" independently, so different turns may get different models.") + print() + + # --- Run 1: without affinity --- + print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print(" Run 1: WITHOUT Model Affinity") + print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print() + answer1, trace1 = await run_agent_loop(affinity_id=None) + print_trace(trace1) + print() + print_summary("Without affinity", trace1) + print() + + # --- Run 2: with affinity --- + aid = str(uuid.uuid4()) + print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print(f" Run 2: WITH Model Affinity (X-Model-Affinity: {aid[:8]}…)") + print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print() + answer2, trace2 = await run_agent_loop(affinity_id=aid) + print_trace(trace2) + print() + print_summary("With affinity ", trace2) + print() + + # --- Final answer --- + print(" ══ Agent recommendation (affinity session) ════════════════") + print() + for line in answer2.splitlines(): + print(f" {line}") + print() + print(" ═══════════════════════════════════════════════════════════") + print() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/demos/llm_routing/model_affinity/demo.sh b/demos/llm_routing/model_affinity/demo.sh new file mode 100755 index 00000000..3ce50b3c --- /dev/null +++ b/demos/llm_routing/model_affinity/demo.sh @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + +# Run the demo directly against Plano (no agent server needed) +uv run "$SCRIPT_DIR/demo.py" diff --git a/demos/llm_routing/session_pinning/start_agents.sh b/demos/llm_routing/model_affinity/start_agents.sh similarity index 100% rename from demos/llm_routing/session_pinning/start_agents.sh rename to demos/llm_routing/model_affinity/start_agents.sh diff --git a/demos/llm_routing/model_routing_service/README.md b/demos/llm_routing/model_routing_service/README.md index bc7b14c4..4687b47c 100644 --- a/demos/llm_routing/model_routing_service/README.md +++ b/demos/llm_routing/model_routing_service/README.md @@ -108,13 +108,13 @@ The response contains the model list — your client should try `models[0]` firs ## Session Pinning -Send an `X-Routing-Session-Id` header to pin the routing decision for a session. Once a model is selected, all subsequent requests with the same session ID return the same model without re-running routing. +Send an `X-Model-Affinity` header to pin the routing decision for a session. Once a model is selected, all subsequent requests with the same session ID return the same model without re-running routing. ```bash # First call — runs routing, caches result curl http://localhost:12000/routing/v1/chat/completions \ -H "Content-Type: application/json" \ - -H "X-Routing-Session-Id: my-session-123" \ + -H "X-Model-Affinity: my-session-123" \ -d '{ "model": "gpt-4o-mini", "messages": [{"role": "user", "content": "Write a Python function for binary search"}] @@ -136,7 +136,7 @@ Response (first call): # Second call — same session, returns cached result curl http://localhost:12000/routing/v1/chat/completions \ -H "Content-Type: application/json" \ - -H "X-Routing-Session-Id: my-session-123" \ + -H "X-Model-Affinity: my-session-123" \ -d '{ "model": "gpt-4o-mini", "messages": [{"role": "user", "content": "Now explain merge sort"}] @@ -161,7 +161,7 @@ routing: session_max_entries: 10000 # default: 10000 ``` -Without the `X-Routing-Session-Id` header, routing runs fresh every time (no breaking change). +Without the `X-Model-Affinity` header, routing runs fresh every time (no breaking change). ## Kubernetes Deployment (Self-hosted Arch-Router on GPU) diff --git a/demos/llm_routing/model_routing_service/demo.sh b/demos/llm_routing/model_routing_service/demo.sh index 97d3032d..dafd60b3 100755 --- a/demos/llm_routing/model_routing_service/demo.sh +++ b/demos/llm_routing/model_routing_service/demo.sh @@ -114,7 +114,7 @@ echo "--- 7. Session pinning - first call (fresh routing decision) ---" echo "" curl -s "$PLANO_URL/routing/v1/chat/completions" \ -H "Content-Type: application/json" \ - -H "X-Routing-Session-Id: demo-session-001" \ + -H "X-Model-Affinity: demo-session-001" \ -d '{ "model": "gpt-4o-mini", "messages": [ @@ -129,7 +129,7 @@ echo " Notice: same model returned with \"pinned\": true, routing was skipped echo "" curl -s "$PLANO_URL/routing/v1/chat/completions" \ -H "Content-Type: application/json" \ - -H "X-Routing-Session-Id: demo-session-001" \ + -H "X-Model-Affinity: demo-session-001" \ -d '{ "model": "gpt-4o-mini", "messages": [ @@ -143,7 +143,7 @@ echo "--- 9. Different session gets its own fresh routing ---" echo "" curl -s "$PLANO_URL/routing/v1/chat/completions" \ -H "Content-Type: application/json" \ - -H "X-Routing-Session-Id: demo-session-002" \ + -H "X-Model-Affinity: demo-session-002" \ -d '{ "model": "gpt-4o-mini", "messages": [ diff --git a/demos/llm_routing/session_pinning/README.md b/demos/llm_routing/session_pinning/README.md deleted file mode 100644 index 6ea6db02..00000000 --- a/demos/llm_routing/session_pinning/README.md +++ /dev/null @@ -1,156 +0,0 @@ -# Session Pinning Demo - -> Consistent model selection for agentic loops using `X-Routing-Session-Id`. - -## Why Session Pinning? - -When an agent runs in a loop — research → analyse → implement → evaluate → summarise — each step hits Plano's router independently. Because prompts vary in intent, the router may select **different models** for each step, fragmenting context mid-session. - -**Session pinning** solves this: send an `X-Routing-Session-Id` header and the first request runs routing as usual, caching the decision. Every subsequent request with the same session ID returns the **same model**, without re-running the router. - -``` -Without pinning With pinning (X-Routing-Session-Id) -───────────────── ────────────────────────── -Step 1 → claude-sonnet (code_gen) Step 1 → claude-sonnet ← routed -Step 2 → gpt-4o (reasoning) Step 2 → claude-sonnet ← pinned ✓ -Step 3 → claude-sonnet (code_gen) Step 3 → claude-sonnet ← pinned ✓ -Step 4 → gpt-4o (reasoning) Step 4 → claude-sonnet ← pinned ✓ -Step 5 → claude-sonnet (code_gen) Step 5 → claude-sonnet ← pinned ✓ - ↑ model switches every step ↑ one model, start to finish -``` - ---- - -## Quick Start - -```bash -# 1. Set API keys -export OPENAI_API_KEY= -export ANTHROPIC_API_KEY= - -# 2. Start Plano -cd demos/llm_routing/session_pinning -planoai up config.yaml - -# 3. Run the demo (uv manages dependencies automatically) -./demo.sh # or: uv run demo.py -``` - ---- - -## What the Demo Does - -A **Database Research Agent** investigates whether to use PostgreSQL or MongoDB -for an e-commerce platform. It runs 5 steps, each building on prior findings via -accumulated message history. Steps alternate between `code_generation` and -`complex_reasoning` intents so Plano routes to different models without pinning. - -| Step | Task | Intent | -|:----:|------|--------| -| 1 | List technical requirements | code_generation → claude-sonnet | -| 2 | Compare PostgreSQL vs MongoDB | complex_reasoning → gpt-4o | -| 3 | Write schema (CREATE TABLE) | code_generation → claude-sonnet | -| 4 | Assess scalability trade-offs | complex_reasoning → gpt-4o | -| 5 | Write final recommendation report | code_generation → claude-sonnet | - -The demo runs the loop **twice** against `/v1/chat/completions` using the -[OpenAI SDK](https://github.com/openai/openai-python): - -1. **Without pinning** — no `X-Routing-Session-Id`; models alternate per step -2. **With pinning** — `X-Routing-Session-Id` header included; model is pinned from step 1 - -Each step makes real LLM calls. Step 5's report explicitly references findings -from earlier steps, demonstrating why coherent context requires a consistent model. - -### Expected Output - -``` - Run 1: WITHOUT Session Pinning - ───────────────────────────────────────────────────────────────────── - step 1 [claude-sonnet-4-20250514] List requirements - "Critical requirements: 1. ACID transactions for order integrity…" - - step 2 [gpt-4o ] Compare databases ← switched - "PostgreSQL excels at joins and ACID guarantees…" - - step 3 [claude-sonnet-4-20250514] Write schema ← switched - "CREATE TABLE orders (\n id SERIAL PRIMARY KEY…" - - step 4 [gpt-4o ] Assess scalability ← switched - "At high write volume, PostgreSQL row-level locking…" - - step 5 [claude-sonnet-4-20250514] Write report ← switched - "RECOMMENDATION: PostgreSQL is the right choice…" - - ✗ Without pinning: model switched 4 time(s) — gpt-4o, claude-sonnet-4-20250514 - - - Run 2: WITH Session Pinning (X-Routing-Session-Id: a1b2c3d4…) - ───────────────────────────────────────────────────────────────────── - step 1 [claude-sonnet-4-20250514] List requirements - "Critical requirements: 1. ACID transactions for order integrity…" - - step 2 [claude-sonnet-4-20250514] Compare databases - "Building on the requirements I just outlined: PostgreSQL…" - - step 3 [claude-sonnet-4-20250514] Write schema - "Following the comparison above, here is the PostgreSQL schema…" - - step 4 [claude-sonnet-4-20250514] Assess scalability - "Given the schema I designed, PostgreSQL's row-level locking…" - - step 5 [claude-sonnet-4-20250514] Write report - "RECOMMENDATION: Based on my analysis of requirements, comparison…" - - ✓ With pinning: claude-sonnet-4-20250514 held for all 5 steps - - ══ Final Report (pinned session) ═════════════════════════════════════ - RECOMMENDATION: Based on my analysis of requirements, the head-to-head - comparison, the schema I designed, and the scalability trade-offs… - ══════════════════════════════════════════════════════════════════════ -``` - -### How It Works - -Session pinning is implemented in brightstaff. When `X-Routing-Session-Id` is present: - -1. **First request** — routing runs normally, result is cached keyed by session ID -2. **Subsequent requests** — cache hit skips routing and returns the cached model instantly - -The `X-Routing-Session-Id` header is forwarded transparently; no changes to your OpenAI -SDK calls beyond adding the header. - -```python -from openai import OpenAI - -client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY") - -session_id = str(uuid.uuid4()) - -response = client.chat.completions.create( - model="gpt-4o-mini", - messages=[{"role": "user", "content": prompt}], - extra_headers={"X-Routing-Session-Id": session_id}, # pin the session -) -``` - ---- - -## Configuration - -Session pinning is configurable in `config.yaml`: - -```yaml -routing: - session_ttl_seconds: 600 # How long a pinned session lasts (default: 10 min) - session_max_entries: 10000 # Max cached sessions before LRU eviction -``` - -Without the `X-Routing-Session-Id` header, routing runs fresh every time — no breaking -change to existing clients. - ---- - -## See Also - -- [Model Routing Service Demo](../model_routing_service/) — curl-based examples of the routing endpoint diff --git a/demos/llm_routing/session_pinning/demo.py b/demos/llm_routing/session_pinning/demo.py deleted file mode 100644 index 034898e7..00000000 --- a/demos/llm_routing/session_pinning/demo.py +++ /dev/null @@ -1,174 +0,0 @@ -#!/usr/bin/env -S uv run --script -# /// script -# requires-python = ">=3.12" -# dependencies = ["httpx>=0.27"] -# /// -""" -Session Pinning Demo — Research Agent client - -Sends the same query to the Research Agent twice — once without a session ID -and once with one — and compares the routing trace to show how session pinning -keeps the model consistent across the LLM's tool-calling loop. - -Requires the agent to already be running (start it with ./start_agents.sh). - -Usage: - uv run demo.py - AGENT_URL=http://localhost:8000 uv run demo.py -""" - -import asyncio -import os -import uuid - -import httpx - -AGENT_URL = os.environ.get("AGENT_URL", "http://localhost:8000") - -QUERY = ( - "Should we use PostgreSQL or MongoDB for a high-traffic e-commerce backend " - "that needs strong consistency for orders but flexible schemas for products?" -) - - -# --------------------------------------------------------------------------- -# Client helpers -# --------------------------------------------------------------------------- - - -async def wait_for_agent(timeout: int = 30) -> bool: - async with httpx.AsyncClient() as client: - for _ in range(timeout * 2): - try: - r = await client.get(f"{AGENT_URL}/health", timeout=1.0) - if r.status_code == 200: - return True - except Exception: - pass - await asyncio.sleep(0.5) - return False - - -async def ask_agent(query: str, session_id: str | None = None) -> dict: - headers: dict[str, str] = {} - if session_id: - headers["X-Routing-Session-Id"] = session_id - - async with httpx.AsyncClient(timeout=120.0) as client: - r = await client.post( - f"{AGENT_URL}/v1/chat/completions", - headers=headers, - json={"messages": [{"role": "user", "content": query}]}, - ) - r.raise_for_status() - return r.json() - - -# --------------------------------------------------------------------------- -# Display helpers -# --------------------------------------------------------------------------- - - -def _short(model: str) -> str: - return model.split("/")[-1] if "/" in model else model - - -def _print_trace(result: dict) -> None: - trace = result.get("routing_trace", []) - if not trace: - print(" (no trace)") - return - - prev: str | None = None - for t in trace: - short = _short(t["model"]) - switch = " ← switched" if (prev and t["model"] != prev) else "" - prev = t["model"] - print(f" {t['task']:<26} [{short}]{switch}") - - -def _print_summary(label: str, result: dict) -> None: - models = [t["model"] for t in result.get("routing_trace", [])] - if not models: - print(f" ? {label}: no routing data") - return - unique = set(models) - if len(unique) == 1: - print(f" ✓ {label}: {_short(next(iter(unique)))} for all {len(models)} turns") - else: - switched = sum(1 for a, b in zip(models, models[1:]) if a != b) - names = ", ".join(sorted(_short(m) for m in unique)) - print(f" ✗ {label}: model switched {switched} time(s) — {names}") - - -# --------------------------------------------------------------------------- -# Demo -# --------------------------------------------------------------------------- - - -async def main() -> None: - print() - print(" ╔══════════════════════════════════════════════════════════════╗") - print(" ║ Session Pinning Demo — Research Agent ║") - print(" ╚══════════════════════════════════════════════════════════════╝") - print() - print(f" Agent : {AGENT_URL}") - print(f' Query : "{QUERY[:72]}…"') - print() - print(" The agent uses a tool-calling loop (get_db_benchmarks,") - print(" get_case_studies, check_feature_support) to research the") - print(" question. Each LLM turn hits Plano's preference-based router.") - print() - - print(f" Waiting for agent at {AGENT_URL}…", end=" ", flush=True) - if not await wait_for_agent(): - print("FAILED — agent did not respond within 30 s") - return - print("ready.") - print() - - sid = str(uuid.uuid4()) - print(" Sending queries (running concurrently)…") - print() - without, with_pin = await asyncio.gather( - ask_agent(QUERY, session_id=None), - ask_agent(QUERY, session_id=sid), - ) - - # ── Run 1 ──────────────────────────────────────────────────────────── - print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") - print(" Run 1: WITHOUT Session Pinning") - print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") - print() - print(" LLM turns inside the agent loop:") - print() - _print_trace(without) - print() - _print_summary("Without pinning", without) - print() - - # ── Run 2 ──────────────────────────────────────────────────────────── - print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") - print(f" Run 2: WITH Session Pinning (X-Routing-Session-Id: {sid[:8]}…)") - print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") - print() - print(" LLM turns inside the agent loop:") - print() - _print_trace(with_pin) - print() - _print_summary("With pinning ", with_pin) - print() - - # ── Final answer ───────────────────────────────────────────────────── - answer = with_pin["choices"][0]["message"]["content"] - print(" ══ Agent recommendation (pinned session) ═════════════════════") - print() - for line in answer.splitlines(): - print(f" {line}") - print() - print(" ══════════════════════════════════════════════════════════════") - print() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/demos/llm_routing/session_pinning/demo.sh b/demos/llm_routing/session_pinning/demo.sh deleted file mode 100755 index 210fd136..00000000 --- a/demos/llm_routing/session_pinning/demo.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -set -e - -SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -export PLANO_URL="${PLANO_URL:-http://localhost:12000}" -export AGENT_PORT="${AGENT_PORT:-8000}" -export AGENT_URL="http://localhost:$AGENT_PORT" - -cleanup() { - [ -n "$AGENT_PID" ] && kill "$AGENT_PID" 2>/dev/null -} -trap cleanup EXIT INT TERM - -# Start the agent in the background -"$SCRIPT_DIR/start_agents.sh" & -AGENT_PID=$! - -# Run the demo client -uv run "$SCRIPT_DIR/demo.py"