From 190790a3d2d32e69baf9d7bad3c2d865b8a045a1 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Wed, 25 Mar 2026 23:09:50 -0700 Subject: [PATCH] add session pinning demo with iterative research agent --- demos/llm_routing/session_pinning/README.md | 102 +++++++++++++++ demos/llm_routing/session_pinning/config.yaml | 27 ++++ demos/llm_routing/session_pinning/demo.py | 120 ++++++++++++++++++ demos/llm_routing/session_pinning/demo.sh | 11 ++ 4 files changed, 260 insertions(+) create mode 100644 demos/llm_routing/session_pinning/README.md create mode 100644 demos/llm_routing/session_pinning/config.yaml create mode 100644 demos/llm_routing/session_pinning/demo.py create mode 100755 demos/llm_routing/session_pinning/demo.sh diff --git a/demos/llm_routing/session_pinning/README.md b/demos/llm_routing/session_pinning/README.md new file mode 100644 index 00000000..500d8ef6 --- /dev/null +++ b/demos/llm_routing/session_pinning/README.md @@ -0,0 +1,102 @@ +# Session Pinning Demo + +> Consistent model selection for agentic loops using `X-Session-Id`. + +## Why Session Pinning? + +When an agent runs in a loop — research → plan → implement → review → refine — each iteration hits Plano's router independently. Since the prompts vary in intent, the router may select **different models** for each step, breaking consistency mid-workflow. + +**Session pinning** solves this: send an `X-Session-Id` header and the first request runs routing as usual, caching the decision. Every subsequent request with the same session ID returns the **same model** instantly (`"pinned": true`), without re-running the router. 
+ +``` +Without pinning With pinning (X-Session-Id) +───────────────── ─────────────────────────── +Step 1 → Claude (code_generation) Step 1 → Claude (code_generation) ← routed +Step 2 → GPT-4o (complex_reasoning) Step 2 → Claude (pinned ✓) +Step 3 → Claude (code_generation) Step 3 → Claude (pinned ✓) +Step 4 → GPT-4o (complex_reasoning) Step 4 → Claude (pinned ✓) +Step 5 → Claude (code_generation) Step 5 → Claude (pinned ✓) + ↑ model switches every step ↑ one model, start to finish +``` + +--- + +## Quick Start + +```bash +# 1. Set API keys +export OPENAI_API_KEY="your-openai-api-key" +export ANTHROPIC_API_KEY="your-anthropic-api-key" + +# 2. Start Plano +cd demos/llm_routing/session_pinning +planoai up config.yaml + +# 3. Run the demo +./demo.sh # or: python3 demo.py +``` + +--- + +## What the Demo Does + +The script simulates an agent building a task management app in **5 iterative steps**, deliberately mixing intents: + +| Step | Prompt | Intent | +|:----:|--------|--------| +| 1 | Design a REST API schema for a task management app… | code generation | +| 2 | Analyze SQL vs NoSQL trade-offs for this system… | complex reasoning | +| 3 | Write the SQLAlchemy database models… | code generation | +| 4 | Review the API design for security vulnerabilities… | complex reasoning | +| 5 | Implement JWT authentication middleware… | code generation | + +It runs this loop **twice** against the `/routing/v1/chat/completions` endpoint (routing decisions only — no actual LLM calls): + +1. **Without pinning** — no `X-Session-Id` header; models switch between steps +2. 
**With pinning** — `X-Session-Id` header included; the model selected in step 1 is reused for all 5 steps + +### Expected Output + +``` +══════════════════════════════════════════════════════════════════ + Run 1: WITHOUT Session Pinning +────────────────────────────────────────────────────────────────── + Step 1: Design a REST API schema… → anthropic/claude-sonnet-4-20250514 + Step 2: Analyze SQL vs NoSQL… → openai/gpt-4o + Step 3: Write SQLAlchemy models… → anthropic/claude-sonnet-4-20250514 + Step 4: Review API for security… → openai/gpt-4o + Step 5: Implement JWT auth… → anthropic/claude-sonnet-4-20250514 + + ✗ Models varied: anthropic/claude-sonnet-4-20250514, openai/gpt-4o + +══════════════════════════════════════════════════════════════════ + Run 2: WITH Session Pinning (X-Session-Id: a1b2c3d4-…) +────────────────────────────────────────────────────────────────── + Step 1: Design a REST API schema… → anthropic/claude-sonnet-4-20250514 (pinned=false) + Step 2: Analyze SQL vs NoSQL… → anthropic/claude-sonnet-4-20250514 (pinned=true) + Step 3: Write SQLAlchemy models… → anthropic/claude-sonnet-4-20250514 (pinned=true) + Step 4: Review API for security… → anthropic/claude-sonnet-4-20250514 (pinned=true) + Step 5: Implement JWT auth… → anthropic/claude-sonnet-4-20250514 (pinned=true) + + ✓ All 5 steps routed to anthropic/claude-sonnet-4-20250514 +``` + +--- + +## Configuration + +Session pinning is configurable in `config.yaml`: + +```yaml +routing: + session_ttl_seconds: 600 # How long a pinned session lasts (default: 10 min) + session_max_entries: 10000 # Max cached sessions before LRU eviction +``` + +Without the `X-Session-Id` header, routing runs fresh every time — no breaking change to existing clients. 
+ +--- + +## See Also + +- [Model Routing Service Demo](../model_routing_service/) — curl-based examples of the routing endpoint and session pinning diff --git a/demos/llm_routing/session_pinning/config.yaml b/demos/llm_routing/session_pinning/config.yaml new file mode 100644 index 00000000..7b98b25b --- /dev/null +++ b/demos/llm_routing/session_pinning/config.yaml @@ -0,0 +1,27 @@ +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: complex_reasoning + description: complex reasoning tasks, multi-step analysis, or detailed explanations + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + routing_preferences: + - name: code_generation + description: generating new code, writing functions, or creating boilerplate + +tracing: + random_sampling: 100 diff --git a/demos/llm_routing/session_pinning/demo.py b/demos/llm_routing/session_pinning/demo.py new file mode 100644 index 00000000..7117c53e --- /dev/null +++ b/demos/llm_routing/session_pinning/demo.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 +""" +Session Pinning Demo — Iterative Research Agent + +Demonstrates how session pinning ensures consistent model selection +across multiple iterations of an agentic loop. Runs the same 5-step +research workflow twice: + + 1) Without session pinning — models may switch between iterations + 2) With session pinning — first iteration pins the model for all subsequent ones + +Uses the /routing/v1/chat/completions endpoint (routing decisions only, no LLM calls). +""" + +import json +import os +import urllib.request +import uuid + +PLANO_URL = os.environ.get("PLANO_URL", "http://localhost:12000") + +# Simulates an iterative research agent building a task management app. 
+# Prompts deliberately alternate between code_generation and complex_reasoning
+# intents so that without pinning, different models get selected per step.
+RESEARCH_STEPS = [
+    "Design a REST API schema for a task management app with users, projects, and tasks",
+    "Analyze the trade-offs between SQL and NoSQL databases for this task management system",
+    "Write the database models and ORM setup in Python using SQLAlchemy",
+    "Review the API design for security vulnerabilities and suggest improvements",
+    "Implement the authentication middleware with JWT tokens",
+]
+
+
+def run_research_loop(session_id=None):
+    """Route every research step through Plano and print each decision.
+
+    A truthy session_id is sent as the X-Session-Id header on every request.
+    Returns one dict per step with "step", "model", "route" and "pinned" keys.
+    """
+    # The headers are identical for every step, so build them once.
+    headers = {"Content-Type": "application/json"}
+    if session_id:
+        headers["X-Session-Id"] = session_id
+
+    results = []
+    for i, prompt in enumerate(RESEARCH_STEPS, 1):
+        payload = {
+            "model": "gpt-4o-mini",
+            "messages": [{"role": "user", "content": prompt}],
+        }
+        request = urllib.request.Request(
+            f"{PLANO_URL}/routing/v1/chat/completions",
+            data=json.dumps(payload).encode(),
+            headers=headers,
+        )
+        # The context manager closes the HTTP response even if parsing fails.
+        with urllib.request.urlopen(request, timeout=10) as resp:
+            data = json.loads(resp.read())
+
+        model = data.get("model", "unknown")
+        route = data.get("route") or "none"
+        pinned = data.get("pinned")
+        # Show the pinned flag only when the response carries one.
+        pinned_str = "" if pinned is None else f" pinned={pinned}"
+        print(f" Step {i}: {prompt[:60]:<60s}")
+        print(f" → model={model} route={route}{pinned_str}")
+        print()
+        results.append({"step": i, "model": model, "route": route, "pinned": pinned})
+
+    return results
+
+
+def print_summary(label, results):
+    """Print one line saying whether all steps in results used the same model."""
+    unique = sorted({r["model"] for r in results})  # sorted: stable output order
+    if not unique:
+        print(f" ✗ {label}: No routing results to summarize")
+    elif len(unique) == 1:
+        print(f" ✓ {label}: All {len(results)} steps routed to {unique[0]}")
+    else:
+        print(f" ✗ {label}: Models varied across steps — {', '.join(unique)}")
+
+
+def main():
+    """Run the loop without and then with a session ID, and summarize both runs."""
+    print("=" * 70)
+    print(" Iterative Research Agent — Session Pinning Demo")
+    print("=" * 70)
+    print()
+    print("An agent is building a task management app in 5 iterative steps.")
+    print("Each step hits Plano's routing endpoint to pick the best model.")
+    print()
+
+    # --- Run 1: Without session pinning ---
+    print("-" * 70)
+    print(" Run 1: WITHOUT Session Pinning (no X-Session-Id header)")
+    print("-" * 70)
+    print()
+    results_no_pin = run_research_loop(session_id=None)
+
+    # --- Run 2: With session pinning ---
+    session_id = str(uuid.uuid4())
+    print("-" * 70)
+    print(f" Run 2: WITH Session Pinning (X-Session-Id: {session_id})")
+    print("-" * 70)
+    print()
+    results_pinned = run_research_loop(session_id=session_id)
+
+    # --- Summary ---
+    print("=" * 70)
+    print(" Summary")
+    print("=" * 70)
+    print()
+    print_summary("Without pinning", results_no_pin)
+    print_summary("With pinning ", results_pinned)
+    print()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/demos/llm_routing/session_pinning/demo.sh b/demos/llm_routing/session_pinning/demo.sh
new file mode 100755
index 00000000..882144b0
--- /dev/null
+++ b/demos/llm_routing/session_pinning/demo.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+export PLANO_URL="${PLANO_URL:-http://localhost:12000}"
+
+echo "Running session pinning demo..."
+echo "PLANO_URL=$PLANO_URL"
+echo ""
+
+python3 "$SCRIPT_DIR/demo.py"