From 190790a3d2d32e69baf9d7bad3c2d865b8a045a1 Mon Sep 17 00:00:00 2001
From: Adil Hafeez <adil.hafeez@gmail.com>
Date: Wed, 25 Mar 2026 23:09:50 -0700
Subject: [PATCH] add session pinning demo with iterative research agent

---
 demos/llm_routing/session_pinning/README.md   | 102 +++++++++++++++
 demos/llm_routing/session_pinning/config.yaml |  27 ++++
 demos/llm_routing/session_pinning/demo.py     | 120 ++++++++++++++++++
 demos/llm_routing/session_pinning/demo.sh     |  11 ++
 4 files changed, 260 insertions(+)
 create mode 100644 demos/llm_routing/session_pinning/README.md
 create mode 100644 demos/llm_routing/session_pinning/config.yaml
 create mode 100644 demos/llm_routing/session_pinning/demo.py
 create mode 100755 demos/llm_routing/session_pinning/demo.sh
diff --git a/demos/llm_routing/session_pinning/README.md b/demos/llm_routing/session_pinning/README.md
new file mode 100644
index 00000000..500d8ef6
--- /dev/null
+++ b/demos/llm_routing/session_pinning/README.md
@@ -0,0 +1,102 @@
+# Session Pinning Demo
+
+> Consistent model selection for agentic loops using `X-Session-Id`.
+
+## Why Session Pinning?
+
+When an agent runs in a loop — research → plan → implement → review → refine — each iteration hits Plano's router independently. Since the prompts vary in intent, the router may select **different models** for each step, breaking consistency mid-workflow.
+
+**Session pinning** solves this: send an `X-Session-Id` header and the first request runs routing as usual, caching the decision. Every subsequent request with the same session ID returns the **same model** instantly (`"pinned": true`), without re-running the router.
+
+```
+Without pinning                          With pinning (X-Session-Id)
+─────────────────                        ───────────────────────────
+Step 1 → Claude (code_generation)        Step 1 → Claude (code_generation) ← routed
+Step 2 → GPT-4o (complex_reasoning)      Step 2 → Claude (pinned ✓)
+Step 3 → Claude (code_generation)        Step 3 → Claude (pinned ✓)
+Step 4 → GPT-4o (complex_reasoning)      Step 4 → Claude (pinned ✓)
+Step 5 → Claude (code_generation)        Step 5 → Claude (pinned ✓)
+       ↑ model switches every step                ↑ one model, start to finish
+```
+
+---
+
+## Quick Start
+
+```bash
+# 1. Set API keys
+export OPENAI_API_KEY=<your-key>
+export ANTHROPIC_API_KEY=<your-key>
+
+# 2. Start Plano
+cd demos/llm_routing/session_pinning
+planoai up config.yaml
+
+# 3. Run the demo
+./demo.sh          # or: python3 demo.py
+```
+
+---
+
+## What the Demo Does
+
+The script simulates an agent building a task management app in **5 iterative steps**, deliberately mixing intents:
+
+| Step | Prompt | Intent |
+|:----:|--------|--------|
+| 1 | Design a REST API schema for a task management app… | code generation |
+| 2 | Analyze SQL vs NoSQL trade-offs for this system… | complex reasoning |
+| 3 | Write the SQLAlchemy database models… | code generation |
+| 4 | Review the API design for security vulnerabilities… | complex reasoning |
+| 5 | Implement JWT authentication middleware… | code generation |
+
+It runs this loop **twice** against the `/routing/v1/chat/completions` endpoint (routing decisions only — no actual LLM calls):
+
+1. **Without pinning** — no `X-Session-Id` header; models switch between steps
+2. **With pinning** — `X-Session-Id` header included; the model selected in step 1 is reused for all 5 steps
+
+### Expected Output (abridged — the script also prints each step's route and a combined summary)
+
+```
+══════════════════════════════════════════════════════════════════
+  Run 1: WITHOUT Session Pinning
+──────────────────────────────────────────────────────────────────
+  Step 1: Design a REST API schema…        → anthropic/claude-sonnet-4-20250514
+  Step 2: Analyze SQL vs NoSQL…            → openai/gpt-4o
+  Step 3: Write SQLAlchemy models…         → anthropic/claude-sonnet-4-20250514
+  Step 4: Review API for security…         → openai/gpt-4o
+  Step 5: Implement JWT auth…              → anthropic/claude-sonnet-4-20250514
+
+  ✗ Models varied: anthropic/claude-sonnet-4-20250514, openai/gpt-4o
+
+══════════════════════════════════════════════════════════════════
+  Run 2: WITH Session Pinning (X-Session-Id: a1b2c3d4-…)
+──────────────────────────────────────────────────────────────────
+  Step 1: Design a REST API schema…        → anthropic/claude-sonnet-4-20250514  (pinned=false)
+  Step 2: Analyze SQL vs NoSQL…            → anthropic/claude-sonnet-4-20250514  (pinned=true)
+  Step 3: Write SQLAlchemy models…         → anthropic/claude-sonnet-4-20250514  (pinned=true)
+  Step 4: Review API for security…         → anthropic/claude-sonnet-4-20250514  (pinned=true)
+  Step 5: Implement JWT auth…              → anthropic/claude-sonnet-4-20250514  (pinned=true)
+
+  ✓ All 5 steps routed to anthropic/claude-sonnet-4-20250514
+```
+
+---
+
+## Configuration
+
+Session pinning can be tuned with an optional `routing` block in `config.yaml` (the demo's `config.yaml` does not set it):
+
+```yaml
+routing:
+  session_ttl_seconds: 600      # How long a pinned session lasts (default: 10 min)
+  session_max_entries: 10000    # Max cached sessions before LRU eviction
+```
+
+Without the `X-Session-Id` header, routing runs fresh every time — no breaking change to existing clients.
+
+---
+
+## See Also
+
+- [Model Routing Service Demo](../model_routing_service/) — curl-based examples of the routing endpoint and session pinning
diff --git a/demos/llm_routing/session_pinning/config.yaml b/demos/llm_routing/session_pinning/config.yaml
new file mode 100644
index 00000000..7b98b25b
--- /dev/null
+++ b/demos/llm_routing/session_pinning/config.yaml
@@ -0,0 +1,27 @@
+version: v0.3.0
+
+listeners:
+  - type: model
+    name: model_listener
+    port: 12000
+
+model_providers:
+
+  - model: openai/gpt-4o-mini
+    access_key: $OPENAI_API_KEY
+    default: true
+
+  - model: openai/gpt-4o
+    access_key: $OPENAI_API_KEY
+    routing_preferences:
+      - name: complex_reasoning
+        description: complex reasoning tasks, multi-step analysis, or detailed explanations
+
+  - model: anthropic/claude-sonnet-4-20250514
+    access_key: $ANTHROPIC_API_KEY
+    routing_preferences:
+      - name: code_generation
+        description: generating new code, writing functions, or creating boilerplate
+
+tracing:
+  random_sampling: 100
diff --git a/demos/llm_routing/session_pinning/demo.py b/demos/llm_routing/session_pinning/demo.py
new file mode 100644
index 00000000..7117c53e
--- /dev/null
+++ b/demos/llm_routing/session_pinning/demo.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+"""
+Session Pinning Demo — Iterative Research Agent
+
+Demonstrates how session pinning ensures consistent model selection
+across multiple iterations of an agentic loop. Runs the same 5-step
+research workflow twice:
+
+  1) Without session pinning — models may switch between iterations
+  2) With session pinning    — first iteration pins the model for all subsequent ones
+
+Uses the /routing/v1/chat/completions endpoint (routing decisions only, no LLM calls).
+"""
+
+import json
+import os
+import urllib.request
+import uuid
+
+PLANO_URL = os.environ.get("PLANO_URL", "http://localhost:12000")  # PLANO_URL env var overrides the default
+
+# Prompts for the simulated agent, which builds a task management app step by step.
+# They deliberately alternate between the code_generation and complex_reasoning
+# preferences from config.yaml so that, without pinning, the model can change per step.
+RESEARCH_STEPS = [
+    "Design a REST API schema for a task management app with users, projects, and tasks",
+    "Analyze the trade-offs between SQL and NoSQL databases for this task management system",
+    "Write the database models and ORM setup in Python using SQLAlchemy",
+    "Review the API design for security vulnerabilities and suggest improvements",
+    "Implement the authentication middleware with JWT tokens",
+]
+
+
+def run_research_loop(session_id=None):
+    """Request a routing decision for every prompt in RESEARCH_STEPS.
+
+    The X-Session-Id header is sent only when session_id is truthy. Each
+    decision is printed, and a list of {"step", "model", "route", "pinned"}
+    dicts (one per step) is returned. HTTP or JSON errors propagate.
+    """
+    # The URL and headers are the same for every step, so build them once.
+    url = f"{PLANO_URL}/routing/v1/chat/completions"
+    headers = {"Content-Type": "application/json"}
+    if session_id:
+        headers["X-Session-Id"] = session_id
+
+    results = []
+    for i, prompt in enumerate(RESEARCH_STEPS, 1):
+        payload = {
+            "model": "gpt-4o-mini",
+            "messages": [{"role": "user", "content": prompt}],
+        }
+        request = urllib.request.Request(
+            url, data=json.dumps(payload).encode(), headers=headers
+        )
+        # The context manager closes the connection even if reading fails.
+        with urllib.request.urlopen(request, timeout=10) as resp:
+            data = json.loads(resp.read())
+
+        model = data.get("model", "unknown")
+        route = data.get("route") or "none"
+        # "pinned" is shown only when the response carries a non-null value.
+        pinned = data.get("pinned")
+        pinned_str = "" if pinned is None else f"  pinned={pinned}"
+
+        print(f"  Step {i}: {prompt[:60]:<60s}")
+        print(f"          → model={model}  route={route}{pinned_str}")
+        print()
+
+        results.append({"step": i, "model": model, "route": route, "pinned": pinned})
+
+    return results
+
+
+def print_summary(label, results):
+    """Print one line saying whether every step in results used the same model."""
+    # Sorted so the "varied" message lists models in a stable order across runs.
+    unique = sorted({r["model"] for r in results})
+    if len(unique) == 1:
+        print(f"  ✓ {label}: All {len(results)} steps routed to {unique[0]}")
+    else:
+        print(f"  ✗ {label}: Models varied across steps — {', '.join(unique)}")
+
+
+def main():
+    """Run the workflow twice and print a consistency summary for each run.
+
+    The first run sends no session ID; the second sends a freshly generated
+    UUID as the session ID on every step.
+    """
+    heavy_rule, light_rule = "=" * 70, "-" * 70
+
+    print(heavy_rule)
+    print("  Iterative Research Agent — Session Pinning Demo")
+    print(heavy_rule)
+    print()
+    print("An agent is building a task management app in 5 iterative steps.")
+    print("Each step hits Plano's routing endpoint to pick the best model.")
+    print()
+
+    # Run 1: no session ID.
+    title = "  Run 1: WITHOUT Session Pinning (no X-Session-Id header)"
+    print(light_rule, title, light_rule, "", sep="\n")
+    results_no_pin = run_research_loop(session_id=None)
+
+    # Run 2: one random session ID shared by all steps.
+    session_id = str(uuid.uuid4())
+    title = f"  Run 2: WITH Session Pinning (X-Session-Id: {session_id})"
+    print(light_rule, title, light_rule, "", sep="\n")
+    results_pinned = run_research_loop(session_id=session_id)
+
+    # Summary: one verdict line per run.
+    print(heavy_rule, "  Summary", heavy_rule, "", sep="\n")
+    print_summary("Without pinning", results_no_pin)
+    print_summary("With pinning   ", results_pinned)
+    print()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/demos/llm_routing/session_pinning/demo.sh b/demos/llm_routing/session_pinning/demo.sh
new file mode 100755
index 00000000..882144b0
--- /dev/null
+++ b/demos/llm_routing/session_pinning/demo.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Launch demo.py from this script's own directory; PLANO_URL defaults to localhost:12000.
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+: "${PLANO_URL:=http://localhost:12000}"
+export PLANO_URL
+
+printf '%s\n' "Running session pinning demo..." "PLANO_URL=$PLANO_URL" ""
+
+python3 "$SCRIPT_DIR/demo.py"