plano/demos/llm_routing/session_pinning/demo.py

175 lines
6.8 KiB
Python
Raw Normal View History

#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = ["httpx>=0.27"]
# ///
"""
Session Pinning Demo — Research Agent client
Sends the same query to the Research Agent twice — once without a session ID
and once with one — and compares the routing traces to show how session pinning
keeps the model consistent across the LLM's tool-calling loop.
Requires the agent to already be running (start it with ./start_agents.sh).
Usage:
    uv run demo.py
    AGENT_URL=http://localhost:8000 uv run demo.py
"""
import asyncio
import os
import uuid
import httpx
# Base URL of the Research Agent; override it with the AGENT_URL environment
# variable (defaults to a local agent on port 8000).
AGENT_URL = os.environ.get("AGENT_URL", "http://localhost:8000")
# The single question sent, unchanged, in both the unpinned and pinned runs.
QUERY = (
    "Should we use PostgreSQL or MongoDB for a high-traffic e-commerce backend "
    "that needs strong consistency for orders but flexible schemas for products?"
)
# ---------------------------------------------------------------------------
# Client helpers
# ---------------------------------------------------------------------------
async def wait_for_agent(timeout: int = 30) -> bool:
    """Poll the agent's ``/health`` endpoint until it answers 200 or time runs out.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        ``True`` as soon as a health check returns HTTP 200, ``False`` if the
        deadline passes first.
    """
    loop = asyncio.get_running_loop()
    # Track a real deadline instead of counting attempts: a probe that hangs
    # until its own 1 s timeout (plus the 0.5 s pause) would otherwise stretch
    # the total wait to roughly three times ``timeout``.
    deadline = loop.time() + timeout
    async with httpx.AsyncClient() as client:
        while (remaining := deadline - loop.time()) > 0:
            try:
                r = await client.get(
                    f"{AGENT_URL}/health",
                    timeout=min(1.0, remaining),
                )
                if r.status_code == 200:
                    return True
            except httpx.HTTPError:
                # Agent is not reachable yet (connection refused, timed out,
                # dropped, ...). Keep polling; anything else is a real bug and
                # should surface instead of being swallowed.
                pass
            # Pause between probes, but never sleep past the deadline.
            await asyncio.sleep(max(0.0, min(0.5, deadline - loop.time())))
    return False
2026-03-25 23:15:00 -07:00
async def ask_agent(query: str, session_id: str | None = None) -> dict:
    """Post *query* to the agent's chat-completions endpoint and return the JSON body.

    Args:
        query: Text sent as the content of a single ``user`` message.
        session_id: When truthy, sent as the ``X-Session-Id`` request header;
            otherwise no session header is sent.

    Returns:
        The decoded JSON response.

    Raises:
        httpx.HTTPStatusError: If the agent replies with an error status.
    """
    extra_headers: dict[str, str] = (
        {"X-Session-Id": session_id} if session_id else {}
    )
    payload = {"messages": [{"role": "user", "content": query}]}
    endpoint = f"{AGENT_URL}/v1/chat/completions"

    async with httpx.AsyncClient(timeout=120.0) as http:
        response = await http.post(endpoint, headers=extra_headers, json=payload)
        response.raise_for_status()
        return response.json()
2026-03-25 23:15:00 -07:00
# ---------------------------------------------------------------------------
# Display helpers
# ---------------------------------------------------------------------------
2026-03-25 23:15:00 -07:00
def _short(model: str) -> str:
return model.split("/")[-1] if "/" in model else model
2026-03-25 23:15:00 -07:00
def _print_trace(result: dict) -> None:
    """Print one line per entry of ``result["routing_trace"]``.

    Each line shows the entry's task and shortened model name; a
    "← switched" marker is appended when the model differs from the
    previous entry's. Prints a placeholder when the trace is missing or empty.
    """
    entries = result.get("routing_trace", [])
    if not entries:
        print(" (no trace)")
        return

    previous: str | None = None
    for entry in entries:
        current = entry["model"]
        display_name = _short(current)
        marker = ""
        if previous and current != previous:
            marker = " ← switched"
        previous = current
        print(f" {entry['task']:<26} [{display_name}]{marker}")
def _print_summary(label: str, result: dict) -> None:
    """Print a one-line verdict on model consistency across the routing trace.

    Reports either the single model used for every turn, or how many times
    the model changed between consecutive turns plus the sorted set of
    models involved. Prints a "no routing data" line when the trace is
    missing or empty.
    """
    routed = [step["model"] for step in result.get("routing_trace", [])]
    if not routed:
        print(f" ? {label}: no routing data")
        return

    distinct = set(routed)
    if len(distinct) != 1:
        # Count transitions between consecutive turns.
        switch_count = 0
        for earlier, later in zip(routed, routed[1:]):
            if earlier != later:
                switch_count += 1
        names = ", ".join(sorted(_short(m) for m in distinct))
        print(f"{label}: model switched {switch_count} time(s) — {names}")
        return

    (only_model,) = distinct
    print(f"{label}: {_short(only_model)} for all {len(routed)} turns")
# ---------------------------------------------------------------------------
# Demo
# ---------------------------------------------------------------------------
def _print_run(title: str, label: str, result: dict) -> None:
    """Print one run's banner, its per-turn routing trace, and its summary line."""
    rule = " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    print(rule)
    print(title)
    print(rule)
    print()
    print(" LLM turns inside the agent loop:")
    print()
    _print_trace(result)
    print()
    _print_summary(label, result)
    print()


def _final_answer(result: dict) -> str | None:
    """Return the first choice's message text, or ``None`` if it is absent or not text."""
    try:
        content = result["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        return None
    return content if isinstance(content, str) else None


async def main() -> None:
    """Run the demo: query the agent with and without a session ID and compare.

    Prints an intro banner, waits for the agent's health check (returning
    early with a failure message if it never comes up), sends the same query
    twice concurrently — once without and once with a freshly generated
    session ID — and then prints each run's routing trace and summary,
    followed by the answer from the pinned run. Errors raised by
    ``ask_agent`` propagate to the caller.
    """
    print()
    print(" ╔══════════════════════════════════════════════════════════════╗")
    print(" ║ Session Pinning Demo — Research Agent ║")
    print(" ╚══════════════════════════════════════════════════════════════╝")
    print()
    print(f" Agent : {AGENT_URL}")
    print(f" Query : \"{QUERY[:72]}\"")
    print()
    print(" The agent uses a tool-calling loop (get_db_benchmarks,")
    print(" get_case_studies, check_feature_support) to research the")
    print(" question. Each LLM turn hits Plano's preference-based router.")
    print()
    # Keep the cursor on this line so the outcome is appended right after it.
    print(f" Waiting for agent at {AGENT_URL}", end=" ", flush=True)
    if not await wait_for_agent():
        print("FAILED — agent did not respond within 30 s")
        return
    print("ready.")
    print()
    sid = str(uuid.uuid4())
    print(" Sending queries (running concurrently)…")
    print()
    without, with_pin = await asyncio.gather(
        ask_agent(QUERY, session_id=None),
        ask_agent(QUERY, session_id=sid),
    )
    # ── Run 1 ────────────────────────────────────────────────────────────
    _print_run(" Run 1: WITHOUT Session Pinning", "Without pinning", without)
    # ── Run 2 ────────────────────────────────────────────────────────────
    _print_run(
        f" Run 2: WITH Session Pinning (X-Session-Id: {sid[:8]}…)",
        "With pinning ",
        with_pin,
    )
    # ── Final answer ─────────────────────────────────────────────────────
    # The content can legitimately be null or missing (e.g. the last message
    # is a tool call), so do not assume it is a string.
    answer = _final_answer(with_pin)
    print(" ══ Agent recommendation (pinned session) ═════════════════════")
    print()
    if answer is None:
        print(" (no answer returned)")
    else:
        for line in answer.splitlines():
            print(f" {line}")
    print()
    print(" ══════════════════════════════════════════════════════════════")
    print()
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())