mirror of
https://github.com/katanemo/plano.git
synced 2026-05-07 06:42:42 +02:00
rename session pinning to model affinity with x-model-affinity header
This commit is contained in:
parent
5789694d2f
commit
da9792c2dd
14 changed files with 468 additions and 371 deletions
|
|
@ -1,8 +1,6 @@
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use common::configuration::{FilterPipeline, ModelAlias};
|
use common::configuration::{FilterPipeline, ModelAlias};
|
||||||
use common::consts::{
|
use common::consts::{ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER, MODEL_AFFINITY_HEADER};
|
||||||
ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER, ROUTING_SESSION_ID_HEADER,
|
|
||||||
};
|
|
||||||
use common::llm_providers::LlmProviders;
|
use common::llm_providers::LlmProviders;
|
||||||
use hermesllm::apis::openai::Message;
|
use hermesllm::apis::openai::Message;
|
||||||
use hermesllm::apis::openai_responses::InputParam;
|
use hermesllm::apis::openai_responses::InputParam;
|
||||||
|
|
@ -98,7 +96,7 @@ async fn llm_chat_inner(
|
||||||
|
|
||||||
// Session pinning: extract session ID and check cache before routing
|
// Model affinity: extract the affinity ID (kept in `session_id`) and check the cache before routing
|
||||||
let session_id: Option<String> = request_headers
|
let session_id: Option<String> = request_headers
|
||||||
.get(ROUTING_SESSION_ID_HEADER)
|
.get(MODEL_AFFINITY_HEADER)
|
||||||
.and_then(|h| h.to_str().ok())
|
.and_then(|h| h.to_str().ok())
|
||||||
.map(|s| s.to_string());
|
.map(|s| s.to_string());
|
||||||
let pinned_model: Option<String> = if let Some(ref sid) = session_id {
|
let pinned_model: Option<String> = if let Some(ref sid) = session_id {
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use common::configuration::{SpanAttributes, TopLevelRoutingPreference};
|
use common::configuration::{SpanAttributes, TopLevelRoutingPreference};
|
||||||
use common::consts::{REQUEST_ID_HEADER, ROUTING_SESSION_ID_HEADER};
|
use common::consts::{MODEL_AFFINITY_HEADER, REQUEST_ID_HEADER};
|
||||||
use common::errors::BrightStaffError;
|
use common::errors::BrightStaffError;
|
||||||
use hermesllm::clients::SupportedAPIsFromClient;
|
use hermesllm::clients::SupportedAPIsFromClient;
|
||||||
use hermesllm::ProviderRequestType;
|
use hermesllm::ProviderRequestType;
|
||||||
|
|
@ -72,7 +72,7 @@ pub async fn routing_decision(
|
||||||
.unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
|
.unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
|
||||||
|
|
||||||
let session_id: Option<String> = request_headers
|
let session_id: Option<String> = request_headers
|
||||||
.get(ROUTING_SESSION_ID_HEADER)
|
.get(MODEL_AFFINITY_HEADER)
|
||||||
.and_then(|h| h.to_str().ok())
|
.and_then(|h| h.to_str().ok())
|
||||||
.map(|s| s.to_string());
|
.map(|s| s.to_string());
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,7 @@ pub const X_ARCH_TOOL_CALL: &str = "x-arch-tool-call-message";
|
||||||
pub const X_ARCH_FC_MODEL_RESPONSE: &str = "x-arch-fc-model-response";
|
pub const X_ARCH_FC_MODEL_RESPONSE: &str = "x-arch-fc-model-response";
|
||||||
pub const ARCH_FC_MODEL_NAME: &str = "Arch-Function";
|
pub const ARCH_FC_MODEL_NAME: &str = "Arch-Function";
|
||||||
pub const REQUEST_ID_HEADER: &str = "x-request-id";
|
pub const REQUEST_ID_HEADER: &str = "x-request-id";
|
||||||
pub const ROUTING_SESSION_ID_HEADER: &str = "x-routing-session-id";
|
pub const MODEL_AFFINITY_HEADER: &str = "x-model-affinity";
|
||||||
pub const ENVOY_ORIGINAL_PATH_HEADER: &str = "x-envoy-original-path";
|
pub const ENVOY_ORIGINAL_PATH_HEADER: &str = "x-envoy-original-path";
|
||||||
pub const TRACE_PARENT_HEADER: &str = "traceparent";
|
pub const TRACE_PARENT_HEADER: &str = "traceparent";
|
||||||
pub const ARCH_INTERNAL_CLUSTER_NAME: &str = "arch_internal";
|
pub const ARCH_INTERNAL_CLUSTER_NAME: &str = "arch_internal";
|
||||||
|
|
|
||||||
135
demos/llm_routing/model_affinity/README.md
Normal file
135
demos/llm_routing/model_affinity/README.md
Normal file
|
|
@ -0,0 +1,135 @@
|
||||||
|
# Model Affinity Demo
|
||||||
|
|
||||||
|
> Consistent model selection for agentic loops using `X-Model-Affinity`.
|
||||||
|
|
||||||
|
## Why Model Affinity?
|
||||||
|
|
||||||
|
When an agent runs in a loop — calling tools, reasoning about results, calling more tools — each LLM request hits Plano's router independently. Because prompts vary in intent (tool selection looks like code generation, reasoning about results looks like complex analysis), the router may select **different models** for each turn, fragmenting context mid-session.
|
||||||
|
|
||||||
|
**Model affinity** solves this: send an `X-Model-Affinity` header and the first request runs routing as usual, caching the decision. Every subsequent request with the same affinity ID returns the **same model**, without re-running the router.
|
||||||
|
|
||||||
|
```
|
||||||
|
Without affinity With affinity (X-Model-Affinity)
|
||||||
|
──────────────── ───────────────────────────────
|
||||||
|
Turn 1 → claude-sonnet (tool calls) Turn 1 → claude-sonnet ← routed
|
||||||
|
Turn 2 → gpt-4o (reasoning) Turn 2 → claude-sonnet ← pinned ✓
|
||||||
|
Turn 3 → claude-sonnet (tool calls) Turn 3 → claude-sonnet ← pinned ✓
|
||||||
|
Turn 4 → gpt-4o (reasoning) Turn 4 → claude-sonnet ← pinned ✓
|
||||||
|
Turn 5 → claude-sonnet (final answer) Turn 5 → claude-sonnet ← pinned ✓
|
||||||
|
↑ model switches every turn ↑ one model, start to finish
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Set API keys
|
||||||
|
export OPENAI_API_KEY=<your-key>
|
||||||
|
export ANTHROPIC_API_KEY=<your-key>
|
||||||
|
|
||||||
|
# 2. Start Plano
|
||||||
|
cd demos/llm_routing/model_affinity
|
||||||
|
planoai up config.yaml
|
||||||
|
|
||||||
|
# 3. Run the demo (uv manages dependencies automatically)
|
||||||
|
./demo.sh # or: uv run demo.py
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## What the Demo Does
|
||||||
|
|
||||||
|
A **database selection agent** investigates whether to use PostgreSQL or MongoDB
|
||||||
|
for an e-commerce platform. It runs a real tool-calling loop: the LLM decides
|
||||||
|
which tools to call, receives simulated results, and continues until it has
|
||||||
|
enough data to recommend a database.
|
||||||
|
|
||||||
|
Available tools:
|
||||||
|
- `get_db_benchmarks` — fetch performance data for a workload type
|
||||||
|
- `get_case_studies` — retrieve real-world e-commerce case studies
|
||||||
|
- `check_feature_support` — check if a database supports a specific feature
|
||||||
|
|
||||||
|
The demo runs the **same agent loop twice**:
|
||||||
|
|
||||||
|
1. **Without affinity** — no `X-Model-Affinity`; models may switch between turns
|
||||||
|
2. **With affinity** — `X-Model-Affinity` header included; model is pinned from turn 1
|
||||||
|
|
||||||
|
Each turn is a separate `POST /v1/chat/completions` request to Plano using the
|
||||||
|
[OpenAI SDK](https://github.com/openai/openai-python). The demo prints the
|
||||||
|
model used on each turn so you can see the difference.
|
||||||
|
|
||||||
|
### Expected Output
|
||||||
|
|
||||||
|
```
|
||||||
|
Run 1: WITHOUT Model Affinity
|
||||||
|
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
||||||
|
turn 1 [claude-sonnet-4-20250514 ] get_db_benchmarks, get_db_benchmarks
|
||||||
|
turn 2 [gpt-4o ] get_case_studies, get_case_studies ← switched
|
||||||
|
turn 3 [claude-sonnet-4-20250514 ] check_feature_support ← switched
|
||||||
|
turn 4 [gpt-4o ] final answer ← switched
|
||||||
|
|
||||||
|
✗ Without affinity: model switched 3 time(s)
|
||||||
|
|
||||||
|
|
||||||
|
Run 2: WITH Model Affinity (X-Model-Affinity: a1b2c3d4…)
|
||||||
|
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
||||||
|
turn 1 [claude-sonnet-4-20250514 ] get_db_benchmarks, get_db_benchmarks
|
||||||
|
turn 2 [claude-sonnet-4-20250514 ] get_case_studies, get_case_studies
|
||||||
|
turn 3 [claude-sonnet-4-20250514 ] check_feature_support
|
||||||
|
turn 4 [claude-sonnet-4-20250514 ] final answer
|
||||||
|
|
||||||
|
✓ With affinity: claude-sonnet-4-20250514 for all 4 turns
|
||||||
|
```
|
||||||
|
|
||||||
|
### How It Works
|
||||||
|
|
||||||
|
Model affinity is implemented in brightstaff. When `X-Model-Affinity` is present:
|
||||||
|
|
||||||
|
1. **First request** — routing runs normally, result is cached keyed by the affinity ID
|
||||||
|
2. **Subsequent requests** — cache hit skips routing and returns the cached model instantly
|
||||||
|
|
||||||
|
The `X-Model-Affinity` header is forwarded transparently; your OpenAI SDK
|
||||||
|
calls need no changes beyond adding the header.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from openai import OpenAI
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY")
|
||||||
|
|
||||||
|
affinity_id = str(uuid.uuid4())
|
||||||
|
|
||||||
|
response = client.chat.completions.create(
|
||||||
|
model="gpt-4o-mini",
|
||||||
|
messages=[{"role": "user", "content": prompt}],
|
||||||
|
extra_headers={"X-Model-Affinity": affinity_id},
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
Model affinity is configurable in `config.yaml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
routing:
|
||||||
|
session_ttl_seconds: 600 # How long affinity lasts (default: 10 min)
|
||||||
|
session_max_entries: 10000 # Max cached sessions (upper limit: 10000)
|
||||||
|
```
|
||||||
|
|
||||||
|
Without the `X-Model-Affinity` header, routing runs fresh every time — no breaking
|
||||||
|
change to existing clients.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Advanced: Agent Server Demo
|
||||||
|
|
||||||
|
The `agent.py` file is a FastAPI-based agent server that demonstrates a more
|
||||||
|
complex pattern: an external agent service that forwards `X-Model-Affinity`
|
||||||
|
on all outbound calls to Plano. Use `start_agents.sh` to run it.
|
||||||
|
|
||||||
|
## See Also
|
||||||
|
|
||||||
|
- [Model Routing Service Demo](../model_routing_service/) — curl-based examples of the routing endpoint
|
||||||
|
|
@ -11,10 +11,9 @@ each with its own tool-calling loop. The tasks deliberately alternate between
|
||||||
code_generation and complex_reasoning intents so Plano's preference-based
|
code_generation and complex_reasoning intents so Plano's preference-based
|
||||||
router selects different models for each task.
|
router selects different models for each task.
|
||||||
|
|
||||||
If the client sends X-Routing-Session-Id, the agent forwards it on every
|
If the client sends X-Model-Affinity, the agent forwards it on every outbound
|
||||||
outbound call to Plano. The first task pins the model; all subsequent tasks
|
call to Plano. The first task pins the model; all subsequent tasks skip the
|
||||||
skip the router and reuse it — keeping the whole session on one consistent
|
router and reuse it — keeping the whole session on one consistent model.
|
||||||
model.
|
|
||||||
|
|
||||||
Run standalone:
|
Run standalone:
|
||||||
uv run agent.py
|
uv run agent.py
|
||||||
|
|
@ -310,12 +309,12 @@ async def run_task(
|
||||||
|
|
||||||
Each task is an independent conversation so the router sees only
|
Each task is an independent conversation so the router sees only
|
||||||
this task's intent — not the accumulated context of previous tasks.
|
this task's intent — not the accumulated context of previous tasks.
|
||||||
Session pinning via X-Routing-Session-Id pins the model from the first
|
Model affinity via X-Model-Affinity pins the model from the first task
|
||||||
task onward, so all tasks stay on the same model.
|
onward, so all tasks stay on the same model.
|
||||||
|
|
||||||
Returns (answer, first_model_used).
|
Returns (answer, first_model_used).
|
||||||
"""
|
"""
|
||||||
headers = {"X-Routing-Session-Id": session_id} if session_id else {}
|
headers = {"X-Model-Affinity": session_id} if session_id else {}
|
||||||
messages: list[ChatCompletionMessageParam] = [
|
messages: list[ChatCompletionMessageParam] = [
|
||||||
{"role": "system", "content": SYSTEM_PROMPT},
|
{"role": "system", "content": SYSTEM_PROMPT},
|
||||||
{"role": "user", "content": prompt},
|
{"role": "user", "content": prompt},
|
||||||
|
|
@ -392,7 +391,7 @@ app = FastAPI(title="Research Agent", version="1.0.0")
|
||||||
@app.post("/v1/chat/completions")
|
@app.post("/v1/chat/completions")
|
||||||
async def chat(request: Request) -> JSONResponse:
|
async def chat(request: Request) -> JSONResponse:
|
||||||
body = await request.json()
|
body = await request.json()
|
||||||
session_id: str | None = request.headers.get("x-routing-session-id")
|
session_id: str | None = request.headers.get("x-model-affinity")
|
||||||
|
|
||||||
log.info("request session_id=%s", session_id or "none")
|
log.info("request session_id=%s", session_id or "none")
|
||||||
|
|
||||||
307
demos/llm_routing/model_affinity/demo.py
Normal file
307
demos/llm_routing/model_affinity/demo.py
Normal file
|
|
@ -0,0 +1,307 @@
|
||||||
|
#!/usr/bin/env -S uv run --script
|
||||||
|
# /// script
|
||||||
|
# requires-python = ">=3.12"
|
||||||
|
# dependencies = ["openai>=1.0.0"]
|
||||||
|
# ///
|
||||||
|
"""
|
||||||
|
Model Affinity Demo — Agentic Tool-Calling Loop
|
||||||
|
|
||||||
|
Runs the same agentic loop twice through Plano:
|
||||||
|
1. Without model affinity — the router may pick different models per turn
|
||||||
|
2. With model affinity — all turns use the model selected on turn 1
|
||||||
|
|
||||||
|
Each loop is a real tool-calling agent: the LLM decides which tools to call,
|
||||||
|
we provide simulated results, and the LLM continues until it has enough
|
||||||
|
information to produce a final answer. Each turn is a separate request to
|
||||||
|
Plano, so the router classifies intent independently every time.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
planoai up config.yaml # start Plano
|
||||||
|
uv run demo.py # run this demo
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
from openai import AsyncOpenAI
|
||||||
|
from openai.types.chat import ChatCompletionMessageParam
|
||||||
|
|
||||||
|
PLANO_URL = os.environ.get("PLANO_URL", "http://localhost:12000")
|
||||||
|
|
||||||
|
SYSTEM_PROMPT = (
|
||||||
|
"You are a database selection analyst. Use the provided tools to gather "
|
||||||
|
"benchmark data and case studies, then recommend PostgreSQL or MongoDB "
|
||||||
|
"for a high-traffic e-commerce backend. Be concise."
|
||||||
|
)
|
||||||
|
|
||||||
|
USER_QUERY = (
|
||||||
|
"Should we use PostgreSQL or MongoDB for our e-commerce platform? "
|
||||||
|
"We need strong consistency for orders but flexible schemas for products. "
|
||||||
|
"Use the tools to research both options, then give a recommendation."
|
||||||
|
)
|
||||||
|
|
||||||
|
TOOLS = [
|
||||||
|
{
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"name": "get_db_benchmarks",
|
||||||
|
"description": "Fetch performance benchmarks for a database under a given workload.",
|
||||||
|
"parameters": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"database": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["postgresql", "mongodb"],
|
||||||
|
},
|
||||||
|
"workload": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["read_heavy", "write_heavy", "mixed"],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"required": ["database", "workload"],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"name": "get_case_studies",
|
||||||
|
"description": "Retrieve real-world e-commerce case studies for a database.",
|
||||||
|
"parameters": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"database": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["postgresql", "mongodb"],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"required": ["database"],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"name": "check_feature_support",
|
||||||
|
"description": "Check if a database supports a specific feature.",
|
||||||
|
"parameters": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"database": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["postgresql", "mongodb"],
|
||||||
|
},
|
||||||
|
"feature": {"type": "string"},
|
||||||
|
},
|
||||||
|
"required": ["database", "feature"],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
# Simulated tool responses
|
||||||
|
_BENCHMARKS = {
|
||||||
|
("postgresql", "mixed"): {
|
||||||
|
"read_qps": 42000,
|
||||||
|
"write_qps": 21000,
|
||||||
|
"p99_ms": 6,
|
||||||
|
"notes": "Solid all-round; MVCC keeps reads non-blocking",
|
||||||
|
},
|
||||||
|
("mongodb", "mixed"): {
|
||||||
|
"read_qps": 60000,
|
||||||
|
"write_qps": 50000,
|
||||||
|
"p99_ms": 3,
|
||||||
|
"notes": "Flexible schema accelerates feature iteration",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
_CASE_STUDIES = {
|
||||||
|
"postgresql": [
|
||||||
|
{"company": "Shopify", "notes": "Moved orders back to Postgres for ACID"},
|
||||||
|
{
|
||||||
|
"company": "Zalando",
|
||||||
|
"notes": "Postgres + Citus for sharded order processing",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
"mongodb": [
|
||||||
|
{"company": "eBay", "notes": "Product catalogue — flexible attribute schemas"},
|
||||||
|
{"company": "Alibaba", "notes": "Session/cart data — high write throughput"},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
_FEATURES = {
|
||||||
|
("postgresql", "acid transactions"): {"supported": True, "notes": "Full ACID"},
|
||||||
|
("mongodb", "acid transactions"): {
|
||||||
|
"supported": True,
|
||||||
|
"notes": "Multi-doc ACID since v4.0",
|
||||||
|
},
|
||||||
|
("postgresql", "horizontal sharding"): {
|
||||||
|
"supported": True,
|
||||||
|
"notes": "Via Citus extension",
|
||||||
|
},
|
||||||
|
("mongodb", "horizontal sharding"): {
|
||||||
|
"supported": True,
|
||||||
|
"notes": "Native auto-balancing",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def dispatch_tool(name: str, args: dict) -> str:
    """
    Return the simulated result of a tool call as a JSON string.

    The arguments come straight from the model, so they are validated before
    use: an unknown tool, a non-object payload, or a missing required argument
    yields a JSON ``{"error": ...}`` payload for the model to read instead of
    raising and aborting the agent loop.

    Args:
        name: Tool name requested by the model.
        args: Decoded tool arguments (expected to be a JSON object).

    Returns:
        A JSON-encoded string with either the simulated data or an error.
    """
    required_args = {
        "get_db_benchmarks": ("database", "workload"),
        "get_case_studies": ("database",),
        "check_feature_support": ("database", "feature"),
    }
    if name not in required_args:
        return json.dumps({"error": f"unknown tool {name}"})
    if not isinstance(args, dict):
        return json.dumps({"error": "tool arguments must be a JSON object"})
    missing = [param for param in required_args[name] if param not in args]
    if missing:
        return json.dumps({"error": f"missing argument(s): {', '.join(missing)}"})

    # str() keeps the lookup keys hashable and lets .lower() work even if the
    # model sends a non-string value.
    database = str(args["database"])

    if name == "get_db_benchmarks":
        key = (database, str(args["workload"]))
        return json.dumps(_BENCHMARKS.get(key, {"error": f"no data for {key}"}))

    if name == "get_case_studies":
        return json.dumps(_CASE_STUDIES.get(database, {"error": "unknown db"}))

    # check_feature_support: a known feature name only has to appear somewhere
    # inside the (lower-cased) feature text the model asked about.
    key = (database, str(args["feature"]).lower())
    for (known_db, known_feature), info in _FEATURES.items():
        if known_db == key[0] and known_feature in key[1]:
            return json.dumps(info)
    return json.dumps({"error": f"no data for {key}"})
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Agentic loop — runs tool calls until the LLM produces a final answer
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def run_agent_loop(
    affinity_id: str | None = None,
    max_turns: int = 10,
) -> tuple[str, list[dict]]:
    """
    Run a tool-calling agent loop against Plano.

    Each turn is one chat-completions request. When ``affinity_id`` is given it
    is sent as the ``X-Model-Affinity`` header on every request; otherwise no
    extra header is sent.

    Returns (final_answer, trace) where trace is a list of
    {"turn": int, "model": str, "tool_calls": [...]} dicts. If every one of the
    ``max_turns`` requests ends in tool calls, the answer is
    "(max turns reached)".
    """
    client = AsyncOpenAI(base_url=f"{PLANO_URL}/v1", api_key="EMPTY")
    headers = {"X-Model-Affinity": affinity_id} if affinity_id else None

    messages: list[ChatCompletionMessageParam] = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_QUERY},
    ]
    trace: list[dict] = []

    for turn in range(1, max_turns + 1):
        resp = await client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            tools=TOOLS,
            tool_choice="auto",
            max_completion_tokens=800,
            extra_headers=headers,
        )

        choice = resp.choices[0]
        turn_info: dict = {"turn": turn, "model": resp.model}

        if choice.finish_reason == "tool_calls" and choice.message.tool_calls:
            tool_names = [tc.function.name for tc in choice.message.tool_calls]
            turn_info["tool_calls"] = tool_names
            trace.append(turn_info)

            # The assistant message carrying the tool calls must precede the
            # tool results in the conversation history.
            messages.append(choice.message)
            for tc in choice.message.tool_calls:
                try:
                    args = json.loads(tc.function.arguments or "{}")
                except json.JSONDecodeError as exc:
                    # Malformed arguments are reported back to the model as the
                    # tool result rather than aborting the whole run.
                    result = json.dumps(
                        {"error": f"invalid tool arguments: {exc.msg}"}
                    )
                else:
                    result = dispatch_tool(tc.function.name, args)
                messages.append(
                    {"role": "tool", "content": result, "tool_call_id": tc.id}
                )
        else:
            # No tool calls requested: treat this turn as the final answer.
            turn_info["tool_calls"] = []
            trace.append(turn_info)
            return (choice.message.content or "").strip(), trace

    return "(max turns reached)", trace
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Display helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def short_model(model: str) -> str:
    """Return the text after the last "/" in ``model``; ``model`` itself if it has none."""
    _, _, tail = model.rpartition("/")
    return tail
|
||||||
|
|
||||||
|
|
||||||
|
def print_trace(trace: list[dict]) -> None:
    """Print one line per turn: turn number, short model name, and the tools called."""
    for entry in trace:
        model = short_model(entry["model"])
        called = entry["tool_calls"]
        if called:
            tools = ", ".join(called)
        else:
            # A turn without tool calls is the one that produced the answer.
            tools = "final answer"
        print(f" turn {entry['turn']} [{model:<30}] {tools}")
|
||||||
|
|
||||||
|
|
||||||
|
def print_summary(label: str, trace: list[dict]) -> None:
    """Print whether a run stayed on one model or how often it switched models."""
    models = [entry["model"] for entry in trace]
    distinct = set(models)

    if len(distinct) != 1:
        # Count adjacent turns whose model differs.
        switches = sum(1 for prev, cur in zip(models, models[1:]) if prev != cur)
        names = ", ".join(sorted(short_model(m) for m in distinct))
        print(f" ✗ {label}: model switched {switches} time(s) — {names}")
        return

    (only_model,) = distinct
    print(f" ✓ {label}: {short_model(only_model)} for all {len(models)} turns")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Main
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def main() -> None:
    """Run the agent loop without and then with model affinity, printing both traces and the final answer."""
    rule = " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

    intro = [
        "",
        " ╔══════════════════════════════════════════════════════════╗",
        " ║ Model Affinity Demo — Agentic Loop ║",
        " ╚══════════════════════════════════════════════════════════╝",
        "",
        f" Plano : {PLANO_URL}",
        f' Query : "{USER_QUERY[:65]}…"',
        "",
        " The agent calls tools (get_db_benchmarks, get_case_studies,",
        " check_feature_support) across multiple turns. Each turn is",
        " a separate request to Plano — the router classifies intent",
        " independently, so different turns may get different models.",
        "",
    ]
    for text in intro:
        print(text)

    # Run 1: no affinity header is sent.
    print(rule)
    print(" Run 1: WITHOUT Model Affinity")
    print(rule)
    print()
    _, baseline_trace = await run_agent_loop(affinity_id=None)
    print_trace(baseline_trace)
    print()
    print_summary("Without affinity", baseline_trace)
    print()

    # Run 2: a fresh random affinity ID is sent on every request.
    affinity_id = str(uuid.uuid4())
    print(rule)
    print(f" Run 2: WITH Model Affinity (X-Model-Affinity: {affinity_id[:8]}…)")
    print(rule)
    print()
    pinned_answer, pinned_trace = await run_agent_loop(affinity_id=affinity_id)
    print_trace(pinned_trace)
    print()
    print_summary("With affinity ", pinned_trace)
    print()

    # Only the answer from the affinity run is shown.
    print(" ══ Agent recommendation (affinity session) ════════════════")
    print()
    for answer_line in pinned_answer.splitlines():
        print(f" {answer_line}")
    print()
    print(" ═══════════════════════════════════════════════════════════")
    print()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
7
demos/llm_routing/model_affinity/demo.sh
Executable file
7
demos/llm_routing/model_affinity/demo.sh
Executable file
|
|
@ -0,0 +1,7 @@
|
||||||
|
#!/bin/bash
set -e

# Resolve this script's directory so the demo can be started from any cwd.
demo_dir="$(cd "$(dirname "$0")" && pwd)"

# The demo talks to Plano directly; the agent server is not needed.
uv run "${demo_dir}/demo.py"
|
||||||
|
|
@ -108,13 +108,13 @@ The response contains the model list — your client should try `models[0]` firs
|
||||||
|
|
||||||
## Session Pinning
|
## Session Pinning
|
||||||
|
|
||||||
Send an `X-Routing-Session-Id` header to pin the routing decision for a session. Once a model is selected, all subsequent requests with the same session ID return the same model without re-running routing.
|
Send an `X-Model-Affinity` header to pin the routing decision for a session. Once a model is selected, all subsequent requests with the same header value return the same model without re-running routing.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# First call — runs routing, caches result
|
# First call — runs routing, caches result
|
||||||
curl http://localhost:12000/routing/v1/chat/completions \
|
curl http://localhost:12000/routing/v1/chat/completions \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-H "X-Routing-Session-Id: my-session-123" \
|
-H "X-Model-Affinity: my-session-123" \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "gpt-4o-mini",
|
"model": "gpt-4o-mini",
|
||||||
"messages": [{"role": "user", "content": "Write a Python function for binary search"}]
|
"messages": [{"role": "user", "content": "Write a Python function for binary search"}]
|
||||||
|
|
@ -136,7 +136,7 @@ Response (first call):
|
||||||
# Second call — same session, returns cached result
|
# Second call — same session, returns cached result
|
||||||
curl http://localhost:12000/routing/v1/chat/completions \
|
curl http://localhost:12000/routing/v1/chat/completions \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-H "X-Routing-Session-Id: my-session-123" \
|
-H "X-Model-Affinity: my-session-123" \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "gpt-4o-mini",
|
"model": "gpt-4o-mini",
|
||||||
"messages": [{"role": "user", "content": "Now explain merge sort"}]
|
"messages": [{"role": "user", "content": "Now explain merge sort"}]
|
||||||
|
|
@ -161,7 +161,7 @@ routing:
|
||||||
session_max_entries: 10000 # default: 10000
|
session_max_entries: 10000 # default: 10000
|
||||||
```
|
```
|
||||||
|
|
||||||
Without the `X-Routing-Session-Id` header, routing runs fresh every time (no breaking change).
|
Without the `X-Model-Affinity` header, routing runs fresh every time (no breaking change).
|
||||||
|
|
||||||
## Kubernetes Deployment (Self-hosted Arch-Router on GPU)
|
## Kubernetes Deployment (Self-hosted Arch-Router on GPU)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -114,7 +114,7 @@ echo "--- 7. Session pinning - first call (fresh routing decision) ---"
|
||||||
echo ""
|
echo ""
|
||||||
curl -s "$PLANO_URL/routing/v1/chat/completions" \
|
curl -s "$PLANO_URL/routing/v1/chat/completions" \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-H "X-Routing-Session-Id: demo-session-001" \
|
-H "X-Model-Affinity: demo-session-001" \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "gpt-4o-mini",
|
"model": "gpt-4o-mini",
|
||||||
"messages": [
|
"messages": [
|
||||||
|
|
@ -129,7 +129,7 @@ echo " Notice: same model returned with \"pinned\": true, routing was skipped
|
||||||
echo ""
|
echo ""
|
||||||
curl -s "$PLANO_URL/routing/v1/chat/completions" \
|
curl -s "$PLANO_URL/routing/v1/chat/completions" \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-H "X-Routing-Session-Id: demo-session-001" \
|
-H "X-Model-Affinity: demo-session-001" \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "gpt-4o-mini",
|
"model": "gpt-4o-mini",
|
||||||
"messages": [
|
"messages": [
|
||||||
|
|
@ -143,7 +143,7 @@ echo "--- 9. Different session gets its own fresh routing ---"
|
||||||
echo ""
|
echo ""
|
||||||
curl -s "$PLANO_URL/routing/v1/chat/completions" \
|
curl -s "$PLANO_URL/routing/v1/chat/completions" \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-H "X-Routing-Session-Id: demo-session-002" \
|
-H "X-Model-Affinity: demo-session-002" \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "gpt-4o-mini",
|
"model": "gpt-4o-mini",
|
||||||
"messages": [
|
"messages": [
|
||||||
|
|
|
||||||
|
|
@ -1,156 +0,0 @@
|
||||||
# Session Pinning Demo
|
|
||||||
|
|
||||||
> Consistent model selection for agentic loops using `X-Routing-Session-Id`.
|
|
||||||
|
|
||||||
## Why Session Pinning?
|
|
||||||
|
|
||||||
When an agent runs in a loop — research → analyse → implement → evaluate → summarise — each step hits Plano's router independently. Because prompts vary in intent, the router may select **different models** for each step, fragmenting context mid-session.
|
|
||||||
|
|
||||||
**Session pinning** solves this: send an `X-Routing-Session-Id` header and the first request runs routing as usual, caching the decision. Every subsequent request with the same session ID returns the **same model**, without re-running the router.
|
|
||||||
|
|
||||||
```
|
|
||||||
Without pinning With pinning (X-Routing-Session-Id)
|
|
||||||
───────────────── ──────────────────────────
|
|
||||||
Step 1 → claude-sonnet (code_gen) Step 1 → claude-sonnet ← routed
|
|
||||||
Step 2 → gpt-4o (reasoning) Step 2 → claude-sonnet ← pinned ✓
|
|
||||||
Step 3 → claude-sonnet (code_gen) Step 3 → claude-sonnet ← pinned ✓
|
|
||||||
Step 4 → gpt-4o (reasoning) Step 4 → claude-sonnet ← pinned ✓
|
|
||||||
Step 5 → claude-sonnet (code_gen) Step 5 → claude-sonnet ← pinned ✓
|
|
||||||
↑ model switches every step ↑ one model, start to finish
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Quick Start
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# 1. Set API keys
|
|
||||||
export OPENAI_API_KEY=<your-key>
|
|
||||||
export ANTHROPIC_API_KEY=<your-key>
|
|
||||||
|
|
||||||
# 2. Start Plano
|
|
||||||
cd demos/llm_routing/session_pinning
|
|
||||||
planoai up config.yaml
|
|
||||||
|
|
||||||
# 3. Run the demo (uv manages dependencies automatically)
|
|
||||||
./demo.sh # or: uv run demo.py
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## What the Demo Does
|
|
||||||
|
|
||||||
A **Database Research Agent** investigates whether to use PostgreSQL or MongoDB
|
|
||||||
for an e-commerce platform. It runs 5 steps, each building on prior findings via
|
|
||||||
accumulated message history. Steps alternate between `code_generation` and
|
|
||||||
`complex_reasoning` intents so Plano routes to different models without pinning.
|
|
||||||
|
|
||||||
| Step | Task | Intent |
|
|
||||||
|:----:|------|--------|
|
|
||||||
| 1 | List technical requirements | code_generation → claude-sonnet |
|
|
||||||
| 2 | Compare PostgreSQL vs MongoDB | complex_reasoning → gpt-4o |
|
|
||||||
| 3 | Write schema (CREATE TABLE) | code_generation → claude-sonnet |
|
|
||||||
| 4 | Assess scalability trade-offs | complex_reasoning → gpt-4o |
|
|
||||||
| 5 | Write final recommendation report | code_generation → claude-sonnet |
|
|
||||||
|
|
||||||
The demo runs the loop **twice** against `/v1/chat/completions` using the
|
|
||||||
[OpenAI SDK](https://github.com/openai/openai-python):
|
|
||||||
|
|
||||||
1. **Without pinning** — no `X-Routing-Session-Id`; models alternate per step
|
|
||||||
2. **With pinning** — `X-Routing-Session-Id` header included; model is pinned from step 1
|
|
||||||
|
|
||||||
Each step makes real LLM calls. Step 5's report explicitly references findings
|
|
||||||
from earlier steps, demonstrating why coherent context requires a consistent model.
|
|
||||||
|
|
||||||
### Expected Output
|
|
||||||
|
|
||||||
```
|
|
||||||
Run 1: WITHOUT Session Pinning
|
|
||||||
─────────────────────────────────────────────────────────────────────
|
|
||||||
step 1 [claude-sonnet-4-20250514] List requirements
|
|
||||||
"Critical requirements: 1. ACID transactions for order integrity…"
|
|
||||||
|
|
||||||
step 2 [gpt-4o ] Compare databases ← switched
|
|
||||||
"PostgreSQL excels at joins and ACID guarantees…"
|
|
||||||
|
|
||||||
step 3 [claude-sonnet-4-20250514] Write schema ← switched
|
|
||||||
"CREATE TABLE orders (\n id SERIAL PRIMARY KEY…"
|
|
||||||
|
|
||||||
step 4 [gpt-4o ] Assess scalability ← switched
|
|
||||||
"At high write volume, PostgreSQL row-level locking…"
|
|
||||||
|
|
||||||
step 5 [claude-sonnet-4-20250514] Write report ← switched
|
|
||||||
"RECOMMENDATION: PostgreSQL is the right choice…"
|
|
||||||
|
|
||||||
✗ Without pinning: model switched 4 time(s) — gpt-4o, claude-sonnet-4-20250514
|
|
||||||
|
|
||||||
|
|
||||||
Run 2: WITH Session Pinning (X-Routing-Session-Id: a1b2c3d4…)
|
|
||||||
─────────────────────────────────────────────────────────────────────
|
|
||||||
step 1 [claude-sonnet-4-20250514] List requirements
|
|
||||||
"Critical requirements: 1. ACID transactions for order integrity…"
|
|
||||||
|
|
||||||
step 2 [claude-sonnet-4-20250514] Compare databases
|
|
||||||
"Building on the requirements I just outlined: PostgreSQL…"
|
|
||||||
|
|
||||||
step 3 [claude-sonnet-4-20250514] Write schema
|
|
||||||
"Following the comparison above, here is the PostgreSQL schema…"
|
|
||||||
|
|
||||||
step 4 [claude-sonnet-4-20250514] Assess scalability
|
|
||||||
"Given the schema I designed, PostgreSQL's row-level locking…"
|
|
||||||
|
|
||||||
step 5 [claude-sonnet-4-20250514] Write report
|
|
||||||
"RECOMMENDATION: Based on my analysis of requirements, comparison…"
|
|
||||||
|
|
||||||
✓ With pinning: claude-sonnet-4-20250514 held for all 5 steps
|
|
||||||
|
|
||||||
══ Final Report (pinned session) ═════════════════════════════════════
|
|
||||||
RECOMMENDATION: Based on my analysis of requirements, the head-to-head
|
|
||||||
comparison, the schema I designed, and the scalability trade-offs…
|
|
||||||
══════════════════════════════════════════════════════════════════════
|
|
||||||
```
|
|
||||||
|
|
||||||
### How It Works
|
|
||||||
|
|
||||||
Session pinning is implemented in brightstaff. When `X-Routing-Session-Id` is present:
|
|
||||||
|
|
||||||
1. **First request** — routing runs normally, result is cached keyed by session ID
|
|
||||||
2. **Subsequent requests** — cache hit skips routing and returns the cached model instantly
|
|
||||||
|
|
||||||
The `X-Routing-Session-Id` header is forwarded transparently; your OpenAI
SDK calls need no changes beyond adding the header.
|
|
||||||
|
|
||||||
```python
|
|
||||||
import uuid

from openai import OpenAI
|
|
||||||
|
|
||||||
client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY")
|
|
||||||
|
|
||||||
session_id = str(uuid.uuid4())
|
|
||||||
|
|
||||||
response = client.chat.completions.create(
|
|
||||||
model="gpt-4o-mini",
|
|
||||||
messages=[{"role": "user", "content": prompt}],
|
|
||||||
extra_headers={"X-Routing-Session-Id": session_id}, # pin the session
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Configuration
|
|
||||||
|
|
||||||
Session pinning is configurable in `config.yaml`:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
routing:
|
|
||||||
session_ttl_seconds: 600 # How long a pinned session lasts (default: 10 min)
|
|
||||||
session_max_entries: 10000 # Max cached sessions before LRU eviction
|
|
||||||
```
|
|
||||||
|
|
||||||
Without the `X-Routing-Session-Id` header, routing runs fresh every time — no breaking
|
|
||||||
change to existing clients.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## See Also
|
|
||||||
|
|
||||||
- [Model Routing Service Demo](../model_routing_service/) — curl-based examples of the routing endpoint
|
|
||||||
|
|
@ -1,174 +0,0 @@
|
||||||
#!/usr/bin/env -S uv run --script
|
|
||||||
# /// script
|
|
||||||
# requires-python = ">=3.12"
|
|
||||||
# dependencies = ["httpx>=0.27"]
|
|
||||||
# ///
|
|
||||||
"""
|
|
||||||
Session Pinning Demo — Research Agent client
|
|
||||||
|
|
||||||
Sends the same query to the Research Agent twice — once without a session ID
|
|
||||||
and once with one — and compares the routing trace to show how session pinning
|
|
||||||
keeps the model consistent across the LLM's tool-calling loop.
|
|
||||||
|
|
||||||
Requires the agent to already be running (start it with ./start_agents.sh).
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
uv run demo.py
|
|
||||||
AGENT_URL=http://localhost:8000 uv run demo.py
|
|
||||||
"""
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import os
|
|
||||||
import uuid
|
|
||||||
|
|
||||||
import httpx
|
|
||||||
|
|
||||||
# Base URL of the Research Agent; override with the AGENT_URL environment variable.
AGENT_URL = os.getenv("AGENT_URL", "http://localhost:8000")

# The single question sent to the agent in both the unpinned and pinned runs.
QUERY = " ".join(
    [
        "Should we use PostgreSQL or MongoDB for a high-traffic e-commerce backend",
        "that needs strong consistency for orders but flexible schemas for products?",
    ]
)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Client helpers
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
async def wait_for_agent(timeout: int = 30) -> bool:
    """Poll the agent's ``/health`` endpoint until it answers 200 or time runs out.

    Args:
        timeout: Maximum number of seconds to keep polling. A value of 0 (or
            less) returns ``False`` without probing.

    Returns:
        ``True`` as soon as a health probe returns HTTP 200, ``False`` if the
        deadline passes first.
    """
    loop = asyncio.get_running_loop()
    # Use a wall-clock deadline rather than an iteration count: each probe can
    # itself take up to a second, so counting iterations lets the total wait
    # grow well beyond ``timeout``.
    deadline = loop.time() + timeout

    async with httpx.AsyncClient() as client:
        while True:
            remaining = deadline - loop.time()
            if remaining <= 0:
                return False
            try:
                r = await client.get(
                    f"{AGENT_URL}/health", timeout=min(1.0, remaining)
                )
                if r.status_code == 200:
                    return True
            except Exception:
                # Best-effort probe: the agent is expected to refuse
                # connections while it is still starting up.
                pass
            # Never sleep past the deadline.
            await asyncio.sleep(min(0.5, max(0.0, deadline - loop.time())))
|
|
||||||
|
|
||||||
|
|
||||||
async def ask_agent(query: str, session_id: str | None = None) -> dict:
    """POST a single user query to the agent's chat-completions endpoint.

    Args:
        query: Text sent as the only user message.
        session_id: When truthy, sent as the ``X-Routing-Session-Id`` header;
            otherwise no extra header is sent.

    Returns:
        The decoded JSON body of the response.

    Raises:
        httpx.HTTPStatusError: If the agent answers with a 4xx/5xx status.
    """
    extra_headers: dict[str, str] = (
        {"X-Routing-Session-Id": session_id} if session_id else {}
    )
    payload = {"messages": [{"role": "user", "content": query}]}

    async with httpx.AsyncClient(timeout=120.0) as http:
        response = await http.post(
            f"{AGENT_URL}/v1/chat/completions",
            headers=extra_headers,
            json=payload,
        )
        response.raise_for_status()
        return response.json()
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Display helpers
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def _short(model: str) -> str:
|
|
||||||
return model.split("/")[-1] if "/" in model else model
|
|
||||||
|
|
||||||
|
|
||||||
def _print_trace(result: dict) -> None:
    """Print one line per ``routing_trace`` entry, marking model changes.

    Each entry is read for its ``"task"`` and ``"model"`` keys. An entry is
    tagged ``← switched`` when its model differs from the previous entry's
    (non-empty) model. Prints a placeholder when the trace is missing or empty.
    """
    entries = result.get("routing_trace", [])
    if not entries:
        print(" (no trace)")
        return

    last_model: str | None = None
    for entry in entries:
        model = entry["model"]
        short_name = _short(model)
        changed = bool(last_model) and model != last_model
        marker = " ← switched" if changed else ""
        last_model = model
        print(f" {entry['task']:<26} [{short_name}]{marker}")
|
|
||||||
|
|
||||||
|
|
||||||
def _print_summary(label: str, result: dict) -> None:
    """Print a one-line verdict on whether the model stayed constant.

    Args:
        label: Prefix identifying the run in the printed line.
        result: Agent response; its ``routing_trace`` entries are read for
            their ``"model"`` values.
    """
    models = [entry["model"] for entry in result.get("routing_trace", [])]
    if not models:
        print(f" ? {label}: no routing data")
        return

    distinct = set(models)
    if len(distinct) != 1:
        # Count transitions between consecutive turns, not distinct models.
        switched = 0
        for before, after in zip(models, models[1:]):
            if before != after:
                switched += 1
        names = ", ".join(sorted(map(_short, distinct)))
        print(f" ✗ {label}: model switched {switched} time(s) — {names}")
        return

    (only_model,) = distinct
    print(f" ✓ {label}: {_short(only_model)} for all {len(models)} turns")
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Demo
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
async def main() -> None:
    """Run the demo: query the agent with and without a session ID and compare.

    Prints a banner, waits for the agent's health endpoint, sends the same
    query twice concurrently (once with an ``X-Routing-Session-Id`` header,
    once without), then prints each run's routing trace and summary followed
    by the answer from the run that carried the session ID.
    """
    rule = " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

    # An empty string prints as a bare newline, same as print().
    intro = (
        "",
        " ╔══════════════════════════════════════════════════════════════╗",
        " ║ Session Pinning Demo — Research Agent ║",
        " ╚══════════════════════════════════════════════════════════════╝",
        "",
        f" Agent : {AGENT_URL}",
        f' Query : "{QUERY[:72]}…"',
        "",
        " The agent uses a tool-calling loop (get_db_benchmarks,",
        " get_case_studies, check_feature_support) to research the",
        " question. Each LLM turn hits Plano's preference-based router.",
        "",
    )
    for text in intro:
        print(text)

    print(f" Waiting for agent at {AGENT_URL}…", end=" ", flush=True)
    agent_ready = await wait_for_agent()
    if not agent_ready:
        print("FAILED — agent did not respond within 30 s")
        return
    print("ready.")
    print()

    sid = str(uuid.uuid4())
    print(" Sending queries (running concurrently)…")
    print()
    without, with_pin = await asyncio.gather(
        ask_agent(QUERY, session_id=None),
        ask_agent(QUERY, session_id=sid),
    )

    # (heading, summary label, agent response) for each run, in print order.
    runs = (
        (" Run 1: WITHOUT Session Pinning", "Without pinning", without),
        (
            f" Run 2: WITH Session Pinning (X-Routing-Session-Id: {sid[:8]}…)",
            "With pinning ",
            with_pin,
        ),
    )
    for heading, label, outcome in runs:
        print(rule)
        print(heading)
        print(rule)
        print()
        print(" LLM turns inside the agent loop:")
        print()
        _print_trace(outcome)
        print()
        _print_summary(label, outcome)
        print()

    # Final answer, taken from the run that sent the session ID.
    answer = with_pin["choices"][0]["message"]["content"]
    print(" ══ Agent recommendation (pinned session) ═════════════════════")
    print()
    for answer_line in answer.splitlines():
        print(f" {answer_line}")
    print()
    print(" ══════════════════════════════════════════════════════════════")
    print()
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: drive the async demo to completion when run directly.
if __name__ == "__main__":
    asyncio.run(main())
|
|
||||||
|
|
@ -1,19 +0,0 @@
|
||||||
#!/bin/bash
# Launch the agent in the background, run the demo client against it, and
# stop the agent again when the script exits for any reason.
set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
export PLANO_URL="${PLANO_URL:-http://localhost:12000}"
export AGENT_PORT="${AGENT_PORT:-8000}"
export AGENT_URL="http://localhost:$AGENT_PORT"

# Stop the background agent, if one was started.
cleanup() {
  # Best-effort: the agent may already have exited (or never started), and a
  # failing `kill` must not replace the script's exit status under `set -e`.
  if [ -n "${AGENT_PID:-}" ]; then
    kill "$AGENT_PID" 2>/dev/null || true
  fi
}
trap cleanup EXIT INT TERM

# Start the agent in the background
"$SCRIPT_DIR/start_agents.sh" &
AGENT_PID=$!

# Run the demo client
uv run "$SCRIPT_DIR/demo.py"
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue