mirror of
https://github.com/katanemo/plano.git
synced 2026-04-25 00:36:34 +02:00
rename x-session-id to x-routing-session-id and fix routing config field name
This commit is contained in:
parent
f699cfb059
commit
5789694d2f
11 changed files with 41 additions and 34 deletions
|
|
@ -437,6 +437,7 @@ properties:
|
|||
session_max_entries:
|
||||
type: integer
|
||||
minimum: 1
|
||||
maximum: 10000
|
||||
description: Maximum number of session-pinned routing cache entries. Default 10000.
|
||||
additionalProperties: false
|
||||
state_storage:
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
use bytes::Bytes;
|
||||
use common::configuration::{FilterPipeline, ModelAlias};
|
||||
use common::consts::{ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER, SESSION_ID_HEADER};
|
||||
use common::consts::{
|
||||
ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER, ROUTING_SESSION_ID_HEADER,
|
||||
};
|
||||
use common::llm_providers::LlmProviders;
|
||||
use hermesllm::apis::openai::Message;
|
||||
use hermesllm::apis::openai_responses::InputParam;
|
||||
|
|
@ -96,7 +98,7 @@ async fn llm_chat_inner(
|
|||
|
||||
// Session pinning: extract session ID and check cache before routing
|
||||
let session_id: Option<String> = request_headers
|
||||
.get(SESSION_ID_HEADER)
|
||||
.get(ROUTING_SESSION_ID_HEADER)
|
||||
.and_then(|h| h.to_str().ok())
|
||||
.map(|s| s.to_string());
|
||||
let pinned_model: Option<String> = if let Some(ref sid) = session_id {
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
use bytes::Bytes;
|
||||
use common::configuration::{SpanAttributes, TopLevelRoutingPreference};
|
||||
use common::consts::{REQUEST_ID_HEADER, SESSION_ID_HEADER};
|
||||
use common::consts::{REQUEST_ID_HEADER, ROUTING_SESSION_ID_HEADER};
|
||||
use common::errors::BrightStaffError;
|
||||
use hermesllm::clients::SupportedAPIsFromClient;
|
||||
use hermesllm::ProviderRequestType;
|
||||
|
|
@ -72,7 +72,7 @@ pub async fn routing_decision(
|
|||
.unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
|
||||
|
||||
let session_id: Option<String> = request_headers
|
||||
.get(SESSION_ID_HEADER)
|
||||
.get(ROUTING_SESSION_ID_HEADER)
|
||||
.and_then(|h| h.to_str().ok())
|
||||
.map(|s| s.to_string());
|
||||
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ use crate::router::router_model_v1;
|
|||
|
||||
const DEFAULT_SESSION_TTL_SECONDS: u64 = 600;
|
||||
const DEFAULT_SESSION_MAX_ENTRIES: usize = 10_000;
|
||||
const MAX_SESSION_MAX_ENTRIES: usize = 10_000;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct CachedRoute {
|
||||
|
|
@ -92,7 +93,9 @@ impl RouterService {
|
|||
|
||||
let session_ttl =
|
||||
Duration::from_secs(session_ttl_seconds.unwrap_or(DEFAULT_SESSION_TTL_SECONDS));
|
||||
let session_max_entries = session_max_entries.unwrap_or(DEFAULT_SESSION_MAX_ENTRIES);
|
||||
let session_max_entries = session_max_entries
|
||||
.unwrap_or(DEFAULT_SESSION_MAX_ENTRIES)
|
||||
.min(MAX_SESSION_MAX_ENTRIES);
|
||||
|
||||
RouterService {
|
||||
router_url,
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ use crate::api::open_ai::{
|
|||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Routing {
|
||||
pub model_provider: Option<String>,
|
||||
pub llm_provider: Option<String>,
|
||||
pub model: Option<String>,
|
||||
pub session_ttl_seconds: Option<u64>,
|
||||
pub session_max_entries: Option<usize>,
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ pub const X_ARCH_TOOL_CALL: &str = "x-arch-tool-call-message";
|
|||
pub const X_ARCH_FC_MODEL_RESPONSE: &str = "x-arch-fc-model-response";
|
||||
pub const ARCH_FC_MODEL_NAME: &str = "Arch-Function";
|
||||
pub const REQUEST_ID_HEADER: &str = "x-request-id";
|
||||
pub const SESSION_ID_HEADER: &str = "x-session-id";
|
||||
pub const ROUTING_SESSION_ID_HEADER: &str = "x-routing-session-id";
|
||||
pub const ENVOY_ORIGINAL_PATH_HEADER: &str = "x-envoy-original-path";
|
||||
pub const TRACE_PARENT_HEADER: &str = "traceparent";
|
||||
pub const ARCH_INTERNAL_CLUSTER_NAME: &str = "arch_internal";
|
||||
|
|
|
|||
|
|
@ -108,13 +108,13 @@ The response contains the model list — your client should try `models[0]` firs
|
|||
|
||||
## Session Pinning
|
||||
|
||||
Send an `X-Session-Id` header to pin the routing decision for a session. Once a model is selected, all subsequent requests with the same session ID return the same model without re-running routing.
|
||||
Send an `X-Routing-Session-Id` header to pin the routing decision for a session. Once a model is selected, all subsequent requests with the same session ID return the same model without re-running routing.
|
||||
|
||||
```bash
|
||||
# First call — runs routing, caches result
|
||||
curl http://localhost:12000/routing/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Session-Id: my-session-123" \
|
||||
-H "X-Routing-Session-Id: my-session-123" \
|
||||
-d '{
|
||||
"model": "gpt-4o-mini",
|
||||
"messages": [{"role": "user", "content": "Write a Python function for binary search"}]
|
||||
|
|
@ -136,7 +136,7 @@ Response (first call):
|
|||
# Second call — same session, returns cached result
|
||||
curl http://localhost:12000/routing/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Session-Id: my-session-123" \
|
||||
-H "X-Routing-Session-Id: my-session-123" \
|
||||
-d '{
|
||||
"model": "gpt-4o-mini",
|
||||
"messages": [{"role": "user", "content": "Now explain merge sort"}]
|
||||
|
|
@ -161,7 +161,7 @@ routing:
|
|||
session_max_entries: 10000 # default: 10000
|
||||
```
|
||||
|
||||
Without the `X-Session-Id` header, routing runs fresh every time (no breaking change).
|
||||
Without the `X-Routing-Session-Id` header, routing runs fresh every time (no breaking change).
|
||||
|
||||
## Kubernetes Deployment (Self-hosted Arch-Router on GPU)
|
||||
|
||||
|
|
|
|||
|
|
@ -114,7 +114,7 @@ echo "--- 7. Session pinning - first call (fresh routing decision) ---"
|
|||
echo ""
|
||||
curl -s "$PLANO_URL/routing/v1/chat/completions" \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Session-Id: demo-session-001" \
|
||||
-H "X-Routing-Session-Id: demo-session-001" \
|
||||
-d '{
|
||||
"model": "gpt-4o-mini",
|
||||
"messages": [
|
||||
|
|
@ -129,7 +129,7 @@ echo " Notice: same model returned with \"pinned\": true, routing was skipped
|
|||
echo ""
|
||||
curl -s "$PLANO_URL/routing/v1/chat/completions" \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Session-Id: demo-session-001" \
|
||||
-H "X-Routing-Session-Id: demo-session-001" \
|
||||
-d '{
|
||||
"model": "gpt-4o-mini",
|
||||
"messages": [
|
||||
|
|
@ -143,7 +143,7 @@ echo "--- 9. Different session gets its own fresh routing ---"
|
|||
echo ""
|
||||
curl -s "$PLANO_URL/routing/v1/chat/completions" \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Session-Id: demo-session-002" \
|
||||
-H "X-Routing-Session-Id: demo-session-002" \
|
||||
-d '{
|
||||
"model": "gpt-4o-mini",
|
||||
"messages": [
|
||||
|
|
|
|||
|
|
@ -1,15 +1,15 @@
|
|||
# Session Pinning Demo
|
||||
|
||||
> Consistent model selection for agentic loops using `X-Session-Id`.
|
||||
> Consistent model selection for agentic loops using `X-Routing-Session-Id`.
|
||||
|
||||
## Why Session Pinning?
|
||||
|
||||
When an agent runs in a loop — research → analyse → implement → evaluate → summarise — each step hits Plano's router independently. Because prompts vary in intent, the router may select **different models** for each step, fragmenting context mid-session.
|
||||
|
||||
**Session pinning** solves this: send an `X-Session-Id` header and the first request runs routing as usual, caching the decision. Every subsequent request with the same session ID returns the **same model**, without re-running the router.
|
||||
**Session pinning** solves this: send an `X-Routing-Session-Id` header and the first request runs routing as usual, caching the decision. Every subsequent request with the same session ID returns the **same model**, without re-running the router.
|
||||
|
||||
```
|
||||
Without pinning With pinning (X-Session-Id)
|
||||
Without pinning With pinning (X-Routing-Session-Id)
|
||||
───────────────── ──────────────────────────
|
||||
Step 1 → claude-sonnet (code_gen) Step 1 → claude-sonnet ← routed
|
||||
Step 2 → gpt-4o (reasoning) Step 2 → claude-sonnet ← pinned ✓
|
||||
|
|
@ -56,8 +56,8 @@ accumulated message history. Steps alternate between `code_generation` and
|
|||
The demo runs the loop **twice** against `/v1/chat/completions` using the
|
||||
[OpenAI SDK](https://github.com/openai/openai-python):
|
||||
|
||||
1. **Without pinning** — no `X-Session-Id`; models alternate per step
|
||||
2. **With pinning** — `X-Session-Id` header included; model is pinned from step 1
|
||||
1. **Without pinning** — no `X-Routing-Session-Id`; models alternate per step
|
||||
2. **With pinning** — `X-Routing-Session-Id` header included; model is pinned from step 1
|
||||
|
||||
Each step makes real LLM calls. Step 5's report explicitly references findings
|
||||
from earlier steps, demonstrating why coherent context requires a consistent model.
|
||||
|
|
@ -85,7 +85,7 @@ from earlier steps, demonstrating why coherent context requires a consistent mod
|
|||
✗ Without pinning: model switched 4 time(s) — gpt-4o, claude-sonnet-4-20250514
|
||||
|
||||
|
||||
Run 2: WITH Session Pinning (X-Session-Id: a1b2c3d4…)
|
||||
Run 2: WITH Session Pinning (X-Routing-Session-Id: a1b2c3d4…)
|
||||
─────────────────────────────────────────────────────────────────────
|
||||
step 1 [claude-sonnet-4-20250514] List requirements
|
||||
"Critical requirements: 1. ACID transactions for order integrity…"
|
||||
|
|
@ -112,12 +112,12 @@ from earlier steps, demonstrating why coherent context requires a consistent mod
|
|||
|
||||
### How It Works
|
||||
|
||||
Session pinning is implemented in brightstaff. When `X-Session-Id` is present:
|
||||
Session pinning is implemented in brightstaff. When `X-Routing-Session-Id` is present:
|
||||
|
||||
1. **First request** — routing runs normally, result is cached keyed by session ID
|
||||
2. **Subsequent requests** — cache hit skips routing and returns the cached model instantly
|
||||
|
||||
The `X-Session-Id` header is forwarded transparently; no changes to your OpenAI
|
||||
The `X-Routing-Session-Id` header is forwarded transparently; no changes to your OpenAI
|
||||
SDK calls beyond adding the header.
|
||||
|
||||
```python
|
||||
|
|
@ -130,7 +130,7 @@ session_id = str(uuid.uuid4())
|
|||
response = client.chat.completions.create(
|
||||
model="gpt-4o-mini",
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
extra_headers={"X-Session-Id": session_id}, # pin the session
|
||||
extra_headers={"X-Routing-Session-Id": session_id}, # pin the session
|
||||
)
|
||||
```
|
||||
|
||||
|
|
@ -146,7 +146,7 @@ routing:
|
|||
session_max_entries: 10000 # Max cached sessions before LRU eviction
|
||||
```
|
||||
|
||||
Without the `X-Session-Id` header, routing runs fresh every time — no breaking
|
||||
Without the `X-Routing-Session-Id` header, routing runs fresh every time — no breaking
|
||||
change to existing clients.
|
||||
|
||||
---
|
||||
|
|
|
|||
|
|
@ -11,9 +11,10 @@ each with its own tool-calling loop. The tasks deliberately alternate between
|
|||
code_generation and complex_reasoning intents so Plano's preference-based
|
||||
router selects different models for each task.
|
||||
|
||||
If the client sends X-Session-Id, the agent forwards it on every outbound
|
||||
call to Plano. The first task pins the model; all subsequent tasks skip the
|
||||
router and reuse it — keeping the whole session on one consistent model.
|
||||
If the client sends X-Routing-Session-Id, the agent forwards it on every
|
||||
outbound call to Plano. The first task pins the model; all subsequent tasks
|
||||
skip the router and reuse it — keeping the whole session on one consistent
|
||||
model.
|
||||
|
||||
Run standalone:
|
||||
uv run agent.py
|
||||
|
|
@ -309,12 +310,12 @@ async def run_task(
|
|||
|
||||
Each task is an independent conversation so the router sees only
|
||||
this task's intent — not the accumulated context of previous tasks.
|
||||
Session pinning via X-Session-Id pins the model from the first task
|
||||
onward, so all tasks stay on the same model.
|
||||
Session pinning via X-Routing-Session-Id pins the model from the first
|
||||
task onward, so all tasks stay on the same model.
|
||||
|
||||
Returns (answer, first_model_used).
|
||||
"""
|
||||
headers = {"X-Session-Id": session_id} if session_id else {}
|
||||
headers = {"X-Routing-Session-Id": session_id} if session_id else {}
|
||||
messages: list[ChatCompletionMessageParam] = [
|
||||
{"role": "system", "content": SYSTEM_PROMPT},
|
||||
{"role": "user", "content": prompt},
|
||||
|
|
@ -391,7 +392,7 @@ app = FastAPI(title="Research Agent", version="1.0.0")
|
|||
@app.post("/v1/chat/completions")
|
||||
async def chat(request: Request) -> JSONResponse:
|
||||
body = await request.json()
|
||||
session_id: str | None = request.headers.get("x-session-id")
|
||||
session_id: str | None = request.headers.get("x-routing-session-id")
|
||||
|
||||
log.info("request session_id=%s", session_id or "none")
|
||||
|
||||
|
|
|
|||
|
|
@ -52,7 +52,7 @@ async def wait_for_agent(timeout: int = 30) -> bool:
|
|||
async def ask_agent(query: str, session_id: str | None = None) -> dict:
|
||||
headers: dict[str, str] = {}
|
||||
if session_id:
|
||||
headers["X-Session-Id"] = session_id
|
||||
headers["X-Routing-Session-Id"] = session_id
|
||||
|
||||
async with httpx.AsyncClient(timeout=120.0) as client:
|
||||
r = await client.post(
|
||||
|
|
@ -113,7 +113,7 @@ async def main() -> None:
|
|||
print(" ╚══════════════════════════════════════════════════════════════╝")
|
||||
print()
|
||||
print(f" Agent : {AGENT_URL}")
|
||||
print(f" Query : \"{QUERY[:72]}…\"")
|
||||
print(f' Query : "{QUERY[:72]}…"')
|
||||
print()
|
||||
print(" The agent uses a tool-calling loop (get_db_benchmarks,")
|
||||
print(" get_case_studies, check_feature_support) to research the")
|
||||
|
|
@ -149,7 +149,7 @@ async def main() -> None:
|
|||
|
||||
# ── Run 2 ────────────────────────────────────────────────────────────
|
||||
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
|
||||
print(f" Run 2: WITH Session Pinning (X-Session-Id: {sid[:8]}…)")
|
||||
print(f" Run 2: WITH Session Pinning (X-Routing-Session-Id: {sid[:8]}…)")
|
||||
print(" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
|
||||
print()
|
||||
print(" LLM turns inside the agent loop:")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue