From 90b926c2ced6b3cecad6f758e8c56f00783589f1 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Wed, 15 Apr 2026 16:41:42 -0700 Subject: [PATCH 01/16] use plano-orchestrator for LLM routing, remove arch-router (#886) --- README.md | 2 +- cli/planoai/config_generator.py | 7 +- config/plano_config_schema.yaml | 5 +- crates/brightstaff/src/app_state.rs | 2 - .../src/handlers/agents/selector.rs | 1 + .../src/handlers/integration_tests.rs | 5 +- crates/brightstaff/src/handlers/llm/mod.rs | 13 +- .../src/handlers/llm/model_selection.rs | 9 +- .../src/handlers/routing_service.rs | 19 +- crates/brightstaff/src/main.rs | 47 +- crates/brightstaff/src/router/llm.rs | 371 -------- crates/brightstaff/src/router/mod.rs | 3 - crates/brightstaff/src/router/orchestrator.rs | 308 ++++++- .../src/router/orchestrator_model.rs | 3 +- .../src/router/orchestrator_model_v1.rs | 2 +- crates/brightstaff/src/router/router_model.rs | 39 - .../brightstaff/src/router/router_model_v1.rs | 842 ------------------ crates/common/src/configuration.rs | 11 +- .../pretty_model_resolution.sh | 4 +- .../codex_router/pretty_model_resolution.sh | 4 +- .../model_routing_service/README.md | 24 +- .../model_routing_service/config_k8s.yaml | 6 +- .../vllm-deployment.yaml | 14 +- .../llm_routing/openclaw_routing/config.yaml | 2 +- .../preference_based_routing/README.md | 4 +- .../test_router_endpoint.rest | 6 +- docs/source/guides/llm_router.rst | 56 +- .../includes/plano_config_full_reference.yaml | 4 +- .../plano_config_full_reference_rendered.yaml | 6 +- 29 files changed, 407 insertions(+), 1412 deletions(-) delete mode 100644 crates/brightstaff/src/router/llm.rs delete mode 100644 crates/brightstaff/src/router/router_model.rs delete mode 100644 crates/brightstaff/src/router/router_model_v1.rs diff --git a/README.md b/README.md index db398507..b7ff7efc 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Plano pulls rote plumbing out of your framework so you can stay focused on what **Jump to our 
[docs](https://docs.planoai.dev)** to learn how you can use Plano to improve the speed, safety and obervability of your agentic applications. > [!IMPORTANT] -> Plano and the Arch family of LLMs (like Plano-Orchestrator-4B, Arch-Router, etc) are hosted free of charge in the US-central region to give you a great first-run developer experience of Plano. To scale and run in production, you can either run these LLMs locally or contact us on [Discord](https://discord.gg/pGZf2gcwEc) for API keys. +> Plano and the Plano family of LLMs (like Plano-Orchestrator) are hosted free of charge in the US-central region to give you a great first-run developer experience of Plano. To scale and run in production, you can either run these LLMs locally or contact us on [Discord](https://discord.gg/pGZf2gcwEc) for API keys. --- diff --git a/cli/planoai/config_generator.py b/cli/planoai/config_generator.py index 3ffebe09..5a3d4f63 100644 --- a/cli/planoai/config_generator.py +++ b/cli/planoai/config_generator.py @@ -372,16 +372,15 @@ def validate_and_render_schema(): # Build lookup of model names (already prefix-stripped by config processing) model_name_set = {mp.get("model") for mp in updated_model_providers} - # Auto-add arch-router provider if routing preferences exist and no provider matches the router model - router_model = overrides_config.get("llm_routing_model", "Arch-Router") - # Strip provider prefix for comparison since config processing strips prefixes from model names + # Auto-add plano-orchestrator provider if routing preferences exist and no provider matches the routing model + router_model = overrides_config.get("llm_routing_model", "Plano-Orchestrator") router_model_id = ( router_model.split("/", 1)[1] if "/" in router_model else router_model ) if len(model_usage_name_keys) > 0 and router_model_id not in model_name_set: updated_model_providers.append( { - "name": "arch-router", + "name": "plano-orchestrator", "provider_interface": "plano", "model": router_model_id, 
"internal": True, diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml index 95a2e5cc..d3d6a643 100644 --- a/config/plano_config_schema.yaml +++ b/config/plano_config_schema.yaml @@ -284,10 +284,13 @@ properties: description: "Path to the trusted CA bundle for upstream TLS verification. Default is '/etc/ssl/certs/ca-certificates.crt'." llm_routing_model: type: string - description: "Model name for the LLM router (e.g., 'Arch-Router'). Must match a model in model_providers." + description: "Model name for the LLM router (e.g., 'Plano-Orchestrator'). Must match a model in model_providers." agent_orchestration_model: type: string description: "Model name for the agent orchestrator (e.g., 'Plano-Orchestrator'). Must match a model in model_providers." + orchestrator_model_context_length: + type: integer + description: "Maximum token length for the orchestrator/routing model context window. Default is 8192." system_prompt: type: string prompt_targets: diff --git a/crates/brightstaff/src/app_state.rs b/crates/brightstaff/src/app_state.rs index 57707f6e..e585d2db 100644 --- a/crates/brightstaff/src/app_state.rs +++ b/crates/brightstaff/src/app_state.rs @@ -5,7 +5,6 @@ use common::configuration::{Agent, FilterPipeline, Listener, ModelAlias, SpanAtt use common::llm_providers::LlmProviders; use tokio::sync::RwLock; -use crate::router::llm::RouterService; use crate::router::orchestrator::OrchestratorService; use crate::state::StateStorage; @@ -14,7 +13,6 @@ use crate::state::StateStorage; /// Instead of cloning 8+ individual `Arc`s per connection, a single /// `Arc` is cloned once and passed to the request handler. 
pub struct AppState { - pub router_service: Arc, pub orchestrator_service: Arc, pub model_aliases: Option>, pub llm_providers: Arc>, diff --git a/crates/brightstaff/src/handlers/agents/selector.rs b/crates/brightstaff/src/handlers/agents/selector.rs index 8225a003..e0467163 100644 --- a/crates/brightstaff/src/handlers/agents/selector.rs +++ b/crates/brightstaff/src/handlers/agents/selector.rs @@ -177,6 +177,7 @@ mod tests { "http://localhost:8080".to_string(), "test-model".to_string(), "plano-orchestrator".to_string(), + crate::router::orchestrator_model_v1::MAX_TOKEN_LEN, )) } diff --git a/crates/brightstaff/src/handlers/integration_tests.rs b/crates/brightstaff/src/handlers/integration_tests.rs index 499fbfca..c826dc50 100644 --- a/crates/brightstaff/src/handlers/integration_tests.rs +++ b/crates/brightstaff/src/handlers/integration_tests.rs @@ -23,6 +23,7 @@ mod tests { "http://localhost:8080".to_string(), "test-model".to_string(), "plano-orchestrator".to_string(), + crate::router::orchestrator_model_v1::MAX_TOKEN_LEN, )) } @@ -147,8 +148,8 @@ mod tests { #[tokio::test] async fn test_error_handling_flow() { - let router_service = create_test_orchestrator_service(); - let agent_selector = AgentSelector::new(router_service); + let orchestrator_service = create_test_orchestrator_service(); + let agent_selector = AgentSelector::new(orchestrator_service); // Test listener not found let result = agent_selector.find_listener(Some("nonexistent"), &[]); diff --git a/crates/brightstaff/src/handlers/llm/mod.rs b/crates/brightstaff/src/handlers/llm/mod.rs index 5e108c56..8f00e4b6 100644 --- a/crates/brightstaff/src/handlers/llm/mod.rs +++ b/crates/brightstaff/src/handlers/llm/mod.rs @@ -22,7 +22,6 @@ pub(crate) mod model_selection; use crate::app_state::AppState; use crate::handlers::agents::pipeline::PipelineProcessor; -use crate::handlers::extract_or_generate_traceparent; use crate::handlers::extract_request_id; use crate::handlers::full; use 
crate::state::response_state_processor::ResponsesStateProcessor; @@ -92,22 +91,20 @@ async fn llm_chat_inner( } }); - let traceparent = extract_or_generate_traceparent(&request_headers); - // Session pinning: extract session ID and check cache before routing let session_id: Option = request_headers .get(MODEL_AFFINITY_HEADER) .and_then(|h| h.to_str().ok()) .map(|s| s.to_string()); let tenant_id: Option = state - .router_service + .orchestrator_service .tenant_header() .and_then(|hdr| request_headers.get(hdr)) .and_then(|v| v.to_str().ok()) .map(|s| s.to_string()); let pinned_model: Option = if let Some(ref sid) = session_id { state - .router_service + .orchestrator_service .get_cached_route(sid, tenant_id.as_deref()) .await .map(|c| c.model_name) @@ -287,9 +284,8 @@ async fn llm_chat_inner( let routing_result = match async { set_service_name(operation_component::ROUTING); router_chat_get_upstream_model( - Arc::clone(&state.router_service), + Arc::clone(&state.orchestrator_service), client_request, - &traceparent, &request_path, &request_id, inline_routing_preferences, @@ -315,10 +311,9 @@ async fn llm_chat_inner( alias_resolved_model.clone() }; - // Cache the routing decision so subsequent requests with the same session ID are pinned if let Some(ref sid) = session_id { state - .router_service + .orchestrator_service .cache_route(sid.clone(), tenant_id.as_deref(), model.clone(), route_name) .await; } diff --git a/crates/brightstaff/src/handlers/llm/model_selection.rs b/crates/brightstaff/src/handlers/llm/model_selection.rs index 1f5aea71..1b4315e7 100644 --- a/crates/brightstaff/src/handlers/llm/model_selection.rs +++ b/crates/brightstaff/src/handlers/llm/model_selection.rs @@ -5,7 +5,7 @@ use hyper::StatusCode; use std::sync::Arc; use tracing::{debug, info, warn}; -use crate::router::llm::RouterService; +use crate::router::orchestrator::OrchestratorService; use crate::streaming::truncate_message; use crate::tracing::routing; @@ -37,9 +37,8 @@ impl RoutingError { 
/// * `Ok(RoutingResult)` - Contains the selected model name and span ID /// * `Err(RoutingError)` - Contains error details and optional span ID pub async fn router_chat_get_upstream_model( - router_service: Arc, + orchestrator_service: Arc, client_request: ProviderRequestType, - traceparent: &str, request_path: &str, request_id: &str, inline_routing_preferences: Option>, @@ -99,11 +98,9 @@ pub async fn router_chat_get_upstream_model( // Capture start time for routing span let routing_start_time = std::time::Instant::now(); - // Attempt to determine route using the router service - let routing_result = router_service + let routing_result = orchestrator_service .determine_route( &chat_request.messages, - traceparent, inline_routing_preferences, request_id, ) diff --git a/crates/brightstaff/src/handlers/routing_service.rs b/crates/brightstaff/src/handlers/routing_service.rs index 3365b6e9..5fc0d3b9 100644 --- a/crates/brightstaff/src/handlers/routing_service.rs +++ b/crates/brightstaff/src/handlers/routing_service.rs @@ -12,7 +12,7 @@ use tracing::{debug, info, info_span, warn, Instrument}; use super::extract_or_generate_traceparent; use crate::handlers::llm::model_selection::router_chat_get_upstream_model; -use crate::router::llm::RouterService; +use crate::router::orchestrator::OrchestratorService; use crate::tracing::{collect_custom_trace_attributes, operation_component, set_service_name}; /// Extracts `routing_preferences` from a JSON body, returning the cleaned body bytes @@ -60,7 +60,7 @@ struct RoutingDecisionResponse { pub async fn routing_decision( request: Request, - router_service: Arc, + orchestrator_service: Arc, request_path: String, span_attributes: &Option, ) -> Result>, hyper::Error> { @@ -76,7 +76,7 @@ pub async fn routing_decision( .and_then(|h| h.to_str().ok()) .map(|s| s.to_string()); - let tenant_id: Option = router_service + let tenant_id: Option = orchestrator_service .tenant_header() .and_then(|hdr| request_headers.get(hdr)) .and_then(|v| 
v.to_str().ok()) @@ -94,7 +94,7 @@ pub async fn routing_decision( routing_decision_inner( request, - router_service, + orchestrator_service, request_id, request_path, request_headers, @@ -109,7 +109,7 @@ pub async fn routing_decision( #[allow(clippy::too_many_arguments)] async fn routing_decision_inner( request: Request, - router_service: Arc, + orchestrator_service: Arc, request_id: String, request_path: String, request_headers: hyper::HeaderMap, @@ -133,9 +133,8 @@ async fn routing_decision_inner( .unwrap_or("unknown") .to_string(); - // Session pinning: check cache before doing any routing work if let Some(ref sid) = session_id { - if let Some(cached) = router_service + if let Some(cached) = orchestrator_service .get_cached_route(sid, tenant_id.as_deref()) .await { @@ -202,9 +201,8 @@ async fn routing_decision_inner( }; let routing_result = router_chat_get_upstream_model( - Arc::clone(&router_service), + Arc::clone(&orchestrator_service), client_request, - &traceparent, &request_path, &request_id, inline_routing_preferences, @@ -213,9 +211,8 @@ async fn routing_decision_inner( match routing_result { Ok(result) => { - // Cache the result if session_id is present if let Some(ref sid) = session_id { - router_service + orchestrator_service .cache_route( sid.clone(), tenant_id.as_deref(), diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs index 73102a97..40ac429d 100644 --- a/crates/brightstaff/src/main.rs +++ b/crates/brightstaff/src/main.rs @@ -5,7 +5,6 @@ use brightstaff::handlers::function_calling::function_calling_chat_handler; use brightstaff::handlers::llm::llm_chat; use brightstaff::handlers::models::list_models; use brightstaff::handlers::routing_service::routing_decision; -use brightstaff::router::llm::RouterService; use brightstaff::router::model_metrics::ModelMetricsService; use brightstaff::router::orchestrator::OrchestratorService; use brightstaff::session_cache::init_session_cache; @@ -37,8 +36,6 @@ use tokio::sync::RwLock; 
use tracing::{debug, info, warn}; const BIND_ADDRESS: &str = "0.0.0.0:9091"; -const DEFAULT_ROUTING_LLM_PROVIDER: &str = "arch-router"; -const DEFAULT_ROUTING_MODEL_NAME: &str = "Arch-Router"; const DEFAULT_ORCHESTRATOR_LLM_PROVIDER: &str = "plano-orchestrator"; const DEFAULT_ORCHESTRATOR_MODEL_NAME: &str = "Plano-Orchestrator"; @@ -161,20 +158,6 @@ async fn init_app_state( let overrides = config.overrides.clone().unwrap_or_default(); - let routing_model_name: String = overrides - .llm_routing_model - .as_deref() - .map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m)) - .unwrap_or(DEFAULT_ROUTING_MODEL_NAME) - .to_string(); - - let routing_llm_provider = config - .model_providers - .iter() - .find(|p| p.model.as_deref() == Some(routing_model_name.as_str())) - .map(|p| p.name.clone()) - .unwrap_or_else(|| DEFAULT_ROUTING_LLM_PROVIDER.to_string()); - let session_ttl_seconds = config.routing.as_ref().and_then(|r| r.session_ttl_seconds); let session_cache = init_session_cache(config).await?; @@ -304,20 +287,11 @@ async fn init_app_state( .and_then(|r| r.session_cache.as_ref()) .and_then(|c| c.tenant_header.clone()); - let router_service = Arc::new(RouterService::new( - config.routing_preferences.clone(), - metrics_service, - format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"), - routing_model_name, - routing_llm_provider, - session_ttl_seconds, - session_cache, - session_tenant_header, - )); - + // Resolve model name: prefer llm_routing_model override, then agent_orchestration_model, then default. 
let orchestrator_model_name: String = overrides - .agent_orchestration_model + .llm_routing_model .as_deref() + .or(overrides.agent_orchestration_model.as_deref()) .map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m)) .unwrap_or(DEFAULT_ORCHESTRATOR_MODEL_NAME) .to_string(); @@ -329,10 +303,20 @@ async fn init_app_state( .map(|p| p.name.clone()) .unwrap_or_else(|| DEFAULT_ORCHESTRATOR_LLM_PROVIDER.to_string()); - let orchestrator_service = Arc::new(OrchestratorService::new( + let orchestrator_max_tokens = overrides + .orchestrator_model_context_length + .unwrap_or(brightstaff::router::orchestrator_model_v1::MAX_TOKEN_LEN); + + let orchestrator_service = Arc::new(OrchestratorService::with_routing( format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"), orchestrator_model_name, orchestrator_llm_provider, + config.routing_preferences.clone(), + metrics_service, + session_ttl_seconds, + session_cache, + session_tenant_header, + orchestrator_max_tokens, )); let state_storage = init_state_storage(config).await?; @@ -343,7 +327,6 @@ async fn init_app_state( .and_then(|tracing| tracing.span_attributes.clone()); Ok(AppState { - router_service, orchestrator_service, model_aliases: config.model_aliases.clone(), llm_providers: Arc::new(RwLock::new(llm_providers)), @@ -430,7 +413,7 @@ async fn route( ) { return routing_decision( req, - Arc::clone(&state.router_service), + Arc::clone(&state.orchestrator_service), stripped, &state.span_attributes, ) diff --git a/crates/brightstaff/src/router/llm.rs b/crates/brightstaff/src/router/llm.rs deleted file mode 100644 index b1a74641..00000000 --- a/crates/brightstaff/src/router/llm.rs +++ /dev/null @@ -1,371 +0,0 @@ -use std::{borrow::Cow, collections::HashMap, sync::Arc, time::Duration}; - -use common::{ - configuration::TopLevelRoutingPreference, - consts::{ARCH_PROVIDER_HINT_HEADER, REQUEST_ID_HEADER, TRACE_PARENT_HEADER}, -}; - -use super::router_model::{ModelUsagePreference, RoutingPreference}; -use 
hermesllm::apis::openai::Message; -use hyper::header; -use thiserror::Error; -use tracing::{debug, info}; - -use super::http::{self, post_and_extract_content}; -use super::model_metrics::ModelMetricsService; -use super::router_model::RouterModel; - -use crate::router::router_model_v1; -use crate::session_cache::SessionCache; - -pub use crate::session_cache::CachedRoute; - -const DEFAULT_SESSION_TTL_SECONDS: u64 = 600; - -pub struct RouterService { - router_url: String, - client: reqwest::Client, - router_model: Arc, - routing_provider_name: String, - top_level_preferences: HashMap, - metrics_service: Option>, - session_cache: Arc, - session_ttl: Duration, - tenant_header: Option, -} - -#[derive(Debug, Error)] -pub enum RoutingError { - #[error(transparent)] - Http(#[from] http::HttpError), - - #[error("Router model error: {0}")] - RouterModelError(#[from] super::router_model::RoutingModelError), -} - -pub type Result = std::result::Result; - -impl RouterService { - #[allow(clippy::too_many_arguments)] - pub fn new( - top_level_prefs: Option>, - metrics_service: Option>, - router_url: String, - routing_model_name: String, - routing_provider_name: String, - session_ttl_seconds: Option, - session_cache: Arc, - tenant_header: Option, - ) -> Self { - let top_level_preferences: HashMap = top_level_prefs - .map_or_else(HashMap::new, |prefs| { - prefs.into_iter().map(|p| (p.name.clone(), p)).collect() - }); - - // Build sentinel routes for RouterModelV1: route_name → first model. - // RouterModelV1 uses this to build its prompt; RouterService overrides - // the model selection via rank_models() after the route is determined. 
- let sentinel_routes: HashMap> = top_level_preferences - .iter() - .filter_map(|(name, pref)| { - pref.models.first().map(|first_model| { - ( - first_model.clone(), - vec![RoutingPreference { - name: name.clone(), - description: pref.description.clone(), - }], - ) - }) - }) - .collect(); - - let router_model = Arc::new(router_model_v1::RouterModelV1::new( - sentinel_routes, - routing_model_name, - router_model_v1::MAX_TOKEN_LEN, - )); - - let session_ttl = - Duration::from_secs(session_ttl_seconds.unwrap_or(DEFAULT_SESSION_TTL_SECONDS)); - - RouterService { - router_url, - client: reqwest::Client::new(), - router_model, - routing_provider_name, - top_level_preferences, - metrics_service, - session_cache, - session_ttl, - tenant_header, - } - } - - /// Name of the HTTP header used to scope cache keys by tenant, if configured. - #[must_use] - pub fn tenant_header(&self) -> Option<&str> { - self.tenant_header.as_deref() - } - - /// Build the cache key, optionally scoped by tenant: `{tenant_id}:{session_id}` or `{session_id}`. - /// Returns a borrowed key when no tenant prefix is needed, avoiding an allocation. - fn session_key<'a>(tenant_id: Option<&str>, session_id: &'a str) -> Cow<'a, str> { - match tenant_id { - Some(t) => Cow::Owned(format!("{t}:{session_id}")), - None => Cow::Borrowed(session_id), - } - } - - /// Look up a cached routing decision by session ID. - /// Returns None if not found or expired. - pub async fn get_cached_route( - &self, - session_id: &str, - tenant_id: Option<&str>, - ) -> Option { - self.session_cache - .get(&Self::session_key(tenant_id, session_id)) - .await - } - - /// Store a routing decision in the session cache. 
- pub async fn cache_route( - &self, - session_id: String, - tenant_id: Option<&str>, - model_name: String, - route_name: Option, - ) { - self.session_cache - .put( - &Self::session_key(tenant_id, &session_id), - CachedRoute { - model_name, - route_name, - }, - self.session_ttl, - ) - .await; - } - - pub async fn determine_route( - &self, - messages: &[Message], - traceparent: &str, - inline_routing_preferences: Option>, - request_id: &str, - ) -> Result)>> { - if messages.is_empty() { - return Ok(None); - } - - // Build inline top-level map from request if present (inline overrides config). - let inline_top_map: Option> = - inline_routing_preferences - .map(|prefs| prefs.into_iter().map(|p| (p.name.clone(), p)).collect()); - - // No routing defined — skip the router call entirely. - if inline_top_map.is_none() && self.top_level_preferences.is_empty() { - return Ok(None); - } - - // For inline overrides, build synthetic ModelUsagePreference list so RouterModelV1 - // generates the correct prompt (route name + description pairs). - // For config-level prefs the sentinel routes are already baked into RouterModelV1. 
- let effective_usage_preferences: Option> = - inline_top_map.as_ref().map(|inline_map| { - inline_map - .values() - .map(|p| ModelUsagePreference { - model: p.models.first().cloned().unwrap_or_default(), - routing_preferences: vec![RoutingPreference { - name: p.name.clone(), - description: p.description.clone(), - }], - }) - .collect() - }); - - let router_request = self - .router_model - .generate_request(messages, &effective_usage_preferences); - - debug!( - model = %self.router_model.get_model_name(), - endpoint = %self.router_url, - "sending request to arch-router" - ); - - let body = serde_json::to_string(&router_request) - .map_err(super::router_model::RoutingModelError::from)?; - debug!(body = %body, "arch router request"); - - let mut headers = header::HeaderMap::new(); - headers.insert( - header::CONTENT_TYPE, - header::HeaderValue::from_static("application/json"), - ); - if let Ok(val) = header::HeaderValue::from_str(&self.routing_provider_name) { - headers.insert( - header::HeaderName::from_static(ARCH_PROVIDER_HINT_HEADER), - val, - ); - } - if let Ok(val) = header::HeaderValue::from_str(traceparent) { - headers.insert(header::HeaderName::from_static(TRACE_PARENT_HEADER), val); - } - if let Ok(val) = header::HeaderValue::from_str(request_id) { - headers.insert(header::HeaderName::from_static(REQUEST_ID_HEADER), val); - } - headers.insert( - header::HeaderName::from_static("model"), - header::HeaderValue::from_static("arch-router"), - ); - - let Some((content, elapsed)) = - post_and_extract_content(&self.client, &self.router_url, headers, body).await? - else { - return Ok(None); - }; - - // Parse the route name from the router response. 
- let parsed = self - .router_model - .parse_response(&content, &effective_usage_preferences)?; - - let result = if let Some((route_name, _sentinel)) = parsed { - let top_pref = inline_top_map - .as_ref() - .and_then(|m| m.get(&route_name)) - .or_else(|| self.top_level_preferences.get(&route_name)); - - if let Some(pref) = top_pref { - let ranked = match &self.metrics_service { - Some(svc) => svc.rank_models(&pref.models, &pref.selection_policy).await, - None => pref.models.clone(), - }; - Some((route_name, ranked)) - } else { - None - } - } else { - None - }; - - info!( - content = %content.replace("\n", "\\n"), - selected_model = ?result, - response_time_ms = elapsed.as_millis(), - "arch-router determined route" - ); - - Ok(result) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::session_cache::memory::MemorySessionCache; - - fn make_router_service(ttl_seconds: u64, max_entries: usize) -> RouterService { - let session_cache = Arc::new(MemorySessionCache::new(max_entries)); - RouterService::new( - None, - None, - "http://localhost:12001/v1/chat/completions".to_string(), - "Arch-Router".to_string(), - "arch-router".to_string(), - Some(ttl_seconds), - session_cache, - None, - ) - } - - #[tokio::test] - async fn test_cache_miss_returns_none() { - let svc = make_router_service(600, 100); - assert!(svc - .get_cached_route("unknown-session", None) - .await - .is_none()); - } - - #[tokio::test] - async fn test_cache_hit_returns_cached_route() { - let svc = make_router_service(600, 100); - svc.cache_route( - "s1".to_string(), - None, - "gpt-4o".to_string(), - Some("code".to_string()), - ) - .await; - - let cached = svc.get_cached_route("s1", None).await.unwrap(); - assert_eq!(cached.model_name, "gpt-4o"); - assert_eq!(cached.route_name, Some("code".to_string())); - } - - #[tokio::test] - async fn test_cache_expired_entry_returns_none() { - let svc = make_router_service(0, 100); - svc.cache_route("s1".to_string(), None, "gpt-4o".to_string(), None) - .await; 
- assert!(svc.get_cached_route("s1", None).await.is_none()); - } - - #[tokio::test] - async fn test_expired_entries_not_returned() { - let svc = make_router_service(0, 100); - svc.cache_route("s1".to_string(), None, "gpt-4o".to_string(), None) - .await; - svc.cache_route("s2".to_string(), None, "claude".to_string(), None) - .await; - - // Entries with TTL=0 should be expired immediately - assert!(svc.get_cached_route("s1", None).await.is_none()); - assert!(svc.get_cached_route("s2", None).await.is_none()); - } - - #[tokio::test] - async fn test_cache_evicts_oldest_when_full() { - let svc = make_router_service(600, 2); - svc.cache_route("s1".to_string(), None, "model-a".to_string(), None) - .await; - tokio::time::sleep(Duration::from_millis(10)).await; - svc.cache_route("s2".to_string(), None, "model-b".to_string(), None) - .await; - - svc.cache_route("s3".to_string(), None, "model-c".to_string(), None) - .await; - - // s1 should be evicted (oldest); s2 and s3 should remain - assert!(svc.get_cached_route("s1", None).await.is_none()); - assert!(svc.get_cached_route("s2", None).await.is_some()); - assert!(svc.get_cached_route("s3", None).await.is_some()); - } - - #[tokio::test] - async fn test_cache_update_existing_session_does_not_evict() { - let svc = make_router_service(600, 2); - svc.cache_route("s1".to_string(), None, "model-a".to_string(), None) - .await; - svc.cache_route("s2".to_string(), None, "model-b".to_string(), None) - .await; - - svc.cache_route( - "s1".to_string(), - None, - "model-a-updated".to_string(), - Some("route".to_string()), - ) - .await; - - // Both sessions should still be present - let s1 = svc.get_cached_route("s1", None).await.unwrap(); - assert_eq!(s1.model_name, "model-a-updated"); - assert!(svc.get_cached_route("s2", None).await.is_some()); - } -} diff --git a/crates/brightstaff/src/router/mod.rs b/crates/brightstaff/src/router/mod.rs index 2d9d00a7..2ef0d11a 100644 --- a/crates/brightstaff/src/router/mod.rs +++ 
b/crates/brightstaff/src/router/mod.rs @@ -1,8 +1,5 @@ pub(crate) mod http; -pub mod llm; pub mod model_metrics; pub mod orchestrator; pub mod orchestrator_model; pub mod orchestrator_model_v1; -pub mod router_model; -pub mod router_model_v1; diff --git a/crates/brightstaff/src/router/orchestrator.rs b/crates/brightstaff/src/router/orchestrator.rs index c75aa64b..7aaf70a2 100644 --- a/crates/brightstaff/src/router/orchestrator.rs +++ b/crates/brightstaff/src/router/orchestrator.rs @@ -1,7 +1,7 @@ -use std::{collections::HashMap, sync::Arc}; +use std::{borrow::Cow, collections::HashMap, sync::Arc, time::Duration}; use common::{ - configuration::{AgentUsagePreference, OrchestrationPreference}, + configuration::{AgentUsagePreference, OrchestrationPreference, TopLevelRoutingPreference}, consts::{ARCH_PROVIDER_HINT_HEADER, REQUEST_ID_HEADER}, }; use hermesllm::apis::openai::Message; @@ -12,15 +12,26 @@ use thiserror::Error; use tracing::{debug, info}; use super::http::{self, post_and_extract_content}; +use super::model_metrics::ModelMetricsService; use super::orchestrator_model::OrchestratorModel; use crate::router::orchestrator_model_v1; +use crate::session_cache::SessionCache; + +pub use crate::session_cache::CachedRoute; + +const DEFAULT_SESSION_TTL_SECONDS: u64 = 600; pub struct OrchestratorService { orchestrator_url: String, client: reqwest::Client, orchestrator_model: Arc, orchestrator_provider_name: String, + top_level_preferences: HashMap, + metrics_service: Option>, + session_cache: Option>, + session_ttl: Duration, + tenant_header: Option, } #[derive(Debug, Error)] @@ -39,13 +50,12 @@ impl OrchestratorService { orchestrator_url: String, orchestration_model_name: String, orchestrator_provider_name: String, + max_token_length: usize, ) -> Self { - let agent_orchestrations: HashMap> = HashMap::new(); - let orchestrator_model = Arc::new(orchestrator_model_v1::OrchestratorModelV1::new( - agent_orchestrations, - orchestration_model_name.clone(), - 
orchestrator_model_v1::MAX_TOKEN_LEN, + HashMap::new(), + orchestration_model_name, + max_token_length, )); OrchestratorService { @@ -53,9 +63,182 @@ impl OrchestratorService { client: reqwest::Client::new(), orchestrator_model, orchestrator_provider_name, + top_level_preferences: HashMap::new(), + metrics_service: None, + session_cache: None, + session_ttl: Duration::from_secs(DEFAULT_SESSION_TTL_SECONDS), + tenant_header: None, } } + #[allow(clippy::too_many_arguments)] + pub fn with_routing( + orchestrator_url: String, + orchestration_model_name: String, + orchestrator_provider_name: String, + top_level_prefs: Option>, + metrics_service: Option>, + session_ttl_seconds: Option, + session_cache: Arc, + tenant_header: Option, + max_token_length: usize, + ) -> Self { + let top_level_preferences: HashMap = top_level_prefs + .map_or_else(HashMap::new, |prefs| { + prefs.into_iter().map(|p| (p.name.clone(), p)).collect() + }); + + let orchestrator_model = Arc::new(orchestrator_model_v1::OrchestratorModelV1::new( + HashMap::new(), + orchestration_model_name, + max_token_length, + )); + + let session_ttl = + Duration::from_secs(session_ttl_seconds.unwrap_or(DEFAULT_SESSION_TTL_SECONDS)); + + OrchestratorService { + orchestrator_url, + client: reqwest::Client::new(), + orchestrator_model, + orchestrator_provider_name, + top_level_preferences, + metrics_service, + session_cache: Some(session_cache), + session_ttl, + tenant_header, + } + } + + // ---- Session cache methods ---- + + #[must_use] + pub fn tenant_header(&self) -> Option<&str> { + self.tenant_header.as_deref() + } + + fn session_key<'a>(tenant_id: Option<&str>, session_id: &'a str) -> Cow<'a, str> { + match tenant_id { + Some(t) => Cow::Owned(format!("{t}:{session_id}")), + None => Cow::Borrowed(session_id), + } + } + + pub async fn get_cached_route( + &self, + session_id: &str, + tenant_id: Option<&str>, + ) -> Option { + let cache = self.session_cache.as_ref()?; + cache.get(&Self::session_key(tenant_id, 
session_id)).await + } + + pub async fn cache_route( + &self, + session_id: String, + tenant_id: Option<&str>, + model_name: String, + route_name: Option, + ) { + if let Some(ref cache) = self.session_cache { + cache + .put( + &Self::session_key(tenant_id, &session_id), + CachedRoute { + model_name, + route_name, + }, + self.session_ttl, + ) + .await; + } + } + + // ---- LLM routing ---- + + pub async fn determine_route( + &self, + messages: &[Message], + inline_routing_preferences: Option>, + request_id: &str, + ) -> Result)>> { + if messages.is_empty() { + return Ok(None); + } + + let inline_top_map: Option> = + inline_routing_preferences + .map(|prefs| prefs.into_iter().map(|p| (p.name.clone(), p)).collect()); + + if inline_top_map.is_none() && self.top_level_preferences.is_empty() { + return Ok(None); + } + + let effective_source = inline_top_map + .as_ref() + .unwrap_or(&self.top_level_preferences); + + let effective_prefs: Vec = effective_source + .values() + .map(|p| AgentUsagePreference { + model: p.models.first().cloned().unwrap_or_default(), + orchestration_preferences: vec![OrchestrationPreference { + name: p.name.clone(), + description: p.description.clone(), + }], + }) + .collect(); + + let orchestration_result = self + .determine_orchestration( + messages, + Some(effective_prefs), + Some(request_id.to_string()), + ) + .await?; + + let result = if let Some(ref routes) = orchestration_result { + if routes.len() > 1 { + let all_routes: Vec<&str> = routes.iter().map(|(name, _)| name.as_str()).collect(); + info!( + routes = ?all_routes, + using = %all_routes.first().unwrap_or(&"none"), + "plano-orchestrator detected multiple intents, using first" + ); + } + + if let Some((route_name, _)) = routes.first() { + let top_pref = inline_top_map + .as_ref() + .and_then(|m| m.get(route_name)) + .or_else(|| self.top_level_preferences.get(route_name)); + + if let Some(pref) = top_pref { + let ranked = match &self.metrics_service { + Some(svc) => 
svc.rank_models(&pref.models, &pref.selection_policy).await, + None => pref.models.clone(), + }; + Some((route_name.clone(), ranked)) + } else { + None + } + } else { + None + } + } else { + None + }; + + info!( + selected_model = ?result, + "plano-orchestrator determined route" + ); + + Ok(result) + } + + // ---- Agent orchestration (existing) ---- + pub async fn determine_orchestration( &self, messages: &[Message], @@ -80,12 +263,12 @@ impl OrchestratorService { debug!( model = %self.orchestrator_model.get_model_name(), endpoint = %self.orchestrator_url, - "sending request to arch-orchestrator" + "sending request to plano-orchestrator" ); let body = serde_json::to_string(&orchestrator_request) .map_err(super::orchestrator_model::OrchestratorModelError::from)?; - debug!(body = %body, "arch orchestrator request"); + debug!(body = %body, "plano-orchestrator request"); let mut headers = header::HeaderMap::new(); headers.insert( @@ -98,7 +281,6 @@ impl OrchestratorService { .unwrap_or_else(|_| header::HeaderValue::from_static("plano-orchestrator")), ); - // Inject OpenTelemetry trace context from current span global::get_text_map_propagator(|propagator| { let cx = tracing_opentelemetry::OpenTelemetrySpanExt::context(&tracing::Span::current()); @@ -130,9 +312,113 @@ impl OrchestratorService { content = %content.replace("\n", "\\n"), selected_routes = ?parsed, response_time_ms = elapsed.as_millis(), - "arch-orchestrator determined routes" + "plano-orchestrator determined routes" ); Ok(parsed) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::session_cache::memory::MemorySessionCache; + + fn make_orchestrator_service(ttl_seconds: u64, max_entries: usize) -> OrchestratorService { + let session_cache = Arc::new(MemorySessionCache::new(max_entries)); + OrchestratorService::with_routing( + "http://localhost:12001/v1/chat/completions".to_string(), + "Plano-Orchestrator".to_string(), + "plano-orchestrator".to_string(), + None, + None, + Some(ttl_seconds), + 
session_cache, + None, + orchestrator_model_v1::MAX_TOKEN_LEN, + ) + } + + #[tokio::test] + async fn test_cache_miss_returns_none() { + let svc = make_orchestrator_service(600, 100); + assert!(svc + .get_cached_route("unknown-session", None) + .await + .is_none()); + } + + #[tokio::test] + async fn test_cache_hit_returns_cached_route() { + let svc = make_orchestrator_service(600, 100); + svc.cache_route( + "s1".to_string(), + None, + "gpt-4o".to_string(), + Some("code".to_string()), + ) + .await; + + let cached = svc.get_cached_route("s1", None).await.unwrap(); + assert_eq!(cached.model_name, "gpt-4o"); + assert_eq!(cached.route_name, Some("code".to_string())); + } + + #[tokio::test] + async fn test_cache_expired_entry_returns_none() { + let svc = make_orchestrator_service(0, 100); + svc.cache_route("s1".to_string(), None, "gpt-4o".to_string(), None) + .await; + assert!(svc.get_cached_route("s1", None).await.is_none()); + } + + #[tokio::test] + async fn test_expired_entries_not_returned() { + let svc = make_orchestrator_service(0, 100); + svc.cache_route("s1".to_string(), None, "gpt-4o".to_string(), None) + .await; + svc.cache_route("s2".to_string(), None, "claude".to_string(), None) + .await; + + assert!(svc.get_cached_route("s1", None).await.is_none()); + assert!(svc.get_cached_route("s2", None).await.is_none()); + } + + #[tokio::test] + async fn test_cache_evicts_oldest_when_full() { + let svc = make_orchestrator_service(600, 2); + svc.cache_route("s1".to_string(), None, "model-a".to_string(), None) + .await; + tokio::time::sleep(Duration::from_millis(10)).await; + svc.cache_route("s2".to_string(), None, "model-b".to_string(), None) + .await; + + svc.cache_route("s3".to_string(), None, "model-c".to_string(), None) + .await; + + assert!(svc.get_cached_route("s1", None).await.is_none()); + assert!(svc.get_cached_route("s2", None).await.is_some()); + assert!(svc.get_cached_route("s3", None).await.is_some()); + } + + #[tokio::test] + async fn 
test_cache_update_existing_session_does_not_evict() { + let svc = make_orchestrator_service(600, 2); + svc.cache_route("s1".to_string(), None, "model-a".to_string(), None) + .await; + svc.cache_route("s2".to_string(), None, "model-b".to_string(), None) + .await; + + svc.cache_route( + "s1".to_string(), + None, + "model-a-updated".to_string(), + Some("route".to_string()), + ) + .await; + + let s1 = svc.get_cached_route("s1", None).await.unwrap(); + assert_eq!(s1.model_name, "model-a-updated"); + assert!(svc.get_cached_route("s2", None).await.is_some()); + } +} diff --git a/crates/brightstaff/src/router/orchestrator_model.rs b/crates/brightstaff/src/router/orchestrator_model.rs index 19c78ca3..a6b32b8e 100644 --- a/crates/brightstaff/src/router/orchestrator_model.rs +++ b/crates/brightstaff/src/router/orchestrator_model.rs @@ -11,8 +11,7 @@ pub enum OrchestratorModelError { pub type Result = std::result::Result; /// OrchestratorModel trait for handling orchestration requests. -/// Unlike RouterModel which returns a single route, OrchestratorModel -/// can return multiple routes as the model output format is: +/// Returns multiple routes as the model output format is: /// {"route": ["route_name_1", "route_name_2", ...]} pub trait OrchestratorModel: Send + Sync { fn generate_request( diff --git a/crates/brightstaff/src/router/orchestrator_model_v1.rs b/crates/brightstaff/src/router/orchestrator_model_v1.rs index ec4d2d12..75e5c586 100644 --- a/crates/brightstaff/src/router/orchestrator_model_v1.rs +++ b/crates/brightstaff/src/router/orchestrator_model_v1.rs @@ -8,7 +8,7 @@ use tracing::{debug, warn}; use super::orchestrator_model::{OrchestratorModel, OrchestratorModelError}; -pub const MAX_TOKEN_LEN: usize = 2048; // Default max token length for the orchestration model +pub const MAX_TOKEN_LEN: usize = 8192; // Default max token length for the orchestration model /// Custom JSON formatter that produces spaced JSON (space after colons and commas), same as JSON in python 
struct SpacedJsonFormatter; diff --git a/crates/brightstaff/src/router/router_model.rs b/crates/brightstaff/src/router/router_model.rs deleted file mode 100644 index 4fe023a3..00000000 --- a/crates/brightstaff/src/router/router_model.rs +++ /dev/null @@ -1,39 +0,0 @@ -use hermesllm::apis::openai::{ChatCompletionsRequest, Message}; -use serde::{Deserialize, Serialize}; -use thiserror::Error; - -#[derive(Debug, Error)] -pub enum RoutingModelError { - #[error("Failed to parse JSON: {0}")] - JsonError(#[from] serde_json::Error), -} - -pub type Result = std::result::Result; - -/// Internal route descriptor passed to the router model to build its prompt. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct RoutingPreference { - pub name: String, - pub description: String, -} - -/// Groups a model with its routing preferences (used internally by RouterModelV1). -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ModelUsagePreference { - pub model: String, - pub routing_preferences: Vec, -} - -pub trait RouterModel: Send + Sync { - fn generate_request( - &self, - messages: &[Message], - usage_preferences: &Option>, - ) -> ChatCompletionsRequest; - fn parse_response( - &self, - content: &str, - usage_preferences: &Option>, - ) -> Result>; - fn get_model_name(&self) -> String; -} diff --git a/crates/brightstaff/src/router/router_model_v1.rs b/crates/brightstaff/src/router/router_model_v1.rs deleted file mode 100644 index e1189c94..00000000 --- a/crates/brightstaff/src/router/router_model_v1.rs +++ /dev/null @@ -1,842 +0,0 @@ -use std::collections::HashMap; - -use super::router_model::{ModelUsagePreference, RoutingPreference}; -use hermesllm::apis::openai::{ChatCompletionsRequest, Message, MessageContent, Role}; -use hermesllm::transforms::lib::ExtractText; -use serde::{Deserialize, Serialize}; -use tracing::{debug, warn}; - -use super::router_model::{RouterModel, RoutingModelError}; - -pub const MAX_TOKEN_LEN: usize = 2048; // Default max token length for 
the routing model -pub const ARCH_ROUTER_V1_SYSTEM_PROMPT: &str = r#" -You are a helpful assistant designed to find the best suited route. -You are provided with route description within XML tags: - -{routes} - - - -{conversation} - - -Your task is to decide which route is best suit with user intent on the conversation in XML tags. Follow the instruction: -1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}. -2. You must analyze the route descriptions and find the best match route for user latest intent. -3. You only response the name of the route that best matches the user's request, use the exact name in the . - -Based on your analysis, provide your response in the following JSON formats if you decide to match any route: -{"route": "route_name"} -"#; - -pub type Result = std::result::Result; -pub struct RouterModelV1 { - llm_route_json_str: String, - llm_route_to_model_map: HashMap, - routing_model: String, - max_token_length: usize, -} -impl RouterModelV1 { - pub fn new( - llm_routes: HashMap>, - routing_model: String, - max_token_length: usize, - ) -> Self { - let llm_route_values: Vec = - llm_routes.values().flatten().cloned().collect(); - let llm_route_json_str = - serde_json::to_string(&llm_route_values).unwrap_or_else(|_| "[]".to_string()); - let llm_route_to_model_map: HashMap = llm_routes - .iter() - .flat_map(|(model, prefs)| prefs.iter().map(|pref| (pref.name.clone(), model.clone()))) - .collect(); - - RouterModelV1 { - routing_model, - max_token_length, - llm_route_json_str, - llm_route_to_model_map, - } - } -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -struct LlmRouterResponse { - pub route: Option, -} - -const TOKEN_LENGTH_DIVISOR: usize = 4; // Approximate token length divisor for UTF-8 characters - -impl RouterModel for RouterModelV1 { - fn generate_request( - &self, - messages: &[Message], - usage_preferences_from_request: &Option>, - ) -> ChatCompletionsRequest { - 
// remove system prompt, tool calls, tool call response and messages without content - // if content is empty its likely a tool call - // when role == tool its tool call response - let messages_vec = messages - .iter() - .filter(|m| { - m.role != Role::System - && m.role != Role::Developer - && m.role != Role::Tool - && !m.content.extract_text().is_empty() - }) - .collect::>(); - - // Following code is to ensure that the conversation does not exceed max token length - // Note: we use a simple heuristic to estimate token count based on character length to optimize for performance - let mut token_count = ARCH_ROUTER_V1_SYSTEM_PROMPT.len() / TOKEN_LENGTH_DIVISOR; - let mut selected_messages_list_reversed: Vec<&Message> = vec![]; - for (selected_messsage_count, message) in messages_vec.iter().rev().enumerate() { - let message_token_count = message.content.extract_text().len() / TOKEN_LENGTH_DIVISOR; - token_count += message_token_count; - if token_count > self.max_token_length { - debug!( - token_count = token_count, - max_tokens = self.max_token_length, - selected = selected_messsage_count, - total = messages_vec.len(), - "token count exceeds max, truncating conversation" - ); - if message.role == Role::User { - // If message that exceeds max token length is from user, we need to keep it - selected_messages_list_reversed.push(message); - } - break; - } - // If we are here, it means that the message is within the max token length - selected_messages_list_reversed.push(message); - } - - if selected_messages_list_reversed.is_empty() { - debug!("no messages selected, using last message"); - if let Some(last_message) = messages_vec.last() { - selected_messages_list_reversed.push(last_message); - } - } - - // ensure that first and last selected message is from user - if let Some(first_message) = selected_messages_list_reversed.first() { - if first_message.role != Role::User { - warn!("last message is not from user, may lead to incorrect routing"); - } - } - if let 
Some(last_message) = selected_messages_list_reversed.last() { - if last_message.role != Role::User { - warn!("first message is not from user, may lead to incorrect routing"); - } - } - - // Reverse the selected messages to maintain the conversation order - let selected_conversation_list = selected_messages_list_reversed - .iter() - .rev() - .map(|message| { - Message { - role: message.role.clone(), - // we can unwrap here because we have already filtered out messages without content - content: Some(MessageContent::Text( - message - .content - .as_ref() - .map_or(String::new(), |c| c.to_string()), - )), - name: None, - tool_calls: None, - tool_call_id: None, - } - }) - .collect::>(); - - // Generate the router request message based on the usage preferences. - // If preferences are passed in request then we use them otherwise we use the default routing model preferences. - let router_message = match convert_to_router_preferences(usage_preferences_from_request) { - Some(prefs) => generate_router_message(&prefs, &selected_conversation_list), - None => generate_router_message(&self.llm_route_json_str, &selected_conversation_list), - }; - - ChatCompletionsRequest { - model: self.routing_model.clone(), - messages: vec![Message { - content: Some(MessageContent::Text(router_message)), - role: Role::User, - name: None, - tool_calls: None, - tool_call_id: None, - }], - temperature: Some(0.01), - ..Default::default() - } - } - - fn parse_response( - &self, - content: &str, - usage_preferences: &Option>, - ) -> Result> { - if content.is_empty() { - return Ok(None); - } - let router_resp_fixed = fix_json_response(content); - let router_response: LlmRouterResponse = serde_json::from_str(router_resp_fixed.as_str())?; - - let selected_route = router_response.route.unwrap_or_default().to_string(); - - if selected_route.is_empty() || selected_route == "other" { - return Ok(None); - } - - if let Some(usage_preferences) = usage_preferences { - // If usage preferences are defined, we 
need to find the model that matches the selected route - let model_name: Option = usage_preferences - .iter() - .map(|pref| { - pref.routing_preferences - .iter() - .find(|routing_pref| routing_pref.name == selected_route) - .map(|_| pref.model.clone()) - }) - .find_map(|model| model); - - if let Some(model_name) = model_name { - return Ok(Some((selected_route, model_name))); - } else { - warn!( - route = %selected_route, - preferences = ?usage_preferences, - "no matching model found for route" - ); - return Ok(None); - } - } - - // If no usage preferences are passed in request then use the default routing model preferences - if let Some(model) = self.llm_route_to_model_map.get(&selected_route).cloned() { - return Ok(Some((selected_route, model))); - } - - warn!( - route = %selected_route, - preferences = ?self.llm_route_to_model_map, - "no model found for route" - ); - - Ok(None) - } - - fn get_model_name(&self) -> String { - self.routing_model.clone() - } -} - -fn generate_router_message(prefs: &str, selected_conversation_list: &Vec) -> String { - ARCH_ROUTER_V1_SYSTEM_PROMPT - .replace("{routes}", prefs) - .replace( - "{conversation}", - &serde_json::to_string(&selected_conversation_list).unwrap_or_default(), - ) -} - -fn convert_to_router_preferences( - prefs_from_request: &Option>, -) -> Option { - if let Some(usage_preferences) = prefs_from_request { - let routing_preferences = usage_preferences - .iter() - .flat_map(|pref| { - pref.routing_preferences - .iter() - .map(|routing_pref| RoutingPreference { - name: routing_pref.name.clone(), - description: routing_pref.description.clone(), - }) - }) - .collect::>(); - - return Some(serde_json::to_string(&routing_preferences).unwrap_or_default()); - } - - None -} - -fn fix_json_response(body: &str) -> String { - let mut updated_body = body.to_string(); - - updated_body = updated_body.replace("'", "\""); - - if updated_body.contains("\\n") { - updated_body = updated_body.replace("\\n", ""); - } - - if 
updated_body.starts_with("```json") { - updated_body = updated_body - .strip_prefix("```json") - .unwrap_or(&updated_body) - .to_string(); - } - - if updated_body.ends_with("```") { - updated_body = updated_body - .strip_suffix("```") - .unwrap_or(&updated_body) - .to_string(); - } - - updated_body -} - -impl std::fmt::Debug for dyn RouterModel { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "RouterModel") - } -} - -#[cfg(test)] -mod tests { - use super::*; - use pretty_assertions::assert_eq; - - #[test] - fn test_system_prompt_format() { - let expected_prompt = r#" -You are a helpful assistant designed to find the best suited route. -You are provided with route description within XML tags: - -[{"name":"Image generation","description":"generating image"}] - - - -[{"role":"user","content":"hi"},{"role":"assistant","content":"Hello! How can I assist you today?"},{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}] - - -Your task is to decide which route is best suit with user intent on the conversation in XML tags. Follow the instruction: -1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}. -2. You must analyze the route descriptions and find the best match route for user latest intent. -3. You only response the name of the route that best matches the user's request, use the exact name in the . 
- -Based on your analysis, provide your response in the following JSON formats if you decide to match any route: -{"route": "route_name"} -"#; - let routes_str = r#" - { - "gpt-4o": [ - {"name": "Image generation", "description": "generating image"} - ] - } - "#; - let llm_routes = - serde_json::from_str::>>(routes_str).unwrap(); - let routing_model = "test-model".to_string(); - let router = RouterModelV1::new(llm_routes, routing_model, usize::MAX); - - let conversation_str = r#" - [ - { - "role": "user", - "content": "hi" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson" - } - ] - "#; - let conversation: Vec = serde_json::from_str(conversation_str).unwrap(); - - let req = router.generate_request(&conversation, &None); - - let prompt = req.messages[0].content.extract_text(); - - assert_eq!(expected_prompt, prompt); - } - - #[test] - fn test_system_prompt_format_usage_preferences() { - let expected_prompt = r#" -You are a helpful assistant designed to find the best suited route. -You are provided with route description within XML tags: - -[{"name":"code-generation","description":"generating new code snippets, functions, or boilerplate based on user prompts or requirements"}] - - - -[{"role":"user","content":"hi"},{"role":"assistant","content":"Hello! How can I assist you today?"},{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}] - - -Your task is to decide which route is best suit with user intent on the conversation in XML tags. Follow the instruction: -1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}. -2. You must analyze the route descriptions and find the best match route for user latest intent. -3. 
You only response the name of the route that best matches the user's request, use the exact name in the . - -Based on your analysis, provide your response in the following JSON formats if you decide to match any route: -{"route": "route_name"} -"#; - let routes_str = r#" - { - "gpt-4o": [ - {"name": "Image generation", "description": "generating image"} - ] - } - "#; - let llm_routes = - serde_json::from_str::>>(routes_str).unwrap(); - let routing_model = "test-model".to_string(); - let router = RouterModelV1::new(llm_routes, routing_model, usize::MAX); - - let conversation_str = r#" - [ - { - "role": "user", - "content": "hi" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson" - } - ] - "#; - let conversation: Vec = serde_json::from_str(conversation_str).unwrap(); - - let usage_preferences = Some(vec![ModelUsagePreference { - model: "claude/claude-3-7-sonnet".to_string(), - routing_preferences: vec![RoutingPreference { - name: "code-generation".to_string(), - description: "generating new code snippets, functions, or boilerplate based on user prompts or requirements".to_string(), - }], - }]); - let req = router.generate_request(&conversation, &usage_preferences); - - let prompt = req.messages[0].content.extract_text(); - - assert_eq!(expected_prompt, prompt); - } - - #[test] - fn test_conversation_exceed_token_count() { - let expected_prompt = r#" -You are a helpful assistant designed to find the best suited route. -You are provided with route description within XML tags: - -[{"name":"Image generation","description":"generating image"}] - - - -[{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}] - - -Your task is to decide which route is best suit with user intent on the conversation in XML tags. Follow the instruction: -1. 
If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}. -2. You must analyze the route descriptions and find the best match route for user latest intent. -3. You only response the name of the route that best matches the user's request, use the exact name in the . - -Based on your analysis, provide your response in the following JSON formats if you decide to match any route: -{"route": "route_name"} -"#; - - let routes_str = r#" - { - "gpt-4o": [ - {"name": "Image generation", "description": "generating image"} - ] - } - "#; - let llm_routes = - serde_json::from_str::>>(routes_str).unwrap(); - let routing_model = "test-model".to_string(); - let router = RouterModelV1::new(llm_routes, routing_model, 235); - - let conversation_str = r#" - [ - { - "role": "user", - "content": "hi" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson" - } - ] - "#; - - let conversation: Vec = serde_json::from_str(conversation_str).unwrap(); - - let req = router.generate_request(&conversation, &None); - - let prompt = req.messages[0].content.extract_text(); - - assert_eq!(expected_prompt, prompt); - } - - #[test] - fn test_conversation_exceed_token_count_large_single_message() { - let expected_prompt = r#" -You are a helpful assistant designed to find the best suited route. 
-You are provided with route description within XML tags: - -[{"name":"Image generation","description":"generating image"}] - - - -[{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson and this is a very long message that exceeds the max token length of the routing model, so it should be truncated and only the last user message should be included in the conversation for routing."}] - - -Your task is to decide which route is best suit with user intent on the conversation in XML tags. Follow the instruction: -1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}. -2. You must analyze the route descriptions and find the best match route for user latest intent. -3. You only response the name of the route that best matches the user's request, use the exact name in the . - -Based on your analysis, provide your response in the following JSON formats if you decide to match any route: -{"route": "route_name"} -"#; - - let routes_str = r#" - { - "gpt-4o": [ - {"name": "Image generation", "description": "generating image"} - ] - } - "#; - let llm_routes = - serde_json::from_str::>>(routes_str).unwrap(); - - let routing_model = "test-model".to_string(); - let router = RouterModelV1::new(llm_routes, routing_model, 200); - - let conversation_str = r#" - [ - { - "role": "user", - "content": "hi" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson and this is a very long message that exceeds the max token length of the routing model, so it should be truncated and only the last user message should be included in the conversation for routing." 
- } - ] - "#; - - let conversation: Vec = serde_json::from_str(conversation_str).unwrap(); - - let req = router.generate_request(&conversation, &None); - - let prompt = req.messages[0].content.extract_text(); - - assert_eq!(expected_prompt, prompt); - } - - #[test] - fn test_conversation_trim_upto_user_message() { - let expected_prompt = r#" -You are a helpful assistant designed to find the best suited route. -You are provided with route description within XML tags: - -[{"name":"Image generation","description":"generating image"}] - - - -[{"role":"user","content":"given the image In style of Andy Warhol"},{"role":"assistant","content":"ok here is the image"},{"role":"user","content":"pls give me another image about Bart and Lisa"}] - - -Your task is to decide which route is best suit with user intent on the conversation in XML tags. Follow the instruction: -1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}. -2. You must analyze the route descriptions and find the best match route for user latest intent. -3. You only response the name of the route that best matches the user's request, use the exact name in the . - -Based on your analysis, provide your response in the following JSON formats if you decide to match any route: -{"route": "route_name"} -"#; - - let routes_str = r#" - { - "gpt-4o": [ - {"name": "Image generation", "description": "generating image"} - ] - } - "#; - let llm_routes = - serde_json::from_str::>>(routes_str).unwrap(); - let routing_model = "test-model".to_string(); - let router = RouterModelV1::new(llm_routes, routing_model, 230); - - let conversation_str = r#" - [ - { - "role": "user", - "content": "hi" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" 
- }, - { - "role": "user", - "content": "given the image In style of Andy Warhol" - }, - { - "role": "assistant", - "content": "ok here is the image" - }, - { - "role": "user", - "content": "pls give me another image about Bart and Lisa" - } - ] - "#; - - let conversation: Vec = serde_json::from_str(conversation_str).unwrap(); - - let req = router.generate_request(&conversation, &None); - - let prompt = req.messages[0].content.extract_text(); - - assert_eq!(expected_prompt, prompt); - } - - #[test] - fn test_non_text_input() { - let expected_prompt = r#" -You are a helpful assistant designed to find the best suited route. -You are provided with route description within XML tags: - -[{"name":"Image generation","description":"generating image"}] - - - -[{"role":"user","content":"hi"},{"role":"assistant","content":"Hello! How can I assist you today?"},{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}] - - -Your task is to decide which route is best suit with user intent on the conversation in XML tags. Follow the instruction: -1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}. -2. You must analyze the route descriptions and find the best match route for user latest intent. -3. You only response the name of the route that best matches the user's request, use the exact name in the . 
- -Based on your analysis, provide your response in the following JSON formats if you decide to match any route: -{"route": "route_name"} -"#; - let routes_str = r#" - { - "gpt-4o": [ - {"name": "Image generation", "description": "generating image"} - ] - } - "#; - let llm_routes = - serde_json::from_str::>>(routes_str).unwrap(); - let routing_model = "test-model".to_string(); - let router = RouterModelV1::new(llm_routes, routing_model, usize::MAX); - - let conversation_str = r#" - [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "hi" - }, - { - "type": "image_url", - "image_url": { - "url": "https://example.com/image.png" - } - } - ] - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson" - } - ] - "#; - let conversation: Vec = serde_json::from_str(conversation_str).unwrap(); - - let req = router.generate_request(&conversation, &None); - - let prompt = req.messages[0].content.extract_text(); - - assert_eq!(expected_prompt, prompt); - } - - #[test] - fn test_skip_tool_call() { - let expected_prompt = r#" -You are a helpful assistant designed to find the best suited route. -You are provided with route description within XML tags: - -[{"name":"Image generation","description":"generating image"}] - - - -[{"role":"user","content":"What's the weather like in Tokyo?"},{"role":"assistant","content":"The current weather in Tokyo is 22°C and sunny."},{"role":"user","content":"What about in New York?"}] - - -Your task is to decide which route is best suit with user intent on the conversation in XML tags. Follow the instruction: -1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}. -2. You must analyze the route descriptions and find the best match route for user latest intent. -3. 
You only response the name of the route that best matches the user's request, use the exact name in the . - -Based on your analysis, provide your response in the following JSON formats if you decide to match any route: -{"route": "route_name"} -"#; - let routes_str = r#" - { - "gpt-4o": [ - {"name": "Image generation", "description": "generating image"} - ] - } - "#; - let llm_routes = - serde_json::from_str::>>(routes_str).unwrap(); - let routing_model = "test-model".to_string(); - let router = RouterModelV1::new(llm_routes, routing_model, usize::MAX); - - let conversation_str = r#" - [ - { - "role": "user", - "content": "What's the weather like in Tokyo?" - }, - { - "role": "assistant", - "content": "", - "tool_calls": [ - { - "id": "toolcall-abc123", - "type": "function", - "function": { - "name": "get_weather", - "arguments": "{ \"location\": \"Tokyo\" }" - } - } - ] - }, - { - "role": "tool", - "tool_call_id": "toolcall-abc123", - "content": "{ \"temperature\": \"22°C\", \"condition\": \"Sunny\" }" - }, - { - "role": "assistant", - "content": "The current weather in Tokyo is 22°C and sunny." - }, - { - "role": "user", - "content": "What about in New York?" - } - ] - "#; - - // expects conversation to look like this - - // [ - // { - // "role": "user", - // "content": "What's the weather like in Tokyo?" - // }, - // { - // "role": "assistant", - // "content": "The current weather in Tokyo is 22°C and sunny." - // }, - // { - // "role": "user", - // "content": "What about in New York?" 
- // } - // ] - - let conversation: Vec = serde_json::from_str(conversation_str).unwrap(); - - let req: ChatCompletionsRequest = router.generate_request(&conversation, &None); - - let prompt = req.messages[0].content.extract_text(); - - assert_eq!(expected_prompt, prompt); - } - - #[test] - fn test_parse_response() { - let routes_str = r#" - { - "gpt-4o": [ - {"name": "Image generation", "description": "generating image"} - ] - } - "#; - let llm_routes = - serde_json::from_str::>>(routes_str).unwrap(); - - let router = RouterModelV1::new(llm_routes, "test-model".to_string(), 2000); - - // Case 1: Valid JSON with non-empty route - let input = r#"{"route": "Image generation"}"#; - let result = router.parse_response(input, &None).unwrap(); - assert_eq!( - result, - Some(("Image generation".to_string(), "gpt-4o".to_string())) - ); - - // Case 2: Valid JSON with empty route - let input = r#"{"route": ""}"#; - let result = router.parse_response(input, &None).unwrap(); - assert_eq!(result, None); - - // Case 3: Valid JSON with null route - let input = r#"{"route": null}"#; - let result = router.parse_response(input, &None).unwrap(); - assert_eq!(result, None); - - // Case 4: JSON missing route field - let input = r#"{}"#; - let result = router.parse_response(input, &None).unwrap(); - assert_eq!(result, None); - - // Case 4.1: empty string - let input = r#""#; - let result = router.parse_response(input, &None).unwrap(); - assert_eq!(result, None); - - // Case 5: Malformed JSON - let input = r#"{"route": "route1""#; // missing closing } - let result = router.parse_response(input, &None); - assert!(result.is_err()); - - // Case 6: Single quotes and \n in JSON - let input = "{'route': 'Image generation'}\\n"; - let result = router.parse_response(input, &None).unwrap(); - assert_eq!( - result, - Some(("Image generation".to_string(), "gpt-4o".to_string())) - ); - - // Case 7: Code block marker - let input = "```json\n{\"route\": \"Image generation\"}\n```"; - let result = 
router.parse_response(input, &None).unwrap(); - assert_eq!( - result, - Some(("Image generation".to_string(), "gpt-4o".to_string())) - ); - } -} diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs index 10114274..125a986d 100644 --- a/crates/common/src/configuration.rs +++ b/crates/common/src/configuration.rs @@ -233,6 +233,7 @@ pub struct Overrides { pub use_agent_orchestrator: Option, pub llm_routing_model: Option, pub agent_orchestration_model: Option, + pub orchestrator_model_context_length: Option, } #[derive(Debug, Clone, Serialize, Deserialize, Default)] @@ -729,13 +730,6 @@ mod test { internal: None, ..Default::default() }, - LlmProvider { - name: "arch-router".to_string(), - provider_interface: LlmProviderType::Plano, - model: Some("Arch-Router".to_string()), - internal: Some(true), - ..Default::default() - }, LlmProvider { name: "plano-orchestrator".to_string(), provider_interface: LlmProviderType::Plano, @@ -747,13 +741,10 @@ mod test { let models = providers.into_models(); - // Should only have 1 model: openai-gpt4 assert_eq!(models.data.len(), 1); - // Verify internal models are excluded from /v1/models let model_ids: Vec = models.data.iter().map(|m| m.id.clone()).collect(); assert!(model_ids.contains(&"openai-gpt4".to_string())); - assert!(!model_ids.contains(&"arch-router".to_string())); assert!(!model_ids.contains(&"plano-orchestrator".to_string())); } } diff --git a/demos/llm_routing/claude_code_router/pretty_model_resolution.sh b/demos/llm_routing/claude_code_router/pretty_model_resolution.sh index b6187e65..3902a63e 100644 --- a/demos/llm_routing/claude_code_router/pretty_model_resolution.sh +++ b/demos/llm_routing/claude_code_router/pretty_model_resolution.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # Pretty-print Plano MODEL_RESOLUTION lines from docker logs -# - hides Arch-Router +# - hides Plano-Orchestrator # - prints timestamp # - colors MODEL_RESOLUTION red # - colors req_model cyan @@ -9,7 +9,7 @@ docker logs 
-f plano 2>&1 \ | awk ' -/MODEL_RESOLUTION:/ && $0 !~ /Arch-Router/ { +/MODEL_RESOLUTION:/ && $0 !~ /Plano-Orchestrator/ { # extract timestamp between first [ and ] ts="" if (match($0, /\[[0-9-]+ [0-9:.]+\]/)) { diff --git a/demos/llm_routing/codex_router/pretty_model_resolution.sh b/demos/llm_routing/codex_router/pretty_model_resolution.sh index b6187e65..3902a63e 100644 --- a/demos/llm_routing/codex_router/pretty_model_resolution.sh +++ b/demos/llm_routing/codex_router/pretty_model_resolution.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # Pretty-print Plano MODEL_RESOLUTION lines from docker logs -# - hides Arch-Router +# - hides Plano-Orchestrator # - prints timestamp # - colors MODEL_RESOLUTION red # - colors req_model cyan @@ -9,7 +9,7 @@ docker logs -f plano 2>&1 \ | awk ' -/MODEL_RESOLUTION:/ && $0 !~ /Arch-Router/ { +/MODEL_RESOLUTION:/ && $0 !~ /Plano-Orchestrator/ { # extract timestamp between first [ and ] ts="" if (match($0, /\[[0-9-]+ [0-9:.]+\]/)) { diff --git a/demos/llm_routing/model_routing_service/README.md b/demos/llm_routing/model_routing_service/README.md index 4687b47c..eaec32c7 100644 --- a/demos/llm_routing/model_routing_service/README.md +++ b/demos/llm_routing/model_routing_service/README.md @@ -6,7 +6,7 @@ Plano is an AI-native proxy and data plane for agentic apps — with built-in or ┌───────────┐ ┌─────────────────────────────────┐ ┌──────────────┐ │ Client │ ───► │ Plano │ ───► │ OpenAI │ │ (any │ │ │ │ Anthropic │ -│ language)│ │ Arch-Router (1.5B model) │ │ Any Provider│ +│ language)│ │ Plano-Orchestrator │ │ Any Provider│ └───────────┘ │ analyzes intent → picks model │ └──────────────┘ └─────────────────────────────────┘ ``` @@ -39,17 +39,17 @@ routing_preferences: When a request arrives, Plano: -1. Sends the conversation + route descriptions to Arch-Router for intent classification +1. Sends the conversation + route descriptions to Plano-Orchestrator for intent classification 2. 
Looks up the matched route and returns its candidate models 3. Returns an ordered list — client uses `models[0]`, falls back to `models[1]` on 429/5xx ``` 1. Request arrives → "Write binary search in Python" -2. Arch-Router classifies → route: "code_generation" +2. Plano-Orchestrator classifies → route: "code_generation" 3. Response → models: ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o"] ``` -No match? Arch-Router returns `null` route → client falls back to the model in the original request. +No match? Plano-Orchestrator returns an empty route → client falls back to the model in the original request. The `/routing/v1/*` endpoints return the routing decision **without** forwarding to the LLM — useful for testing routing behavior before going to production. @@ -163,9 +163,9 @@ routing: Without the `X-Model-Affinity` header, routing runs fresh every time (no breaking change). -## Kubernetes Deployment (Self-hosted Arch-Router on GPU) +## Kubernetes Deployment (Self-hosted Plano-Orchestrator on GPU) -To run Arch-Router in-cluster using vLLM instead of the default hosted endpoint: +To run Plano-Orchestrator in-cluster using vLLM instead of the default hosted endpoint: **0. Check your GPU node labels and taints** @@ -176,10 +176,10 @@ kubectl get node -o jsonpath='{.spec.taints}' GPU nodes commonly have a `nvidia.com/gpu:NoSchedule` taint — `vllm-deployment.yaml` includes a matching toleration. If you have multiple GPU node pools and need to pin to a specific one, uncomment and set the `nodeSelector` in `vllm-deployment.yaml` using the label for your cloud provider. -**1. Deploy Arch-Router and Plano:** +**1. Deploy Plano-Orchestrator and Plano:** ```bash -# arch-router deployment +# plano-orchestrator deployment kubectl apply -f vllm-deployment.yaml # plano deployment @@ -197,8 +197,8 @@ kubectl apply -f plano-deployment.yaml **3. 
Wait for both pods to be ready:** ```bash -# Arch-Router downloads the model (~1 min) then vLLM loads it (~2 min) -kubectl get pods -l app=arch-router -w +# Plano-Orchestrator downloads the model (~1 min) then vLLM loads it (~2 min) +kubectl get pods -l app=plano-orchestrator -w kubectl rollout status deployment/plano ``` @@ -209,10 +209,10 @@ kubectl port-forward svc/plano 12000:12000 ./demo.sh ``` -To confirm requests are hitting your in-cluster Arch-Router (not just health checks): +To confirm requests are hitting your in-cluster Plano-Orchestrator (not just health checks): ```bash -kubectl logs -l app=arch-router -f --tail=0 +kubectl logs -l app=plano-orchestrator -f --tail=0 # Look for POST /v1/chat/completions entries ``` diff --git a/demos/llm_routing/model_routing_service/config_k8s.yaml b/demos/llm_routing/model_routing_service/config_k8s.yaml index bdf98bfa..49f452a9 100644 --- a/demos/llm_routing/model_routing_service/config_k8s.yaml +++ b/demos/llm_routing/model_routing_service/config_k8s.yaml @@ -1,7 +1,7 @@ version: v0.3.0 overrides: - llm_routing_model: plano/Arch-Router + llm_routing_model: plano/Plano-Orchestrator listeners: - type: model @@ -10,8 +10,8 @@ listeners: model_providers: - - model: plano/Arch-Router - base_url: http://arch-router:10000 + - model: plano/Plano-Orchestrator + base_url: http://plano-orchestrator:10000 - model: openai/gpt-4o-mini access_key: $OPENAI_API_KEY diff --git a/demos/llm_routing/model_routing_service/vllm-deployment.yaml b/demos/llm_routing/model_routing_service/vllm-deployment.yaml index 1debe15e..b384b1c4 100644 --- a/demos/llm_routing/model_routing_service/vllm-deployment.yaml +++ b/demos/llm_routing/model_routing_service/vllm-deployment.yaml @@ -1,18 +1,18 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: arch-router + name: plano-orchestrator labels: - app: arch-router + app: plano-orchestrator spec: replicas: 1 selector: matchLabels: - app: arch-router + app: plano-orchestrator template: metadata: 
labels: - app: arch-router + app: plano-orchestrator spec: tolerations: - key: nvidia.com/gpu @@ -53,7 +53,7 @@ spec: - "--tokenizer" - "katanemo/Arch-Router-1.5B" - "--served-model-name" - - "Arch-Router" + - "Plano-Orchestrator" - "--gpu-memory-utilization" - "0.3" - "--tensor-parallel-size" @@ -94,10 +94,10 @@ spec: apiVersion: v1 kind: Service metadata: - name: arch-router + name: plano-orchestrator spec: selector: - app: arch-router + app: plano-orchestrator ports: - name: http port: 10000 diff --git a/demos/llm_routing/openclaw_routing/config.yaml b/demos/llm_routing/openclaw_routing/config.yaml index 9690e747..aed0a2c1 100644 --- a/demos/llm_routing/openclaw_routing/config.yaml +++ b/demos/llm_routing/openclaw_routing/config.yaml @@ -1,7 +1,7 @@ version: v0.1.0 overrides: - llm_routing_model: Arch-Router + llm_routing_model: Plano-Orchestrator listeners: egress_traffic: diff --git a/demos/llm_routing/preference_based_routing/README.md b/demos/llm_routing/preference_based_routing/README.md index 7b8d3b25..533e4906 100644 --- a/demos/llm_routing/preference_based_routing/README.md +++ b/demos/llm_routing/preference_based_routing/README.md @@ -32,9 +32,9 @@ planoai up config.yaml 3. Test with curl or open AnythingLLM http://localhost:3001/ -## Running with local Arch-Router (via Ollama) +## Running with local routing model (via Ollama) -By default, Plano uses a hosted Arch-Router endpoint. To self-host Arch-Router locally using Ollama: +By default, Plano uses a hosted Plano-Orchestrator endpoint. To self-host a routing model locally using Ollama: 1. 
Install [Ollama](https://ollama.ai) and pull the model: ```bash diff --git a/demos/llm_routing/preference_based_routing/test_router_endpoint.rest b/demos/llm_routing/preference_based_routing/test_router_endpoint.rest index 72686a70..13a3f924 100644 --- a/demos/llm_routing/preference_based_routing/test_router_endpoint.rest +++ b/demos/llm_routing/preference_based_routing/test_router_endpoint.rest @@ -22,11 +22,11 @@ Content-Type: application/json ### get model list from arch-function GET https://archfc.katanemo.dev/v1/models HTTP/1.1 -model: Arch-Router +model: Plano-Orchestrator -### get model list from Arch-Router (notice model header) +### get model list from Plano-Orchestrator (notice model header) GET https://archfc.katanemo.dev/v1/models HTTP/1.1 -model: Arch-Router +model: Plano-Orchestrator ### test try code generating diff --git a/docs/source/guides/llm_router.rst b/docs/source/guides/llm_router.rst index 25b78db5..5539dddc 100644 --- a/docs/source/guides/llm_router.rst +++ b/docs/source/guides/llm_router.rst @@ -133,16 +133,16 @@ Clients use semantic names: .. _preference_aligned_routing: -Preference-aligned routing (Arch-Router) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Preference-aligned routing (Plano-Orchestrator) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Preference-aligned routing uses the `Arch-Router `_ model to pick the best LLM based on domain, action, and your configured preferences instead of hard-coding a model. +Preference-aligned routing uses the `Plano-Orchestrator `_ model to pick the best LLM based on domain, action, and your configured preferences instead of hard-coding a model. - **Domain**: High-level topic of the request (e.g., legal, healthcare, programming). - **Action**: What the user wants to do (e.g., summarize, generate code, translate). - **Routing preferences**: Your mapping from (domain, action) to preferred models. 
-Arch-Router analyzes each prompt to infer domain and action, then applies your preferences to select a model. This decouples **routing policy** (how to choose) from **model assignment** (what to run), making routing transparent, controllable, and easy to extend as you add or swap models. +Plano-Orchestrator analyzes each prompt to infer domain and action, then applies your preferences to select a model. This decouples **routing policy** (how to choose) from **model assignment** (what to run), making routing transparent, controllable, and easy to extend as you add or swap models. Configuration ^^^^^^^^^^^^^ @@ -187,21 +187,21 @@ Clients can let the router decide or still specify aliases: .. code-block:: python - # Let Arch-Router choose based on content + # Let Plano-Orchestrator choose based on content response = client.chat.completions.create( messages=[{"role": "user", "content": "Write a creative story about space exploration"}] # No model specified - router will analyze and choose claude-sonnet-4-5 ) -Arch-Router ------------ -The `Arch-Router `_ is a state-of-the-art **preference-based routing model** specifically designed to address the limitations of traditional LLM routing. This compact 1.5B model delivers production-ready performance with low latency and high accuracy while solving key routing challenges. +Plano-Orchestrator +------------------- +Plano-Orchestrator is a **preference-based routing model** specifically designed to address the limitations of traditional LLM routing. It delivers production-ready performance with low latency and high accuracy while solving key routing challenges. **Addressing Traditional Routing Limitations:** **Human Preference Alignment** -Unlike benchmark-driven approaches, Arch-Router learns to match queries with human preferences by using domain-action mappings that capture subjective evaluation criteria, ensuring routing decisions align with real-world user needs. 
+Unlike benchmark-driven approaches, Plano-Orchestrator learns to match queries with human preferences by using domain-action mappings that capture subjective evaluation criteria, ensuring routing decisions align with real-world user needs. **Flexible Model Integration** The system supports seamlessly adding new models for routing without requiring retraining or architectural modifications, enabling dynamic adaptation to evolving model landscapes. @@ -209,15 +209,15 @@ The system supports seamlessly adding new models for routing without requiring r **Preference-Encoded Routing** Provides a practical mechanism to encode user preferences through domain-action mappings, offering transparent and controllable routing decisions that can be customized for specific use cases. -To support effective routing, Arch-Router introduces two key concepts: +To support effective routing, Plano-Orchestrator introduces two key concepts: - **Domain** – the high-level thematic category or subject matter of a request (e.g., legal, healthcare, programming). - **Action** – the specific type of operation the user wants performed (e.g., summarization, code generation, booking appointment, translation). -Both domain and action configs are associated with preferred models or model variants. At inference time, Arch-Router analyzes the incoming prompt to infer its domain and action using semantic similarity, task indicators, and contextual cues. It then applies the user-defined routing preferences to select the model best suited to handle the request. +Both domain and action configs are associated with preferred models or model variants. At inference time, Plano-Orchestrator analyzes the incoming prompt to infer its domain and action using semantic similarity, task indicators, and contextual cues. It then applies the user-defined routing preferences to select the model best suited to handle the request. 
-In summary, Arch-Router demonstrates: +In summary, Plano-Orchestrator demonstrates: - **Structured Preference Routing**: Aligns prompt request with model strengths using explicit domain–action mappings. @@ -228,10 +228,10 @@ In summary, Arch-Router demonstrates: - **Production-Ready Performance**: Optimized for low-latency, high-throughput applications in multi-model environments. -Self-hosting Arch-Router ------------------------- +Self-hosting Plano-Orchestrator +------------------------------- -By default, Plano uses a hosted Arch-Router endpoint. To run Arch-Router locally, you can serve the model yourself using either **Ollama** or **vLLM**. +By default, Plano uses a hosted Plano-Orchestrator endpoint. To run Plano-Orchestrator locally, you can serve the model yourself using either **Ollama** or **vLLM**. Using Ollama (recommended for local development) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -240,7 +240,7 @@ Using Ollama (recommended for local development) Download and install from `ollama.ai `_. -2. **Pull and serve Arch-Router** +2. **Pull and serve the routing model** .. code-block:: bash @@ -249,7 +249,7 @@ Using Ollama (recommended for local development) This downloads the quantized GGUF model from HuggingFace and starts serving on ``http://localhost:11434``. -3. **Configure Plano to use local Arch-Router** +3. **Configure Plano to use local routing model** .. code-block:: yaml @@ -313,7 +313,7 @@ vLLM provides higher throughput and GPU optimizations suitable for production de --load-format gguf \ --chat-template ${SNAPSHOT_DIR}template.jinja \ --tokenizer katanemo/Arch-Router-1.5B \ - --served-model-name Arch-Router \ + --served-model-name Plano-Orchestrator \ --gpu-memory-utilization 0.3 \ --tensor-parallel-size 1 \ --enable-prefix-caching @@ -323,10 +323,10 @@ vLLM provides higher throughput and GPU optimizations suitable for production de .. 
code-block:: yaml overrides: - llm_routing_model: plano/Arch-Router + llm_routing_model: plano/Plano-Orchestrator model_providers: - - model: plano/Arch-Router + - model: plano/Plano-Orchestrator base_url: http://:10000 - model: openai/gpt-5.2 @@ -350,14 +350,14 @@ vLLM provides higher throughput and GPU optimizations suitable for production de Using vLLM on Kubernetes (GPU nodes) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For teams running Kubernetes, Arch-Router and Plano can be deployed as in-cluster services. +For teams running Kubernetes, Plano-Orchestrator and Plano can be deployed as in-cluster services. The ``demos/llm_routing/model_routing_service/`` directory includes ready-to-use manifests: -- ``vllm-deployment.yaml`` — Arch-Router served by vLLM, with an init container to download +- ``vllm-deployment.yaml`` — Plano-Orchestrator served by vLLM, with an init container to download the model from HuggingFace -- ``plano-deployment.yaml`` — Plano proxy configured to use the in-cluster Arch-Router +- ``plano-deployment.yaml`` — Plano proxy configured to use the in-cluster Plano-Orchestrator - ``config_k8s.yaml`` — Plano config with ``llm_routing_model`` pointing at - ``http://arch-router:10000`` instead of the default hosted endpoint + ``http://plano-orchestrator:10000`` instead of the default hosted endpoint Key things to know before deploying: @@ -504,7 +504,7 @@ This configuration allows clients to: Example Use Cases ----------------- -Here are common scenarios where Arch-Router excels: +Here are common scenarios where Plano-Orchestrator excels: - **Coding Tasks**: Distinguish between code generation requests ("write a Python function"), debugging needs ("fix this error"), and code optimization ("make this faster"), routing each to appropriately specialized models. 
@@ -545,10 +545,10 @@ Best practices Unsupported Features -------------------- -The following features are **not supported** by the Arch-Router model: +The following features are **not supported** by the Plano-Orchestrator routing model: - **Multi-modality**: The model is not trained to process raw image or audio inputs. It can handle textual queries *about* these modalities (e.g., "generate an image of a cat"), but cannot interpret encoded multimedia data directly. -- **Function calling**: Arch-Router is designed for **semantic preference matching**, not exact intent classification or tool execution. For structured function invocation, use models in the Plano Function Calling collection instead. +- **Function calling**: Plano-Orchestrator is designed for **semantic preference matching**, not exact intent classification or tool execution. For structured function invocation, use models in the Plano Function Calling collection instead. -- **System prompt dependency**: Arch-Router routes based solely on the user’s conversation history. It does not use or rely on system prompts for routing decisions. +- **System prompt dependency**: Plano-Orchestrator routes based solely on the user’s conversation history. It does not use or rely on system prompts for routing decisions. diff --git a/docs/source/resources/includes/plano_config_full_reference.yaml b/docs/source/resources/includes/plano_config_full_reference.yaml index e9c89175..1d544727 100644 --- a/docs/source/resources/includes/plano_config_full_reference.yaml +++ b/docs/source/resources/includes/plano_config_full_reference.yaml @@ -34,7 +34,7 @@ model_providers: # routing_preferences: tags a model with named capabilities so Plano's LLM router # can select the best model for each request based on intent. Requires the - # Arch-Router model (or equivalent) to be configured in overrides.llm_routing_model. + # Plano-Orchestrator model (or equivalent) to be configured in overrides.llm_routing_model. 
# Each preference has a name (short label) and a description (used for intent matching). - model: groq/llama-3.3-70b-versatile access_key: $GROQ_API_KEY @@ -170,7 +170,7 @@ overrides: # Path to the trusted CA bundle for upstream TLS verification upstream_tls_ca_path: /etc/ssl/certs/ca-certificates.crt # Model used for intent-based LLM routing (must be listed in model_providers) - llm_routing_model: Arch-Router + llm_routing_model: Plano-Orchestrator # Model used for agent orchestration (must be listed in model_providers) agent_orchestration_model: Plano-Orchestrator diff --git a/docs/source/resources/includes/plano_config_full_reference_rendered.yaml b/docs/source/resources/includes/plano_config_full_reference_rendered.yaml index 8b1fb26b..4992ce3b 100644 --- a/docs/source/resources/includes/plano_config_full_reference_rendered.yaml +++ b/docs/source/resources/includes/plano_config_full_reference_rendered.yaml @@ -157,8 +157,8 @@ model_providers: protocol: https provider_interface: openai - internal: true - model: Arch-Router - name: arch-router + model: Plano-Orchestrator + name: plano-orchestrator provider_interface: plano - internal: true model: Arch-Function @@ -170,7 +170,7 @@ model_providers: provider_interface: plano overrides: agent_orchestration_model: Plano-Orchestrator - llm_routing_model: Arch-Router + llm_routing_model: Plano-Orchestrator optimize_context_window: true prompt_target_intent_matching_threshold: 0.7 upstream_connect_timeout: 10s From d39d7ddd1c97fd7384592f441b836a227f995e38 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Wed, 15 Apr 2026 16:49:50 -0700 Subject: [PATCH 02/16] release 0.4.19 (#887) --- .github/workflows/ci.yml | 4 ++-- apps/www/src/components/Hero.tsx | 2 +- build_filter_image.sh | 2 +- cli/planoai/__init__.py | 2 +- cli/planoai/consts.py | 2 +- cli/pyproject.toml | 2 +- demos/llm_routing/preference_based_routing/README.md | 2 +- docs/source/conf.py | 2 +- docs/source/get_started/quickstart.rst | 4 ++-- 
docs/source/resources/deployment.rst | 4 ++-- 10 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 60963ac1..9e8d3223 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -133,13 +133,13 @@ jobs: load: true tags: | ${{ env.PLANO_DOCKER_IMAGE }} - ${{ env.DOCKER_IMAGE }}:0.4.18 + ${{ env.DOCKER_IMAGE }}:0.4.19 ${{ env.DOCKER_IMAGE }}:latest cache-from: type=gha cache-to: type=gha,mode=max - name: Save image as artifact - run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.18 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar + run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.19 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar - name: Upload image artifact uses: actions/upload-artifact@v6 diff --git a/apps/www/src/components/Hero.tsx b/apps/www/src/components/Hero.tsx index 4b2684a0..05e615b9 100644 --- a/apps/www/src/components/Hero.tsx +++ b/apps/www/src/components/Hero.tsx @@ -24,7 +24,7 @@ export function Hero() { >
- v0.4.18 + v0.4.19 — diff --git a/build_filter_image.sh b/build_filter_image.sh index 01e3201e..73e51b61 100644 --- a/build_filter_image.sh +++ b/build_filter_image.sh @@ -1 +1 @@ -docker build -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.18 +docker build -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.19 diff --git a/cli/planoai/__init__.py b/cli/planoai/__init__.py index 0f408a67..2492d40c 100644 --- a/cli/planoai/__init__.py +++ b/cli/planoai/__init__.py @@ -1,3 +1,3 @@ """Plano CLI - Intelligent Prompt Gateway.""" -__version__ = "0.4.18" +__version__ = "0.4.19" diff --git a/cli/planoai/consts.py b/cli/planoai/consts.py index fa39fecb..af76d7cf 100644 --- a/cli/planoai/consts.py +++ b/cli/planoai/consts.py @@ -5,7 +5,7 @@ PLANO_COLOR = "#969FF4" SERVICE_NAME_ARCHGW = "plano" PLANO_DOCKER_NAME = "plano" -PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.18") +PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.19") DEFAULT_OTEL_TRACING_GRPC_ENDPOINT = "http://localhost:4317" # Native mode constants diff --git a/cli/pyproject.toml b/cli/pyproject.toml index b6ea8178..1864a915 100644 --- a/cli/pyproject.toml +++ b/cli/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "planoai" -version = "0.4.18" +version = "0.4.19" description = "Python-based CLI tool to manage Plano." authors = [{name = "Katanemo Labs, Inc."}] readme = "README.md" diff --git a/demos/llm_routing/preference_based_routing/README.md b/demos/llm_routing/preference_based_routing/README.md index 533e4906..f04fcf06 100644 --- a/demos/llm_routing/preference_based_routing/README.md +++ b/demos/llm_routing/preference_based_routing/README.md @@ -3,7 +3,7 @@ This demo shows how you can use user preferences to route user prompts to approp ## How to start the demo -Make sure you have Plano CLI installed (`pip install planoai==0.4.18` or `uv tool install planoai==0.4.18`). 
+Make sure you have Plano CLI installed (`pip install planoai==0.4.19` or `uv tool install planoai==0.4.19`). ```bash cd demos/llm_routing/preference_based_routing diff --git a/docs/source/conf.py b/docs/source/conf.py index 1d1eb66f..401f6cff 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -17,7 +17,7 @@ from sphinxawesome_theme.postprocess import Icons project = "Plano Docs" copyright = "2026, Katanemo Labs, a DigitalOcean Company" author = "Katanemo Labs, Inc" -release = " v0.4.18" +release = " v0.4.19" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/docs/source/get_started/quickstart.rst b/docs/source/get_started/quickstart.rst index f46f5e9f..6f1a86ac 100644 --- a/docs/source/get_started/quickstart.rst +++ b/docs/source/get_started/quickstart.rst @@ -43,7 +43,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins .. code-block:: console - $ uv tool install planoai==0.4.18 + $ uv tool install planoai==0.4.19 **Option 2: Install with pip (Traditional)** @@ -51,7 +51,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins $ python -m venv venv $ source venv/bin/activate # On Windows, use: venv\Scripts\activate - $ pip install planoai==0.4.18 + $ pip install planoai==0.4.19 .. 
_llm_routing_quickstart: diff --git a/docs/source/resources/deployment.rst b/docs/source/resources/deployment.rst index 1bcdee34..1aab49c9 100644 --- a/docs/source/resources/deployment.rst +++ b/docs/source/resources/deployment.rst @@ -65,7 +65,7 @@ Create a ``docker-compose.yml`` file with the following configuration: # docker-compose.yml services: plano: - image: katanemo/plano:0.4.18 + image: katanemo/plano:0.4.19 container_name: plano ports: - "10000:10000" # ingress (client -> plano) @@ -153,7 +153,7 @@ Create a ``plano-deployment.yaml``: spec: containers: - name: plano - image: katanemo/plano:0.4.18 + image: katanemo/plano:0.4.19 ports: - containerPort: 12000 # LLM gateway (chat completions, model routing) name: llm-gateway From 743d074184af534e39011ac1bd2c940590046fbe Mon Sep 17 00:00:00 2001 From: Musa Date: Thu, 16 Apr 2026 13:16:51 -0700 Subject: [PATCH 03/16] add Plano agent skills framework and rule set (#797) * feat: add initial documentation for Plano Agent Skills * feat: readme with examples * feat: add detailed skills documentation and examples for Plano --------- Co-authored-by: Adil Hafeez --- skills/AGENTS.md | 2109 +++++++++++++++++ skills/README.md | 243 ++ skills/metadata.json | 8 + skills/package-lock.json | 594 +++++ skills/package.json | 31 + skills/plano-advanced-patterns/SKILL.md | 32 + skills/plano-agent-orchestration/SKILL.md | 32 + skills/plano-agent-skills/SKILL.md | 53 + skills/plano-cli-operations/SKILL.md | 34 + skills/plano-config-fundamentals/SKILL.md | 34 + skills/plano-deployment-security/SKILL.md | 33 + skills/plano-filter-guardrails/SKILL.md | 33 + skills/plano-observability-debugging/SKILL.md | 33 + skills/plano-routing-model-selection/SKILL.md | 34 + skills/rules/_sections.md | 16 + skills/rules/_template.md | 26 + skills/rules/advanced-multi-listener.md | 139 ++ skills/rules/advanced-prompt-targets.md | 128 + skills/rules/agent-descriptions.md | 75 + skills/rules/agent-orchestration.md | 88 + skills/rules/cli-agent.md | 86 
+ skills/rules/cli-generate.md | 91 + skills/rules/cli-init.md | 66 + skills/rules/cli-startup.md | 80 + skills/rules/config-listeners.md | 64 + skills/rules/config-providers.md | 64 + skills/rules/config-secrets.md | 72 + skills/rules/config-version.md | 44 + skills/rules/deploy-docker.md | 80 + skills/rules/deploy-health.md | 90 + skills/rules/deploy-state.md | 85 + skills/rules/filter-guardrails.md | 81 + skills/rules/filter-mcp.md | 59 + skills/rules/filter-ordering.md | 78 + skills/rules/observe-span-attributes.md | 80 + skills/rules/observe-trace-query.md | 85 + skills/rules/observe-tracing.md | 80 + skills/rules/routing-aliases.md | 77 + skills/rules/routing-default.md | 70 + skills/rules/routing-passthrough.md | 69 + skills/rules/routing-preferences.md | 73 + skills/src/build.ts | 262 ++ skills/src/extract-tests.ts | 147 ++ skills/src/validate.ts | 156 ++ skills/test-cases.json | 353 +++ skills/tsconfig.json | 15 + 46 files changed, 6282 insertions(+) create mode 100644 skills/AGENTS.md create mode 100644 skills/README.md create mode 100644 skills/metadata.json create mode 100644 skills/package-lock.json create mode 100644 skills/package.json create mode 100644 skills/plano-advanced-patterns/SKILL.md create mode 100644 skills/plano-agent-orchestration/SKILL.md create mode 100644 skills/plano-agent-skills/SKILL.md create mode 100644 skills/plano-cli-operations/SKILL.md create mode 100644 skills/plano-config-fundamentals/SKILL.md create mode 100644 skills/plano-deployment-security/SKILL.md create mode 100644 skills/plano-filter-guardrails/SKILL.md create mode 100644 skills/plano-observability-debugging/SKILL.md create mode 100644 skills/plano-routing-model-selection/SKILL.md create mode 100644 skills/rules/_sections.md create mode 100644 skills/rules/_template.md create mode 100644 skills/rules/advanced-multi-listener.md create mode 100644 skills/rules/advanced-prompt-targets.md create mode 100644 skills/rules/agent-descriptions.md create mode 100644 
skills/rules/agent-orchestration.md create mode 100644 skills/rules/cli-agent.md create mode 100644 skills/rules/cli-generate.md create mode 100644 skills/rules/cli-init.md create mode 100644 skills/rules/cli-startup.md create mode 100644 skills/rules/config-listeners.md create mode 100644 skills/rules/config-providers.md create mode 100644 skills/rules/config-secrets.md create mode 100644 skills/rules/config-version.md create mode 100644 skills/rules/deploy-docker.md create mode 100644 skills/rules/deploy-health.md create mode 100644 skills/rules/deploy-state.md create mode 100644 skills/rules/filter-guardrails.md create mode 100644 skills/rules/filter-mcp.md create mode 100644 skills/rules/filter-ordering.md create mode 100644 skills/rules/observe-span-attributes.md create mode 100644 skills/rules/observe-trace-query.md create mode 100644 skills/rules/observe-tracing.md create mode 100644 skills/rules/routing-aliases.md create mode 100644 skills/rules/routing-default.md create mode 100644 skills/rules/routing-passthrough.md create mode 100644 skills/rules/routing-preferences.md create mode 100644 skills/src/build.ts create mode 100644 skills/src/extract-tests.ts create mode 100644 skills/src/validate.ts create mode 100644 skills/test-cases.json create mode 100644 skills/tsconfig.json diff --git a/skills/AGENTS.md b/skills/AGENTS.md new file mode 100644 index 00000000..61fd7228 --- /dev/null +++ b/skills/AGENTS.md @@ -0,0 +1,2109 @@ +# Plano Agent Skills + +> Best practices for building agents and agentic applications with Plano — the AI-native proxy and dataplane. Covers configuration, routing, agent orchestration, filter chains, observability, CLI operations, and deployment patterns. 
+ +**Version:** 1.0.0 | **Organization:** Plano + +--- + +## Table of Contents + +- [Section 1: Configuration Fundamentals](#section-1) + - [1.1 Always Specify a Supported Config Version](#always-specify-a-supported-config-version) + - [1.2 Choose the Right Listener Type for Your Use Case](#choose-the-right-listener-type-for-your-use-case) + - [1.3 Register Model Providers with Correct Format Identifiers](#register-model-providers-with-correct-format-identifiers) + - [1.4 Use Environment Variable Substitution for All Secrets](#use-environment-variable-substitution-for-all-secrets) +- [Section 2: Routing & Model Selection](#section-2) + - [2.1 Always Set Exactly One Default Model Provider](#always-set-exactly-one-default-model-provider) + - [2.2 Use Model Aliases for Semantic, Stable Model References](#use-model-aliases-for-semantic-stable-model-references) + - [2.3 Use Passthrough Auth for Proxy and Multi-Tenant Setups](#use-passthrough-auth-for-proxy-and-multi-tenant-setups) + - [2.4 Write Task-Specific Routing Preference Descriptions](#write-task-specific-routing-preference-descriptions) +- [Section 3: Agent Orchestration](#section-3) + - [3.1 Register All Sub-Agents in Both `agents` and `listeners.agents`](#register-all-sub-agents-in-both-agents-and-listenersagents) + - [3.2 Write Capability-Focused Agent Descriptions for Accurate Routing](#write-capability-focused-agent-descriptions-for-accurate-routing) +- [Section 4: Filter Chains & Guardrails](#section-4) + - [4.1 Configure MCP Filters with Explicit Type and Transport](#configure-mcp-filters-with-explicit-type-and-transport) + - [4.2 Configure Prompt Guards with Actionable Rejection Messages](#configure-prompt-guards-with-actionable-rejection-messages) + - [4.3 Order Filter Chains with Guards First, Enrichment Last](#order-filter-chains-with-guards-first-enrichment-last) +- [Section 5: Observability & Debugging](#section-5) + - [5.1 Add Custom Span Attributes for Correlation and 
Filtering](#add-custom-span-attributes-for-correlation-and-filtering) + - [5.2 Enable Tracing with Appropriate Sampling for Your Environment](#enable-tracing-with-appropriate-sampling-for-your-environment) + - [5.3 Use `planoai trace` to Inspect Routing Decisions](#use-planoai-trace-to-inspect-routing-decisions) +- [Section 6: CLI Operations](#section-6) + - [6.1 Follow the `planoai up` Validation Workflow Before Debugging Runtime Issues](#follow-the-planoai-up-validation-workflow-before-debugging-runtime-issues) + - [6.2 Generate Prompt Targets from Python Functions with `planoai generate_prompt_targets`](#generate-prompt-targets-from-python-functions-with-planoai-generateprompttargets) + - [6.3 Use `planoai cli_agent` to Connect Claude Code Through Plano](#use-planoai-cliagent-to-connect-claude-code-through-plano) + - [6.4 Use `planoai init` Templates to Bootstrap New Projects Correctly](#use-planoai-init-templates-to-bootstrap-new-projects-correctly) +- [Section 7: Deployment & Security](#section-7) + - [7.1 Understand Plano's Docker Network Topology for Agent URL Configuration](#understand-planos-docker-network-topology-for-agent-url-configuration) + - [7.2 Use PostgreSQL State Storage for Multi-Turn Conversations in Production](#use-postgresql-state-storage-for-multi-turn-conversations-in-production) + - [7.3 Verify Listener Health Before Sending Requests](#verify-listener-health-before-sending-requests) +- [Section 8: Advanced Patterns](#section-8) + - [8.1 Combine Multiple Listener Types for Layered Agent Architectures](#combine-multiple-listener-types-for-layered-agent-architectures) + - [8.2 Design Prompt Targets with Precise Parameter Schemas](#design-prompt-targets-with-precise-parameter-schemas) + +--- + +## Section 1: Configuration Fundamentals + +*Core config.yaml structure, versioning, listener types, and provider setup — the entry point for every Plano deployment.* + +### 1.1 Always Specify a Supported Config Version + +**Impact:** `CRITICAL` — 
Plano rejects configs with missing or unsupported version fields — the version field gates all other validation +**Tags:** `config`, `versioning`, `validation` + +## Always Specify a Supported Config Version + +Every Plano `config.yaml` must include a `version` field at the top level. Plano validates configs against a versioned JSON schema — an unrecognized or missing version will cause `planoai up` to fail immediately with a schema validation error before the container starts. + +**Incorrect (missing or invalid version):** + +```yaml +# No version field — fails schema validation +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY +``` + +**Correct (explicit supported version):** + +```yaml +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true +``` + +Use the latest supported version unless you are targeting a specific deployed Plano image. Current supported versions: `v0.1`, `v0.1.0`, `0.1-beta`, `v0.2.0`, `v0.3.0`. Prefer `v0.3.0` for all new projects. + +Reference: https://github.com/katanemo/archgw/blob/main/config/plano_config_schema.yaml + +--- + +### 1.2 Choose the Right Listener Type for Your Use Case + +**Impact:** `CRITICAL` — The listener type determines the entire request processing pipeline — choosing the wrong type means features like prompt functions or agent routing are unavailable +**Tags:** `config`, `listeners`, `architecture`, `routing` + +## Choose the Right Listener Type for Your Use Case + +Plano supports three listener types, each serving a distinct purpose. `listeners` is the only required top-level array in a Plano config. Every listener needs at minimum a `type`, `name`, and `port`. 
+ +| Type | Use When | Key Feature | +|------|----------|-------------| +| `model` | You want an OpenAI-compatible LLM gateway | Routes to multiple LLM providers, supports model aliases and routing preferences | +| `prompt` | You want LLM-callable custom functions | Define `prompt_targets` that the LLM dispatches as function calls | +| `agent` | You want multi-agent orchestration | Routes user requests to specialized sub-agents by matching agent descriptions | + +**Incorrect (using `model` when agents need orchestration):** + +```yaml +version: v0.3.0 + +# Wrong: a model listener cannot route to backend agent services +listeners: + - type: model + name: main + port: 12000 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 +``` + +**Correct (use `agent` listener for multi-agent systems):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + - id: travel_agent + url: http://host.docker.internal:8002 + +listeners: + - type: agent + name: orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: Provides real-time weather, forecasts, and conditions for any city. + - id: travel_agent + description: Books flights, hotels, and travel itineraries. + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true +``` + +A single Plano instance can expose multiple listeners on different ports, each with a different type, to serve different clients simultaneously. + +Reference: https://github.com/katanemo/archgw + +--- + +### 1.3 Register Model Providers with Correct Format Identifiers + +**Impact:** `CRITICAL` — Incorrect provider format causes request translation failures — Plano must know the wire format each provider expects +**Tags:** `config`, `model-providers`, `llm`, `api-format` + +## Register Model Providers with Correct Format Identifiers + +Plano translates requests between its internal format and each provider's API. 
The `model` field uses `provider/model-name` syntax which determines both the upstream endpoint and the request/response translation layer. Some providers require an explicit `provider_interface` override. + +**Provider format reference:** + +| Model prefix | Wire format | Example | +|---|---|---| +| `openai/*` | OpenAI | `openai/gpt-4o` | +| `anthropic/*` | Anthropic | `anthropic/claude-sonnet-4-20250514` | +| `gemini/*` | Google Gemini | `gemini/gemini-2.0-flash` | +| `mistral/*` | Mistral | `mistral/mistral-large-latest` | +| `groq/*` | Groq | `groq/llama-3.3-70b-versatile` | +| `deepseek/*` | DeepSeek | `deepseek/deepseek-chat` | +| `xai/*` | Grok (OpenAI-compat) | `xai/grok-2` | +| `together_ai/*` | Together.ai | `together_ai/meta-llama/Llama-3` | +| `custom/*` | Requires `provider_interface` | `custom/my-local-model` | + +**Incorrect (missing provider prefix, ambiguous format):** + +```yaml +model_providers: + - model: gpt-4o # Missing openai/ prefix — Plano cannot route this + access_key: $OPENAI_API_KEY + + - model: claude-3-5-sonnet # Missing anthropic/ prefix + access_key: $ANTHROPIC_API_KEY +``` + +**Correct (explicit provider prefixes):** + +```yaml +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + + - model: gemini/gemini-2.0-flash + access_key: $GOOGLE_API_KEY +``` + +**For local or self-hosted models (Ollama, LiteLLM, vLLM):** + +```yaml +model_providers: + - model: custom/llama3 + base_url: http://host.docker.internal:11434/v1 # Ollama endpoint + provider_interface: openai # Ollama speaks OpenAI format + default: true +``` + +Always set `default: true` on exactly one provider per listener so Plano has a fallback when routing preferences do not match. 
+ +Reference: https://github.com/katanemo/archgw + +--- + +### 1.4 Use Environment Variable Substitution for All Secrets + +**Impact:** `CRITICAL` — Hardcoded API keys in config.yaml will be committed to version control and exposed in Docker container inspect output +**Tags:** `config`, `security`, `secrets`, `api-keys`, `environment-variables` + +## Use Environment Variable Substitution for All Secrets + +Plano supports `$VAR_NAME` substitution in config values. This applies to `access_key` fields, `connection_string` for state storage, and `http_headers` in prompt targets and endpoints. Never hardcode credentials — Plano reads them from environment variables or a `.env` file at startup via `planoai up`. + +**Incorrect (hardcoded secrets):** + +```yaml +version: v0.3.0 + +model_providers: + - model: openai/gpt-4o + access_key: abcdefghijklmnopqrstuvwxyz... # Hardcoded — never do this + +state_storage: + type: postgres + connection_string: "postgresql://admin:mysecretpassword@prod-db:5432/plano" + +prompt_targets: + - name: get_data + endpoint: + name: my_api + http_headers: + Authorization: "Bearer abcdefghijklmnopqrstuvwxyz" # Hardcoded token +``` + +**Correct (environment variable substitution):** + +```yaml +version: v0.3.0 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + +state_storage: + type: postgres + connection_string: "postgresql://${DB_USER}:${DB_PASS}@${DB_HOST}:5432/${DB_NAME}" + +prompt_targets: + - name: get_data + endpoint: + name: my_api + http_headers: + Authorization: "Bearer $MY_API_TOKEN" +``` + +**`.env` file pattern (loaded automatically by `planoai up`):** + +```bash +# .env — add to .gitignore +OPENAI_API_KEY=abcdefghijklmnopqrstuvwxyz... +ANTHROPIC_API_KEY=abcdefghijklmnopqrstuvwxyz... +DB_USER=plano +DB_PASS=secure-password +DB_HOST=localhost +MY_API_TOKEN=abcdefghijklmnopqrstuvwxyz... 
+``` + +Plano also accepts keys set directly in the shell environment. Variables referenced in config but not found at startup cause `planoai up` to fail with a clear error listing the missing keys. + +Reference: https://github.com/katanemo/archgw + +--- + +## Section 2: Routing & Model Selection + +*Intelligent LLM routing using preferences, aliases, and defaults to match tasks to the best model.* + +### 2.1 Always Set Exactly One Default Model Provider + +**Impact:** `HIGH` — Without a default provider, Plano has no fallback when routing preferences do not match — requests with unclassified intent will fail +**Tags:** `routing`, `defaults`, `model-providers`, `reliability` + +## Always Set Exactly One Default Model Provider + +When a request does not match any routing preference, Plano forwards it to the `default: true` provider. Without a default, unmatched requests fail. If multiple providers are marked `default: true`, Plano uses the first one — which can produce unexpected behavior. 
+ +**Incorrect (no default provider set):** + +```yaml +version: v0.3.0 + +model_providers: + - model: openai/gpt-4o-mini # No default: true anywhere + access_key: $OPENAI_API_KEY + routing_preferences: + - name: summarization + description: Summarizing documents and extracting key points + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: code_generation + description: Writing new functions and implementing algorithms +``` + +**Incorrect (multiple defaults — ambiguous):** + +```yaml +model_providers: + - model: openai/gpt-4o-mini + default: true # First default + access_key: $OPENAI_API_KEY + + - model: openai/gpt-4o + default: true # Second default — confusing + access_key: $OPENAI_API_KEY +``` + +**Correct (exactly one default, covering unmatched requests):** + +```yaml +version: v0.3.0 + +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true # Handles general/unclassified requests + routing_preferences: + - name: summarization + description: Summarizing documents, articles, and meeting notes + - name: classification + description: Categorizing inputs, labeling, and intent detection + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: code_generation + description: Writing, debugging, and reviewing code + - name: complex_reasoning + description: Multi-step math, logical analysis, research synthesis +``` + +Choose your most cost-effective capable model as the default — it handles all traffic that doesn't match specialized preferences. 
+ +Reference: https://github.com/katanemo/archgw + +--- + +### 2.2 Use Model Aliases for Semantic, Stable Model References + +**Impact:** `MEDIUM` — Hardcoded model names in client code require code changes when you swap providers; aliases let you update routing in config.yaml alone +**Tags:** `routing`, `model-aliases`, `maintainability`, `client-integration` + +## Use Model Aliases for Semantic, Stable Model References + +`model_aliases` map human-readable names to specific model identifiers. Client applications reference the alias, not the underlying model. When you want to upgrade from `gpt-4o` to a new model, you change one line in `config.yaml` — not every client calling the API. + +**Incorrect (clients hardcode specific model names):** + +```yaml +# config.yaml — no aliases defined +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true +``` + +```python +# Client code — brittle, must be updated when model changes +client.chat.completions.create(model="gpt-4o", ...) +``` + +**Correct (semantic aliases, stable client contracts):** + +```yaml +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + +model_aliases: + plano.fast.v1: + target: gpt-4o-mini # Cheap, fast — for high-volume tasks + + plano.smart.v1: + target: gpt-4o # High capability — for complex reasoning + + plano.creative.v1: + target: claude-sonnet-4-20250514 # Strong creative writing and analysis + + plano.v1: + target: gpt-4o # Default production alias +``` + +```python +# Client code — stable, alias is the contract +client.chat.completions.create(model="plano.smart.v1", ...) 
+``` + +**Alias naming conventions:** +- `<org>.<capability>.<version>` — e.g., `plano.fast.v1`, `acme.code.v2` +- Bumping `.v2` → `.v3` lets you run old and new aliases simultaneously during rollouts +- `plano.v1` as a canonical default gives clients a single stable entry point + +Reference: https://github.com/katanemo/archgw + +--- + +### 2.3 Use Passthrough Auth for Proxy and Multi-Tenant Setups + +**Impact:** `MEDIUM` — Without passthrough auth, self-hosted proxy services (LiteLLM, vLLM, etc.) reject Plano's requests because the wrong Authorization header is sent +**Tags:** `routing`, `authentication`, `proxy`, `litellm`, `multi-tenant` + +## Use Passthrough Auth for Proxy and Multi-Tenant Setups + +When routing to a self-hosted LLM proxy (LiteLLM, vLLM, OpenRouter, Azure APIM) or in multi-tenant setups where clients supply their own keys, set `passthrough_auth: true`. This forwards the client's `Authorization` header rather than Plano's configured `access_key`. Combine with a `base_url` pointing to the proxy. 
+ +**Incorrect (Plano sends its own key to a proxy that expects the client's key):** + +```yaml +model_providers: + - model: custom/proxy + base_url: http://host.docker.internal:8000 + access_key: $SOME_KEY # Plano overwrites the client's auth — proxy rejects it +``` + +**Correct (forward client Authorization header to the proxy):** + +```yaml +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: custom/litellm-proxy + base_url: http://host.docker.internal:4000 # LiteLLM server + provider_interface: openai # LiteLLM uses OpenAI format + passthrough_auth: true # Forward client's Bearer token + default: true +``` + +**Multi-tenant pattern (client supplies their own API key):** + +```yaml +model_providers: + # Plano acts as a passthrough gateway; each client has their own OpenAI key + - model: openai/gpt-4o + passthrough_auth: true # No access_key here — client's key is forwarded + default: true +``` + +**Combined: proxy for some models, Plano-managed for others:** + +```yaml +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY # Plano manages this key + default: true + routing_preferences: + - name: quick tasks + description: Short answers, simple lookups, fast completions + + - model: custom/vllm-llama + base_url: http://gpu-server:8000 + provider_interface: openai + passthrough_auth: true # vLLM cluster handles its own auth + routing_preferences: + - name: long context + description: Processing very long documents, multi-document analysis +``` + +Reference: https://github.com/katanemo/archgw + +--- + +### 2.4 Write Task-Specific Routing Preference Descriptions + +**Impact:** `HIGH` — Vague preference descriptions cause Plano's internal router LLM to misclassify requests, routing expensive tasks to cheap models and vice versa +**Tags:** `routing`, `model-selection`, `preferences`, `llm-routing` + +## Write Task-Specific Routing Preference Descriptions + +Plano's 
`plano_orchestrator_v1` router uses a 1.5B preference-aligned LLM to classify incoming requests against your `routing_preferences` descriptions. It routes the request to the first provider whose preferences match. Description quality directly determines routing accuracy. + +**Incorrect (vague, overlapping descriptions):** + +```yaml +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + routing_preferences: + - name: simple + description: easy tasks # Too vague — what is "easy"? + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: hard + description: hard tasks # Too vague — overlaps with "easy" +``` + +**Correct (specific, distinct task descriptions):** + +```yaml +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + routing_preferences: + - name: summarization + description: > + Summarizing documents, articles, emails, or meeting transcripts. + Extracting key points, generating TL;DR sections, condensing long text. + - name: classification + description: > + Categorizing inputs, sentiment analysis, spam detection, + intent classification, labeling structured data fields. + - name: translation + description: > + Translating text between languages, localization tasks. + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: code_generation + description: > + Writing new functions, classes, or modules from scratch. + Implementing algorithms, boilerplate generation, API integrations. + - name: code_review + description: > + Reviewing code for bugs, security vulnerabilities, performance issues. + Suggesting refactors, explaining complex code, debugging errors. + - name: complex_reasoning + description: > + Multi-step math problems, logical deduction, strategic planning, + research synthesis requiring chain-of-thought reasoning. 
+``` + +**Key principles for good preference descriptions:** +- Use concrete action verbs: "writing", "reviewing", "translating", "summarizing" +- List 3–5 specific sub-tasks or synonyms for each preference +- Ensure preferences across providers are mutually exclusive in scope +- Test with representative queries using `planoai trace` and `--where` filters to verify routing decisions + +Reference: https://github.com/katanemo/archgw + +--- + +## Section 3: Agent Orchestration + +*Multi-agent patterns, agent descriptions, and orchestration strategies for building agentic applications.* + +### 3.1 Register All Sub-Agents in Both `agents` and `listeners.agents` + +**Impact:** `CRITICAL` — An agent registered only in `agents` but not referenced in a listener's agent list is unreachable; an agent listed in a listener but missing from `agents` causes a startup error +**Tags:** `agent`, `orchestration`, `config`, `multi-agent` + +## Register All Sub-Agents in Both `agents` and `listeners.agents` + +Plano's agent system has two separate concepts: the global `agents` array (defines the agent's ID and backend URL) and the `listeners[].agents` array (controls which agents are available to an orchestrator and provides their routing descriptions). Both must reference the same agent ID. + +**Incorrect (agent defined globally but not referenced in listener):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + - id: news_agent # Defined but never referenced in any listener + url: http://host.docker.internal:8002 + +listeners: + - type: agent + name: orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: Provides weather forecasts and current conditions. 
+ # news_agent is missing here — the orchestrator cannot route to it +``` + +**Incorrect (listener references an agent ID not in the global agents list):** + +```yaml +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + +listeners: + - type: agent + name: orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: Provides weather forecasts. + - id: flights_agent # ID not in global agents[] — startup error + description: Provides flight status information. +``` + +**Correct (every agent ID appears in both places):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + - id: flights_agent + url: http://host.docker.internal:8002 + - id: hotels_agent + url: http://host.docker.internal:8003 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + +listeners: + - type: agent + name: travel_orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: Real-time weather, forecasts, and climate data for any city. + - id: flights_agent + description: Live flight status, schedules, gates, and delays. + - id: hotels_agent + description: Hotel search, availability, pricing, and booking. + default: true # Fallback if no other agent matches +``` + +Set `default: true` on one agent in each listener's agents list to handle unmatched requests. The agent's URL in the global `agents` array is the HTTP endpoint Plano forwards matching requests to — it must be reachable from within the Docker container (use `host.docker.internal` for services on the host). 
+ +Reference: https://github.com/katanemo/archgw + +--- + +### 3.2 Write Capability-Focused Agent Descriptions for Accurate Routing + +**Impact:** `HIGH` — The orchestrator LLM routes requests purely by reading agent descriptions — poor descriptions cause misroutes to the wrong specialized agent +**Tags:** `agent`, `orchestration`, `descriptions`, `routing`, `multi-agent` + +## Write Capability-Focused Agent Descriptions for Accurate Routing + +In an `agent` listener, Plano's orchestrator reads each agent's `description` and routes user requests to the best-matching agent. This is LLM-based intent matching — the description is the entire specification the router sees. Write it as a capability manifest: what can this agent do, what data does it have access to, and what types of requests should it handle? + +**Incorrect (generic, overlapping descriptions):** + +```yaml +listeners: + - type: agent + name: orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: agent_1 + description: Helps users with information # Too generic — matches everything + + - id: agent_2 + description: Also helps users # Indistinguishable from agent_1 +``` + +**Correct (specific capabilities, distinct domains, concrete examples):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + - id: flight_agent + url: http://host.docker.internal:8002 + - id: hotel_agent + url: http://host.docker.internal:8003 + +listeners: + - type: agent + name: travel_orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: > + Provides real-time weather conditions and multi-day forecasts for any city + worldwide. Handles questions about temperature, precipitation, wind, humidity, + sunrise/sunset times, and severe weather alerts. Examples: "What's the weather + in Tokyo?", "Will it rain in London this weekend?", "Sunrise time in New York." 
+ + - id: flight_agent + description: > + Provides live flight status, schedules, gate information, delays, and + aircraft details for any flight number or route between airports. + Handles questions about departures, arrivals, and airline information. + Examples: "Is AA123 on time?", "Flights from JFK to LAX tomorrow." + + - id: hotel_agent + description: > + Searches and books hotel accommodations, compares room types, pricing, + and availability. Handles check-in/check-out dates, amenities, and + cancellation policies. Examples: "Hotels near Times Square for next Friday." +``` + +**Description writing checklist:** +- State the primary domain in the first sentence +- List 3–5 specific data types or question categories this agent handles +- Include 2–3 concrete example user queries in quotes +- Avoid capability overlap between agents — if they overlap, the router will split traffic unpredictably +- Keep descriptions under 150 words — the orchestrator reads all descriptions per request + +Reference: https://github.com/katanemo/archgw + +--- + +## Section 4: Filter Chains & Guardrails + +*Request/response processing pipelines — ordering, MCP integration, and safety guardrails.* + +### 4.1 Configure MCP Filters with Explicit Type and Transport + +**Impact:** `MEDIUM` — Omitting type and transport fields relies on defaults that may not match your MCP server's protocol implementation +**Tags:** `filter`, `mcp`, `integration`, `configuration` + +## Configure MCP Filters with Explicit Type and Transport + +Plano filters integrate with external services via MCP (Model Context Protocol) or plain HTTP. MCP filters call a specific tool on a remote MCP server. Always specify `type`, `transport`, and optionally `tool` (defaults to the filter `id`) to ensure Plano connects correctly to your filter implementation. 
+ +**Incorrect (minimal filter definition relying on all defaults):** + +```yaml +filters: + - id: my_guard # Plano infers type=mcp, transport=streamable-http, tool=my_guard + url: http://localhost:10500 + # If your MCP server uses a different tool name or transport, this silently misroutes +``` + +**Correct (explicit configuration for each filter):** + +```yaml +version: v0.3.0 + +filters: + - id: input_guards + url: http://host.docker.internal:10500 + type: mcp # Explicitly MCP protocol + transport: streamable-http # Streamable HTTP transport + tool: input_guards # MCP tool name (matches MCP server registration) + + - id: query_rewriter + url: http://host.docker.internal:10501 + type: mcp + transport: streamable-http + tool: rewrite_query # Tool name differs from filter ID — explicit is safer + + - id: custom_validator + url: http://host.docker.internal:10503 + type: http # Plain HTTP filter (not MCP) + # No tool field for HTTP filters +``` + +**MCP filter implementation contract:** +Your MCP server must expose a tool matching the `tool` name. 
The tool receives the request payload and must return either: +- A modified request (to pass through with changes) +- A rejection response (to short-circuit the pipeline) + +**HTTP filter alternative** — use `type: http` for simpler request/response interceptors that don't need the MCP protocol: + +```yaml +filters: + - id: auth_validator + url: http://host.docker.internal:9000/validate + type: http # Plano POSTs the request, expects the modified request back +``` + +Reference: https://github.com/katanemo/archgw + +--- + +### 4.2 Configure Prompt Guards with Actionable Rejection Messages + +**Impact:** `MEDIUM` — A generic or empty rejection message leaves users confused about why their request was blocked and unable to rephrase appropriately +**Tags:** `filter`, `guardrails`, `jailbreak`, `security`, `ux` + +## Configure Prompt Guards with Actionable Rejection Messages + +Plano has built-in `prompt_guards` for detecting jailbreak attempts. When triggered, Plano returns the `on_exception.message` instead of forwarding the request. Write messages that explain the restriction and suggest what the user can do instead — both for user experience and to reduce support burden. + +**Incorrect (no message configured — returns a generic error):** + +```yaml +version: v0.3.0 + +prompt_guards: + input_guards: + jailbreak: + on_exception: {} # Empty — returns unhelpful generic error +``` + +**Incorrect (cryptic technical message):** + +```yaml +prompt_guards: + input_guards: + jailbreak: + on_exception: + message: "Error code 403: guard triggered" # Unhelpful to the user +``` + +**Correct (clear, actionable, brand-appropriate message):** + +```yaml +version: v0.3.0 + +prompt_guards: + input_guards: + jailbreak: + on_exception: + message: > + I'm not able to help with that request. This assistant is designed + to help with [your use case, e.g., customer support, coding questions]. 
+ Please rephrase your question or contact support@yourdomain.com + if you believe this is an error. +``` + +**Combining prompt_guards with MCP filter guardrails:** + +```yaml +# Built-in jailbreak detection (fast, no external service needed) +prompt_guards: + input_guards: + jailbreak: + on_exception: + message: "This request cannot be processed. Please ask about our products and services." + +# MCP-based custom guards for additional policy enforcement +filters: + - id: topic_restriction + url: http://host.docker.internal:10500 + type: mcp + transport: streamable-http + tool: topic_restriction # Custom filter for domain-specific restrictions + +listeners: + - type: agent + name: customer_support + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: support_agent + description: Customer support assistant for product questions and order issues. + filter_chain: + - topic_restriction # Additional custom topic filtering +``` + +`prompt_guards` applies globally to all listeners. Use `filter_chain` on individual agents for per-agent policies. + +Reference: https://github.com/katanemo/archgw + +--- + +### 4.3 Order Filter Chains with Guards First, Enrichment Last + +**Impact:** `HIGH` — Running context builders before input guards means jailbreak attempts get RAG-enriched context before being blocked — wasting compute and risking data exposure +**Tags:** `filter`, `guardrails`, `security`, `pipeline`, `ordering` + +## Order Filter Chains with Guards First, Enrichment Last + +A `filter_chain` is an ordered list of filter IDs applied sequentially to each request. The order is semantically meaningful: each filter receives the output of the previous one. Safety and validation filters must run first to short-circuit bad requests before expensive enrichment filters process them. + +**Recommended filter chain order:** + +1. **Input guards** — jailbreak detection, PII detection, topic restrictions (reject early) +2. 
**Query rewriting** — normalize or enhance the user query +3. **Context building** — RAG retrieval, tool lookup, knowledge injection (expensive) +4. **Output guards** — validate or sanitize LLM response before returning + +**Incorrect (context built before guards — wasteful and potentially unsafe):** + +```yaml +filters: + - id: context_builder + url: http://host.docker.internal:10502 # Runs expensive RAG retrieval first + - id: query_rewriter + url: http://host.docker.internal:10501 + - id: input_guards + url: http://host.docker.internal:10500 # Guards run last — jailbreak gets context + +listeners: + - type: agent + name: rag_orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: rag_agent + filter_chain: + - context_builder # Wrong: expensive enrichment before safety check + - query_rewriter + - input_guards +``` + +**Correct (guards block bad requests before any enrichment):** + +```yaml +version: v0.3.0 + +filters: + - id: input_guards + url: http://host.docker.internal:10500 + type: mcp + transport: streamable-http + - id: query_rewriter + url: http://host.docker.internal:10501 + type: mcp + transport: streamable-http + - id: context_builder + url: http://host.docker.internal:10502 + type: mcp + transport: streamable-http + +listeners: + - type: agent + name: rag_orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: rag_agent + description: Answers questions using internal knowledge base documents. + filter_chain: + - input_guards # 1. Block jailbreaks and policy violations + - query_rewriter # 2. Normalize the safe query + - context_builder # 3. Retrieve relevant context for the clean query +``` + +Different agents within the same listener can have different filter chains — a public-facing agent may need all guards while an internal admin agent may skip them. 
+ +Reference: https://github.com/katanemo/archgw + +--- + +## Section 5: Observability & Debugging + +*OpenTelemetry tracing, log levels, span attributes, and sampling for production visibility.* + +### 5.1 Add Custom Span Attributes for Correlation and Filtering + +**Impact:** `MEDIUM` — Without custom span attributes, traces cannot be filtered by user, session, or environment — making production debugging significantly harder +**Tags:** `observability`, `tracing`, `span-attributes`, `correlation` + +## Add Custom Span Attributes for Correlation and Filtering + +Plano can automatically extract HTTP request headers and attach them as span attributes, plus attach static key-value pairs to every span. This enables filtering traces by user, session, tenant, environment, or any other dimension that matters to your application. + +**Incorrect (no span attributes — traces are unfiltered blobs):** + +```yaml +tracing: + random_sampling: 20 + # No span_attributes — cannot filter by user, session, or environment +``` + +**Correct (rich span attributes for production correlation):** + +```yaml +version: v0.3.0 + +tracing: + random_sampling: 20 + trace_arch_internal: true + + span_attributes: + # Match all headers with this prefix, then map to span attributes by: + # 1) stripping the prefix and 2) converting hyphens to dots + header_prefixes: + - x-katanemo- + + # Static attributes added to every span from this Plano instance + static: + environment: production + service.name: plano-gateway + deployment.region: us-east-1 + service.version: "2.1.0" + team: platform-engineering +``` + +**Sending correlation headers from client code:** + +```python +import httpx + +response = httpx.post( + "http://localhost:12000/v1/chat/completions", + headers={ + "x-katanemo-request-id": "req_abc123", + "x-katanemo-user-id": "usr_12", + "x-katanemo-session-id": "sess_xyz456", + "x-katanemo-tenant-id": "acme-corp", + }, + json={"model": "plano.v1", "messages": [...]} +) +``` + +**Querying by 
custom attribute:** + +```bash +# Find all requests from a specific user +planoai trace --where user.id=usr_12 + +# Find all traces from production environment +planoai trace --where environment=production + +# Find traces from a specific tenant +planoai trace --where tenant.id=acme-corp +``` + +Header prefix matching is a prefix match. With `x-katanemo-`, these mappings apply: + +- `x-katanemo-user-id` -> `user.id` +- `x-katanemo-tenant-id` -> `tenant.id` +- `x-katanemo-request-id` -> `request.id` + +Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw) + +--- + +### 5.2 Enable Tracing with Appropriate Sampling for Your Environment + +**Impact:** `HIGH` — Without tracing enabled, debugging routing decisions, latency issues, and model selection is guesswork — traces are the primary observability primitive in Plano +**Tags:** `observability`, `tracing`, `opentelemetry`, `otel`, `debugging` + +## Enable Tracing with Appropriate Sampling for Your Environment + +Plano emits OpenTelemetry (OTEL) traces for every request, capturing routing decisions, LLM provider selection, filter chain execution, and response latency. Traces are the best tool for understanding why a request was routed to a particular model and debugging unexpected behavior. 
+ +**Incorrect (no tracing configured — flying blind in production):** + +```yaml +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + +# No tracing block — no visibility into routing, latency, or errors +``` + +**Correct (tracing enabled with environment-appropriate sampling):** + +```yaml +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + +tracing: + random_sampling: 100 # 100% for development/debugging + trace_arch_internal: true # Include Plano's internal routing spans +``` + +**Production configuration (sampled to control volume):** + +```yaml +tracing: + random_sampling: 10 # Sample 10% of requests in production + trace_arch_internal: false # Skip internal spans to reduce noise + span_attributes: + header_prefixes: + - x-katanemo- # Match all x-katanemo-* headers + static: + environment: production + service.name: my-plano-service + version: "1.0.0" +``` + +With `x-katanemo-` configured, Plano maps headers to attributes by stripping the prefix and converting hyphens to dots: + +- `x-katanemo-user-id` -> `user.id` +- `x-katanemo-session-id` -> `session.id` +- `x-katanemo-request-id` -> `request.id` + +**Starting the trace collector:** + +```bash +# Start Plano with built-in OTEL collector +planoai up config.yaml --with-tracing +``` + +Sampling rates: 100% for dev/staging, 5–20% for high-traffic production, 100% for low-traffic production. `trace_arch_internal: true` adds spans showing which routing preference matched — essential for debugging preference configuration. 
+ +Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw) + +--- + +### 5.3 Use `planoai trace` to Inspect Routing Decisions + +**Impact:** `MEDIUM-HIGH` — The trace CLI lets you verify which model was selected, why, and how long each step took — without setting up a full OTEL backend +**Tags:** `observability`, `tracing`, `cli`, `debugging`, `routing` + +## Use `planoai trace` to Inspect Routing Decisions + +`planoai trace` provides a built-in trace viewer backed by an in-memory OTEL collector. Use it to inspect routing decisions, verify preference matching, measure filter latency, and debug failed requests — all from the CLI without configuring Jaeger, Zipkin, or another backend. + +**Workflow: start collector, run requests, then inspect traces:** + +```bash +# 1. Start Plano with the built-in trace collector (recommended) +planoai up config.yaml --with-tracing + +# 2. Send test requests through Plano +curl http://localhost:12000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "plano.v1", "messages": [{"role": "user", "content": "Write a Python function to sort a list"}]}' + +# 3. 
Show the latest trace
+planoai trace
+```
+
+You can also run the trace listener directly:
+
+```bash
+planoai trace listen # runs the in-memory OTEL collector as its own process
+```
+
+Stop the background trace listener:
+
+```bash
+planoai trace down
+```
+
+**Useful trace viewer patterns:**
+
+```bash
+# Show latest trace (default target is "last")
+planoai trace
+
+# List available trace IDs
+planoai trace --list
+
+# Show all traces
+planoai trace any
+
+# Show a specific trace (short 8-char or full 32-char ID)
+planoai trace 7f4e9a1c
+planoai trace 7f4e9a1c0d9d4a0bb9bf5a8a7d13f62a
+
+# Filter by specific span attributes (AND semantics for repeated --where)
+planoai trace any --where llm.model=gpt-4o-mini
+
+# Filter by user ID (if header prefix is x-katanemo-, x-katanemo-user-id maps to user.id)
+planoai trace any --where user.id=user_123
+
+# Limit results for a quick sanity check
+planoai trace any --limit 5
+
+# Time window filter
+planoai trace any --since 30m
+
+# Filter displayed attributes by key pattern
+planoai trace any --filter "http.*"
+
+# Output machine-readable JSON
+planoai trace any --json
+```
+
+**What to look for in traces:**
+
+
+| Span name | What it tells you |
+| ------------------- | ------------------------------------------------------------- |
+| `plano.routing` | Which routing preference matched and which model was selected |
+| `plano.filter.<filter_id>` | How long each filter in the chain took |
+| `plano.llm.request` | Time to first token and full response time |
+| `plano.agent.route` | Which agent description matched for agent listeners |
+
+
+Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw)
+
+---
+
+## Section 6: CLI Operations
+
+*Using the planoai CLI for startup, tracing, CLI agents, project init, and code generation.*
+
+### 6.1 Follow the `planoai up` Validation Workflow Before Debugging Runtime Issues
+
+**Impact:** `HIGH` — `planoai up` validates config, checks API keys, and health-checks all 
listeners — skipping this diagnostic information leads to unnecessary debugging of container or network issues +**Tags:** `cli`, `startup`, `validation`, `debugging`, `workflow` + +## Follow the `planoai up` Validation Workflow Before Debugging Runtime Issues + +`planoai up` is the entry point for running Plano. It performs sequential checks before the container starts: schema validation, API key presence check, container startup, and health checks on all configured listener ports. Understanding what each failure stage means prevents chasing the wrong root cause. + +**Validation stages and failure signals:** + +``` +Stage 1: Schema validation → "config.yaml: invalid against schema" +Stage 2: API key check → "Missing required environment variables: OPENAI_API_KEY" +Stage 3: Container start → "Docker daemon not running" or image pull errors +Stage 4: Health check (/healthz) → "Listener not healthy after 120s" (timeout) +``` + +**Development startup workflow:** + +```bash +# Standard startup — config.yaml in current directory +planoai up + +# Explicit config file path +planoai up my-config.yaml + +# Start in foreground to see all logs immediately (great for debugging) +planoai up config.yaml --foreground + +# Start with built-in OTEL trace collector +planoai up config.yaml --with-tracing + +# Enable verbose logging for debugging routing decisions +LOG_LEVEL=debug planoai up config.yaml --foreground +``` + +**Checking what's running:** + +```bash +# Stream recent logs (last N lines, then exit) +planoai logs + +# Follow logs in real-time +planoai logs --follow + +# Include Envoy/gateway debug messages +planoai logs --debug --follow +``` + +**Stopping and restarting after config changes:** + +```bash +# Stop the current container +planoai down + +# Restart with updated config +planoai up config.yaml +``` + +**Common failure patterns:** + +```bash +# API key missing — check your .env file or shell environment +export OPENAI_API_KEY=sk-proj-... 
+planoai up config.yaml + +# Health check timeout — listener port may conflict +# Check if another process uses port 12000 +lsof -i :12000 + +# Container fails to start — verify Docker daemon is running +docker ps +``` + +`planoai down` fully stops and removes the Plano container. Always run `planoai down` before `planoai up` when changing config to avoid stale container state. + +Reference: https://github.com/katanemo/archgw + +--- + +### 6.2 Generate Prompt Targets from Python Functions with `planoai generate_prompt_targets` + +**Impact:** `MEDIUM` — Manually writing prompt_targets YAML for existing Python APIs is error-prone — the generator introspects function signatures and produces correct YAML automatically +**Tags:** `cli`, `generate`, `prompt-targets`, `python`, `code-generation` + +## Generate Prompt Targets from Python Functions with `planoai generate_prompt_targets` + +`planoai generate_prompt_targets` introspects Python function signatures and docstrings to generate `prompt_targets` YAML for your Plano config. This is the fastest way to expose existing Python APIs as LLM-callable functions without manually writing the YAML schema. + +**Python function requirements for generation:** +- Use simple type annotations: `int`, `float`, `bool`, `str`, `list`, `tuple`, `set`, `dict` +- Include a docstring describing what the function does (becomes the `description`) +- Complex Pydantic models must be flattened into primitive typed parameters first + +**Example Python file:** + +```python +# api.py + +def get_stock_quote(symbol: str, exchange: str = "NYSE") -> dict: + """Get the current stock price and trading data for a given stock symbol. + + Returns price, volume, market cap, and 24h change percentage. + """ + # Implementation calls stock API + pass + +def get_weather_forecast(city: str, days: int = 3, units: str = "celsius") -> dict: + """Get the weather forecast for a city. 
+ + Returns temperature, precipitation, and conditions for the specified number of days. + """ + pass + +def search_flights(origin: str, destination: str, date: str, passengers: int = 1) -> list: + """Search for available flights between two airports on a given date. + + Date format: YYYY-MM-DD. Returns list of flight options with prices. + """ + pass +``` + +**Running the generator:** + +```bash +planoai generate_prompt_targets --file api.py +``` + +**Generated output (add to your config.yaml):** + +```yaml +prompt_targets: + - name: get_stock_quote + description: Get the current stock price and trading data for a given stock symbol. + parameters: + - name: symbol + type: str + required: true + - name: exchange + type: str + required: false + default: NYSE + # Add endpoint manually: + endpoint: + name: stock_api + path: /quote?symbol={symbol}&exchange={exchange} + + - name: get_weather_forecast + description: Get the weather forecast for a city. + parameters: + - name: city + type: str + required: true + - name: days + type: int + required: false + default: 3 + - name: units + type: str + required: false + default: celsius + endpoint: + name: weather_api + path: /forecast?city={city}&days={days}&units={units} +``` + +After generation, manually add the `endpoint` blocks pointing to your actual API. The generator produces the schema; you wire in the connectivity. 
+ +Reference: https://github.com/katanemo/archgw + +--- + +### 6.3 Use `planoai cli_agent` to Connect Claude Code Through Plano + +**Impact:** `MEDIUM-HIGH` — Running Claude Code directly against provider APIs bypasses Plano's routing, observability, and guardrails — cli_agent routes all Claude Code traffic through your configured Plano instance +**Tags:** `cli`, `cli-agent`, `claude`, `coding-agent`, `integration` + +## Use `planoai cli_agent` to Connect Claude Code Through Plano + +`planoai cli_agent` starts a Claude Code session that routes all LLM traffic through your running Plano instance instead of directly to Anthropic. This gives you routing preferences, model aliases, tracing, and guardrails for your coding agent workflows — making Claude Code a first-class citizen of your Plano configuration. + +**Prerequisites:** + +```bash +# 1. Plano must be running with a model listener +planoai up config.yaml + +# 2. ANTHROPIC_API_KEY must be set (Claude Code uses it for auth) +export ANTHROPIC_API_KEY=sk-ant-... +``` + +**Starting the CLI agent:** + +```bash +# Start CLI agent using config.yaml in current directory +planoai cli_agent claude + +# Use a specific config file +planoai cli_agent claude config.yaml + +# Use a config in a different directory +planoai cli_agent claude --path /path/to/project +``` + +**Recommended config for Claude Code routing:** + +```yaml +version: v0.3.0 + +listeners: + - type: model + name: claude_code_router + port: 12000 + +model_providers: + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + default: true + routing_preferences: + - name: general coding + description: > + Writing code, debugging, code review, explaining concepts, + answering programming questions, general development tasks. 
+
+  - model: anthropic/claude-opus-4-6
+    access_key: $ANTHROPIC_API_KEY
+    routing_preferences:
+      - name: complex architecture
+        description: >
+          System design, complex refactoring across many files,
+          architectural decisions, performance optimization, security audits.
+
+model_aliases:
+  claude.fast.v1:
+    target: claude-sonnet-4-20250514
+  claude.smart.v1:
+    target: claude-opus-4-6
+
+tracing:
+  random_sampling: 100
+  trace_arch_internal: true
+
+overrides:
+  upstream_connect_timeout: "10s"
+```
+
+**What happens when cli_agent runs:**
+
+1. Reads your config.yaml to find the model listener port
+2. Configures Claude Code to use `http://localhost:<listener-port>` as its API endpoint
+3. Starts a Claude Code session in your terminal
+4. All Claude Code LLM calls flow through Plano — routing, tracing, and guardrails apply
+
+After your session, use `planoai trace` to inspect every LLM call Claude Code made, which model was selected, and why.
+
+Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw)
+
+---
+
+### 6.4 Use `planoai init` Templates to Bootstrap New Projects Correctly
+
+**Impact:** `MEDIUM` — Starting from a blank config.yaml leads to missing required fields and common structural mistakes — templates provide validated, idiomatic starting points
+**Tags:** `cli`, `init`, `templates`, `getting-started`, `project-setup`
+
+## Use `planoai init` Templates to Bootstrap New Projects Correctly
+
+`planoai init` generates a valid `config.yaml` from built-in templates. Each template demonstrates a specific Plano capability with correct structure, realistic examples, and comments. Use this instead of writing config from scratch — it ensures you start with a valid, working configuration. 
+ +**Available templates:** + +| Template ID | What It Demonstrates | Best For | +|---|---|---| +| `sub_agent_orchestration` | Multi-agent routing with specialized sub-agents | Building agentic applications | +| `coding_agent_routing` | Routing preferences + model aliases for coding workflows | Claude Code and coding assistants | +| `preference_aware_routing` | Automatic LLM routing based on task type | Multi-model cost optimization | +| `filter_chain_guardrails` | Input guards, query rewrite, context builder | RAG + safety pipelines | +| `conversational_state_v1_responses` | Stateful conversations with memory | Chatbots, multi-turn assistants | + +**Usage:** + +```bash +# Initialize with a template +planoai init --template sub_agent_orchestration + +# Initialize coding agent routing setup +planoai init --template coding_agent_routing + +# Initialize a RAG with guardrails project +planoai init --template filter_chain_guardrails +``` + +**Typical project setup workflow:** + +```bash +# 1. Create project directory +mkdir my-plano-agent && cd my-plano-agent + +# 2. Bootstrap with the closest matching template +planoai init --template preference_aware_routing + +# 3. Edit config.yaml to add your specific models, agents, and API keys +# (keys are already using $VAR substitution — just set your env vars) + +# 4. Create .env file for local development +cat > .env << EOF +OPENAI_API_KEY=sk-proj-... +ANTHROPIC_API_KEY=sk-ant-... +EOF + +echo ".env" >> .gitignore + +# 5. Start Plano +planoai up + +# 6. Test your configuration +curl http://localhost:12000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "gpt-4o", "messages": [{"role": "user", "content": "Hello"}]}' +``` + +Start with `preference_aware_routing` for most LLM gateway use cases and `sub_agent_orchestration` for multi-agent applications. Both can be combined after you understand each independently. 
+ +Reference: https://github.com/katanemo/archgw + +--- + +## Section 7: Deployment & Security + +*Docker deployment, environment variable management, health checks, and state storage for production.* + +### 7.1 Understand Plano's Docker Network Topology for Agent URL Configuration + +**Impact:** `HIGH` — Using `localhost` for agent URLs inside Docker always fails — Plano runs in a container and cannot reach host services via localhost +**Tags:** `deployment`, `docker`, `networking`, `agents`, `urls` + +## Understand Plano's Docker Network Topology for Agent URL Configuration + +Plano runs inside a Docker container managed by `planoai up`. Services running on your host machine (agent servers, filter servers, databases) are not accessible as `localhost` from inside the container. Use Docker's special hostname `host.docker.internal` to reach host services. + +**Docker network rules:** +- `localhost` / `127.0.0.1` inside the container → Plano's own container (not your host) +- `host.docker.internal` → Your host machine's loopback interface +- Container name or `docker network` hostname → Other Docker containers +- External domain / IP → Reachable if Docker has network access + +**Incorrect (using localhost — agent unreachable from inside container):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://localhost:8001 # Wrong: this is Plano's own container + + - id: flight_agent + url: http://127.0.0.1:8002 # Wrong: same issue + +filters: + - id: input_guards + url: http://localhost:10500 # Wrong: filter server unreachable +``` + +**Correct (using host.docker.internal for host-side services):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 # Correct: reaches host port 8001 + + - id: flight_agent + url: http://host.docker.internal:8002 # Correct: reaches host port 8002 + +filters: + - id: input_guards + url: http://host.docker.internal:10500 # Correct: reaches filter server on host + 
+endpoints: + internal_api: + endpoint: host.docker.internal # Correct for internal API on host + protocol: http +``` + +**Production deployment patterns:** + +```yaml +# Kubernetes / Docker Compose — use service names +agents: + - id: weather_agent + url: http://weather-service:8001 # Kubernetes service DNS + +# External cloud services — use full domain +agents: + - id: cloud_agent + url: https://my-agent.us-east-1.amazonaws.com/v1 + +# Custom TLS (self-signed or internal CA) +overrides: + upstream_tls_ca_path: /etc/ssl/certs/internal-ca.pem +``` + +**Ports exposed by Plano's container:** +- All `port` values from your `listeners` blocks are automatically mapped +- `9901` — Envoy admin interface (for advanced debugging) +- `12001` — Plano internal management API + +Reference: https://github.com/katanemo/archgw + +--- + +### 7.2 Use PostgreSQL State Storage for Multi-Turn Conversations in Production + +**Impact:** `HIGH` — The default in-memory state storage loses all conversation history when the container restarts — production multi-turn agents require persistent PostgreSQL storage +**Tags:** `deployment`, `state`, `postgres`, `memory`, `multi-turn`, `production` + +## Use PostgreSQL State Storage for Multi-Turn Conversations in Production + +`state_storage` enables Plano to maintain conversation context across requests. Without it, each request is stateless. The `memory` type works for development and testing — all state is lost on container restart. Use `postgres` for any production deployment where conversation continuity matters. + +**Incorrect (memory storage in production):** + +```yaml +version: v0.3.0 + +# Memory storage — all conversations lost on planoai down / container restart +state_storage: + type: memory + +listeners: + - type: agent + name: customer_support + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: support_agent + description: Customer support assistant with conversation history. 
+```
+
+**Correct (PostgreSQL for production persistence):**
+
+```yaml
+version: v0.3.0
+
+state_storage:
+  type: postgres
+  connection_string: "postgresql://${DB_USER}:${DB_PASS}@${DB_HOST}:5432/${DB_NAME}"
+
+listeners:
+  - type: agent
+    name: customer_support
+    port: 8000
+    router: plano_orchestrator_v1
+    agents:
+      - id: support_agent
+        description: Customer support assistant with access to full conversation history.
+
+model_providers:
+  - model: openai/gpt-4o
+    access_key: $OPENAI_API_KEY
+    default: true
+```
+
+**Setting up PostgreSQL for local development:**
+
+```bash
+# Start PostgreSQL with Docker
+docker run -d \
+  --name plano-postgres \
+  -e POSTGRES_USER=plano \
+  -e POSTGRES_PASSWORD=devpassword \
+  -e POSTGRES_DB=plano \
+  -p 5432:5432 \
+  postgres:16
+
+# Set environment variables
+export DB_USER=plano
+export DB_PASS=devpassword
+export DB_HOST=host.docker.internal # Use host.docker.internal from inside Plano container
+export DB_NAME=plano
+```
+
+**Production `.env` pattern:**
+
+```bash
+DB_USER=plano_prod
+DB_PASS=<load-from-your-secrets-manager>
+DB_HOST=your-rds-endpoint.amazonaws.com
+DB_NAME=plano
+```
+
+Plano automatically creates its state tables on first startup. The `connection_string` supports all standard PostgreSQL connection parameters including SSL: `postgresql://user:pass@host:5432/db?sslmode=require`.
+
+Reference: https://github.com/katanemo/archgw
+
+---
+
+### 7.3 Verify Listener Health Before Sending Requests
+
+**Impact:** `MEDIUM` — Sending requests to Plano before listeners are healthy results in connection refused errors that look like application bugs — always confirm health before testing
+**Tags:** `deployment`, `health-checks`, `readiness`, `debugging`
+
+## Verify Listener Health Before Sending Requests
+
+Each Plano listener exposes a `/healthz` HTTP endpoint. 
`planoai up` automatically health-checks all listeners during startup (120s timeout), but in CI/CD pipelines, custom scripts, or when troubleshooting, you may need to check health manually. + +**Health check endpoints:** + +```bash +# Check model listener health (port from your config) +curl -f http://localhost:12000/healthz +# Returns 200 OK when healthy + +# Check prompt listener +curl -f http://localhost:10000/healthz + +# Check agent listener +curl -f http://localhost:8000/healthz +``` + +**Polling health in scripts (CI/CD pattern):** + +```bash +#!/bin/bash +# wait-for-plano.sh + +LISTENER_PORT=${1:-12000} +MAX_WAIT=120 +INTERVAL=2 +elapsed=0 + +echo "Waiting for Plano listener on port $LISTENER_PORT..." + +until curl -sf "http://localhost:$LISTENER_PORT/healthz" > /dev/null; do + if [ $elapsed -ge $MAX_WAIT ]; then + echo "ERROR: Plano listener not healthy after ${MAX_WAIT}s" + planoai logs --debug + exit 1 + fi + sleep $INTERVAL + elapsed=$((elapsed + INTERVAL)) +done + +echo "Plano listener healthy after ${elapsed}s" +``` + +**Docker Compose health check:** + +```yaml +# docker-compose.yml for services that depend on Plano +services: + plano: + image: katanemo/plano:latest + # Plano is managed by planoai, not directly via compose in most setups + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:12000/healthz"] + interval: 5s + timeout: 3s + retries: 24 + start_period: 10s + + my-agent: + image: my-agent:latest + depends_on: + plano: + condition: service_healthy +``` + +**Debug unhealthy listeners:** + +```bash +# See startup logs +planoai logs --debug + +# Check if port is already in use +lsof -i :12000 + +# Check container status +docker ps -a --filter name=plano + +# Restart from scratch +planoai down && planoai up config.yaml --foreground +``` + +Reference: https://github.com/katanemo/archgw + +--- + +## Section 8: Advanced Patterns + +*Prompt targets, external API integration, rate limiting, and multi-listener architectures.* + +### 8.1 
Combine Multiple Listener Types for Layered Agent Architectures + +**Impact:** `MEDIUM` — Using a single listener type forces all traffic through one gateway pattern — combining types lets you serve different clients with the right interface without running multiple Plano instances +**Tags:** `advanced`, `multi-listener`, `architecture`, `agent`, `model`, `prompt` + +## Combine Multiple Listener Types for Layered Agent Architectures + +A single Plano `config.yaml` can define multiple listeners of different types, each on a separate port. This lets you serve different client types simultaneously: an OpenAI-compatible model gateway for direct API clients, a prompt gateway for LLM-callable function applications, and an agent orchestrator for multi-agent workflows — all from one Plano instance sharing the same model providers. + +**Single listener (limited — forces all clients through one interface):** + +```yaml +version: v0.3.0 + +listeners: + - type: model # Only model clients can use this + name: model_gateway + port: 12000 + +# Prompt target clients and agent clients cannot connect +``` + +**Multi-listener architecture (serves all client types):** + +```yaml +version: v0.3.0 + +# --- Shared model providers --- +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + routing_preferences: + - name: quick tasks + description: Short answers, formatting, classification, simple generation + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: complex reasoning + description: Multi-step analysis, code generation, research synthesis + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + routing_preferences: + - name: long documents + description: Summarizing or analyzing very long documents, PDFs, transcripts + +# --- Listener 1: OpenAI-compatible API gateway --- +# For: SDK clients, Claude Code, LangChain, etc. 
+listeners: + - type: model + name: model_gateway + port: 12000 + timeout: "120s" + +# --- Listener 2: Prompt function gateway --- +# For: Applications that expose LLM-callable APIs + - type: prompt + name: function_gateway + port: 10000 + timeout: "60s" + +# --- Listener 3: Agent orchestration gateway --- +# For: Multi-agent application clients + - type: agent + name: agent_orchestrator + port: 8000 + timeout: "90s" + router: plano_orchestrator_v1 + agents: + - id: research_agent + description: Searches, synthesizes, and summarizes information from multiple sources. + filter_chain: + - input_guards + - context_builder + - id: code_agent + description: Writes, reviews, debugs, and explains code across all languages. + default: true + +# --- Agents --- +agents: + - id: research_agent + url: http://host.docker.internal:8001 + - id: code_agent + url: http://host.docker.internal:8002 + +# --- Filters --- +filters: + - id: input_guards + url: http://host.docker.internal:10500 + type: mcp + transport: streamable-http + - id: context_builder + url: http://host.docker.internal:10501 + type: mcp + transport: streamable-http + +# --- Prompt targets (for function gateway) --- +endpoints: + internal_api: + endpoint: host.docker.internal + protocol: http + +prompt_targets: + - name: search_knowledge_base + description: Search the internal knowledge base for relevant documents and facts. 
+ parameters: + - name: query + type: str + required: true + description: Search query to find relevant information + endpoint: + name: internal_api + path: /kb/search?q={query} + http_method: GET + +# --- Observability --- +model_aliases: + plano.fast.v1: + target: gpt-4o-mini + plano.smart.v1: + target: gpt-4o + +tracing: + random_sampling: 50 + trace_arch_internal: true + span_attributes: + static: + environment: production + header_prefixes: + - x-katanemo- +``` + +This architecture serves: SDK clients on `:12000`, function-calling apps on `:10000`, and multi-agent orchestration on `:8000` — with shared cost-optimized routing across all three. + +Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw) + +--- + +### 8.2 Design Prompt Targets with Precise Parameter Schemas + +**Impact:** `HIGH` — Imprecise parameter definitions cause the LLM to hallucinate values, skip required fields, or produce malformed API calls — the schema is the contract between the LLM and your API +**Tags:** `advanced`, `prompt-targets`, `functions`, `llm`, `api-integration` + +## Design Prompt Targets with Precise Parameter Schemas + +`prompt_targets` define functions that Plano's LLM can call autonomously when it determines a user request matches the function's description. The parameter schema tells the LLM exactly what values to extract from user input — vague schemas lead to hallucinated parameters and failed API calls. + +**Incorrect (too few constraints — LLM must guess):** + +```yaml +prompt_targets: + - name: get_flight_info + description: Get flight information + parameters: + - name: flight # What format? "AA123"? "AA 123"? "American 123"? 
+ type: str + required: true + endpoint: + name: flights_api + path: /flight?id={flight} +``` + +**Correct (fully specified schema with descriptions, formats, and enums):** + +```yaml +version: v0.3.0 + +endpoints: + flights_api: + endpoint: api.flightaware.com + protocol: https + connect_timeout: "5s" + +prompt_targets: + - name: get_flight_status + description: > + Get real-time status, gate information, and delays for a specific flight number. + Use when the user asks about a flight's current status, arrival time, or gate. + parameters: + - name: flight_number + description: > + IATA airline code followed by flight number, e.g., "AA123", "UA456", "DL789". + Extract from user message — do not include spaces. + type: str + required: true + format: "^[A-Z]{2}[0-9]{1,4}$" # Regex hint for validation + + - name: date + description: > + Flight date in YYYY-MM-DD format. Use today's date if not specified. + type: str + required: false + format: date + + endpoint: + name: flights_api + path: /flights/{flight_number}?date={date} + http_method: GET + http_headers: + Authorization: "Bearer $FLIGHTAWARE_API_KEY" + + - name: search_flights + description: > + Search for available flights between two cities or airports. + Use when the user wants to find flights, compare options, or book travel. 
+ parameters: + - name: origin + description: Departure airport IATA code (e.g., "JFK", "LAX", "ORD") + type: str + required: true + - name: destination + description: Arrival airport IATA code (e.g., "LHR", "CDG", "NRT") + type: str + required: true + - name: departure_date + description: Departure date in YYYY-MM-DD format + type: str + required: true + format: date + - name: cabin_class + description: Preferred cabin class + type: str + required: false + default: economy + enum: [economy, premium_economy, business, first] + - name: passengers + description: Number of adult passengers (1-9) + type: int + required: false + default: 1 + + endpoint: + name: flights_api + path: /search?from={origin}&to={destination}&date={departure_date}&class={cabin_class}&pax={passengers} + http_method: GET + http_headers: + Authorization: "Bearer $FLIGHTAWARE_API_KEY" + + system_prompt: | + You are a travel assistant. Present flight search results clearly, + highlighting the best value options. Include price, duration, and + number of stops for each option. 
+ +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + +listeners: + - type: prompt + name: travel_functions + port: 10000 + timeout: "30s" +``` + +**Key principles:** +- `description` on the target tells the LLM when to call it — be specific about trigger conditions +- `description` on each parameter tells the LLM what value to extract — include format examples +- Use `enum` to constrain categorical values — prevents the LLM from inventing categories +- Use `format: date` or regex patterns to hint at expected format +- Use `default` for optional parameters so the API never receives null values +- `system_prompt` on the target customizes how the LLM formats the API response to the user + +Reference: https://github.com/katanemo/archgw + +--- + +*Generated from individual rule files in `rules/`.* +*To contribute, see [CONTRIBUTING](https://github.com/katanemo/archgw/blob/main/CONTRIBUTING.md).* diff --git a/skills/README.md b/skills/README.md new file mode 100644 index 00000000..d941fb93 --- /dev/null +++ b/skills/README.md @@ -0,0 +1,243 @@ +# Plano Agent Skills + +A structured repository of best practices for building agents and agentic applications with [Plano](https://github.com/katanemo/archgw) — the AI-native proxy and dataplane. Optimized for coding agents and LLMs. + +## What Are Skills? + +Skills are principle-based guides that help coding agents (Claude Code, Cursor, Copilot, etc.) make better decisions when working with Plano. They cover configuration patterns, routing strategies, agent orchestration, observability, and CLI workflows — acting as operating principles, not documentation replacements. + +## Installing + +```bash +# Install via npx skills +npx skills add katanemo/plano +``` + +This skills collection is published from the `skills/` directory in the `katanemo/plano` monorepo. 
+ +Install a specific skill: + +```bash +npx skills add katanemo/plano --skill plano-routing-model-selection +``` + +List available skills before install: + +```bash +npx skills add katanemo/plano --list +``` + +## Using Skills in Agents + +After installation, these skills are available to your coding agent and can be invoked with normal language. You do not need special syntax unless your tooling requires it. + +### Natural Language Invocation Examples + +- "Use the Plano skills to validate this `config.yaml` and fix issues." +- "Apply Plano routing best practices to improve model/provider selection." +- "Review this agent listener config with the orchestration rules." +- "Refactor this filter chain to follow guardrail ordering best practices." +- "Audit this setup against Plano deployment and security recommendations." + +### Prompting Tips for Better Results + +- Name your goal and file: "Harden `config.yaml` for production." +- Ask for an action: "Generate a patch," "fix directly," or "explain the changes." +- Include runtime context when relevant: trace output, logs, listener errors. +- Ask for verification: "Run a final validation check after edits." + +### Invoke by Skill Area (Optional) + +- **Configuration:** "Use Plano configuration fundamentals on this config." +- **Routing:** "Use routing/model-selection skills to tune defaults and aliases." +- **Agent orchestration:** "Use agent orchestration skills to improve routing accuracy." +- **Filters/guardrails:** "Use filter-chain skills to harden input/output safety." +- **Observability:** "Use observability skills to add traceability and debug routing." +- **CLI/deployment:** "Use CLI and deployment skills to produce a startup checklist." 
+ +## Available Skills + +- `plano-agent-skills` - Umbrella skill covering all Plano areas +- `plano-config-fundamentals` - Config versioning, listeners, providers, secrets +- `plano-routing-model-selection` - Defaults, aliases, passthrough auth, preferences +- `plano-agent-orchestration` - Agent registration and routing descriptions +- `plano-filter-guardrails` - MCP filters, guardrail messaging, filter ordering +- `plano-observability-debugging` - Tracing setup, span attributes, trace analysis +- `plano-cli-operations` - `planoai up`, `cli_agent`, init, prompt target generation +- `plano-deployment-security` - Docker networking, health checks, state storage +- `plano-advanced-patterns` - Multi-listener architecture and prompt target schema design + +## Local Testing + +```bash +# From repo root +npx skills add ./skills --list +npx skills add ./skills --skill plano-agent-skills -y +npx skills list +``` + +## Structure + +``` +skills/ +├── rules/ # Individual rule files (one per rule) +│ ├── _sections.md # Section metadata and prefix definitions +│ ├── _template.md # Template for creating new rules +│ ├── config-*.md # Section 1: Configuration Fundamentals +│ ├── routing-*.md # Section 2: Routing & Model Selection +│ ├── agent-*.md # Section 3: Agent Orchestration +│ ├── filter-*.md # Section 4: Filter Chains & Guardrails +│ ├── observe-*.md # Section 5: Observability & Debugging +│ ├── cli-*.md # Section 6: CLI Operations +│ ├── deploy-*.md # Section 7: Deployment & Security +│ └── advanced-*.md # Section 8: Advanced Patterns +├── src/ +│ ├── build.ts # Compiles rules/ into AGENTS.md +│ ├── validate.ts # Validates rule files +│ └── extract-tests.ts # Extracts test cases for LLM evaluation +├── metadata.json # Document metadata +├── AGENTS.md # Compiled output (generated — do not edit directly) +├── test-cases.json # Test cases for LLM evaluation (generated) +└── package.json +``` + +## Sections + +| # | Prefix | Section | Rules | +|---|--------|---------|-------| 
+| 1 | `config-` | Configuration Fundamentals | Version, listeners, providers, secrets, timeouts |
+| 2 | `routing-` | Routing & Model Selection | Preferences, aliases, defaults, passthrough |
+| 3 | `agent-` | Agent Orchestration | Descriptions, agent registration |
+| 4 | `filter-` | Filter Chains & Guardrails | Ordering, MCP integration, guardrails |
+| 5 | `observe-` | Observability & Debugging | Tracing, trace inspection, span attributes |
+| 6 | `cli-` | CLI Operations | Startup, CLI agent, init, code generation |
+| 7 | `deploy-` | Deployment & Security | Docker networking, state storage, health checks |
+| 8 | `advanced-` | Advanced Patterns | Prompt targets, rate limits, multi-listener |
+
+## Getting Started
+
+```bash
+# Install dependencies
+npm install
+
+# Validate all rule files
+npm run validate
+
+# Build AGENTS.md from rules
+npm run build
+
+# Extract test cases for LLM evaluation
+npm run extract-tests
+
+# Run all of the above
+npm run dev
+```
+
+## Creating a New Rule
+
+1. Copy `rules/_template.md` to `rules/<prefix>-<rule-name>.md`
+
+2. Choose the correct prefix for your section:
+   - `config-` — Configuration Fundamentals
+   - `routing-` — Routing & Model Selection
+   - `agent-` — Agent Orchestration
+   - `filter-` — Filter Chains & Guardrails
+   - `observe-` — Observability & Debugging
+   - `cli-` — CLI Operations
+   - `deploy-` — Deployment & Security
+   - `advanced-` — Advanced Patterns
+
+3. Fill in the frontmatter:
+   ```yaml
+   ---
+   title: Clear, Actionable Rule Title
+   impact: HIGH
+   impactDescription: One-line description of why this matters
+   tags: config, routing, relevant-tags
+   ---
+   ```
+
+4. Write the rule body with:
+   - Brief explanation of the principle and why it matters
+   - **Incorrect** example (YAML config or CLI command showing the wrong pattern)
+   - **Correct** example (the right pattern with comments)
+   - Optional explanatory notes
+
+5. 
Run `npm run dev` to validate and regenerate
+
+## Rule File Structure
+
+````markdown
+---
+title: Rule Title Here
+impact: CRITICAL
+impactDescription: One sentence on the impact
+tags: tag1, tag2, tag3
+---
+
+## Rule Title Here
+
+Brief explanation of the rule and why it matters for Plano developers.
+
+**Incorrect (describe what's wrong):**
+
+```yaml
+# Bad example
+```
+
+**Correct (describe what's right):**
+
+```yaml
+# Good example with comments explaining the decisions
+```
+
+Optional explanatory text, lists, or tables.
+
+Reference: https://github.com/katanemo/archgw
+````
+
+## Impact Levels
+
+| Level | Description |
+|-------|-------------|
+| `CRITICAL` | Causes startup failures or silent misbehavior — always fix |
+| `HIGH` | Significantly degrades routing accuracy, security, or reliability |
+| `MEDIUM-HIGH` | Important for production deployments |
+| `MEDIUM` | Best practice for maintainability and developer experience |
+| `LOW-MEDIUM` | Incremental improvements |
+| `LOW` | Nice to have |
+
+## Key Rules at a Glance
+
+- **Always set `version: v0.3.0`** — config is rejected without it
+- **Use `host.docker.internal`** for agent/filter URLs — `localhost` doesn't work inside Docker
+- **Set exactly one `default: true` provider** — unmatched requests need a fallback
+- **Write specific routing preference descriptions** — vague descriptions cause misroutes
+- **Order filter chains: guards → rewriters → context builders** — never build context before blocking bad input
+- **Use `$VAR_NAME` for all secrets** — never hardcode API keys in config.yaml
+- **Enable tracing with `--with-tracing`** — traces are the primary debugging tool
+
+## Scripts
+
+| Command | Description |
+|---------|-------------|
+| `npm run build` | Compile `rules/` into `AGENTS.md` |
+| `npm run validate` | Validate all rule files for required fields and structure |
+| `npm run extract-tests` | Generate `test-cases.json` for LLM evaluation |
+| `npm run dev` | Validate + build + 
extract tests | + +## Contributing + +Rules are automatically sorted alphabetically by title within each section — no need to manage numbers. IDs (`1.1`, `1.2`, etc.) are assigned during build. + +When adding rules: +1. Use the correct filename prefix for your section +2. Follow `_template.md` structure +3. Include clear bad/good YAML or CLI examples +4. Add relevant tags +5. Run `npm run dev` to validate and regenerate + +## License + +Apache-2.0 — see [LICENSE](../LICENSE) diff --git a/skills/metadata.json b/skills/metadata.json new file mode 100644 index 00000000..f1f754ab --- /dev/null +++ b/skills/metadata.json @@ -0,0 +1,8 @@ +{ + "version": "1.0.0", + "organization": "Plano", + "name": "plano-agent-skills", + "abstract": "Best practices for building agents and agentic applications with Plano — the AI-native proxy and dataplane. Covers configuration, routing, agent orchestration, filter chains, observability, CLI operations, and deployment patterns.", + "homepage": "https://github.com/katanemo/archgw", + "license": "Apache-2.0" +} diff --git a/skills/package-lock.json b/skills/package-lock.json new file mode 100644 index 00000000..080a8c7f --- /dev/null +++ b/skills/package-lock.json @@ -0,0 +1,594 @@ +{ + "name": "plano-agent-skills", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "plano-agent-skills", + "version": "1.0.0", + "license": "Apache-2.0", + "devDependencies": { + "@types/node": "^24.3.0", + "tsx": "^4.20.5", + "typescript": "^5.9.2" + }, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.27.3.tgz", + "integrity": "sha512-9fJMTNFTWZMh5qwrBItuziu834eOCUcEqymSH7pY+zoMVEZg3gcPuBNxH1EvfVYe9h0x/Ptw8KBzv7qxb7l8dg==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + 
"node_modules/@esbuild/android-arm": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.27.3.tgz", + "integrity": "sha512-i5D1hPY7GIQmXlXhs2w8AWHhenb00+GxjxRncS2ZM7YNVGNfaMxgzSGuO8o8SJzRc/oZwU2bcScvVERk03QhzA==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.27.3.tgz", + "integrity": "sha512-YdghPYUmj/FX2SYKJ0OZxf+iaKgMsKHVPF1MAq/P8WirnSpCStzKJFjOjzsW0QQ7oIAiccHdcqjbHmJxRb/dmg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.27.3.tgz", + "integrity": "sha512-IN/0BNTkHtk8lkOM8JWAYFg4ORxBkZQf9zXiEOfERX/CzxW3Vg1ewAhU7QSWQpVIzTW+b8Xy+lGzdYXV6UZObQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.27.3.tgz", + "integrity": "sha512-Re491k7ByTVRy0t3EKWajdLIr0gz2kKKfzafkth4Q8A5n1xTHrkqZgLLjFEHVD+AXdUGgQMq+Godfq45mGpCKg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.27.3.tgz", + "integrity": "sha512-vHk/hA7/1AckjGzRqi6wbo+jaShzRowYip6rt6q7VYEDX4LEy1pZfDpdxCBnGtl+A5zq8iXDcyuxwtv3hNtHFg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + 
"optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.27.3.tgz", + "integrity": "sha512-ipTYM2fjt3kQAYOvo6vcxJx3nBYAzPjgTCk7QEgZG8AUO3ydUhvelmhrbOheMnGOlaSFUoHXB6un+A7q4ygY9w==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.27.3.tgz", + "integrity": "sha512-dDk0X87T7mI6U3K9VjWtHOXqwAMJBNN2r7bejDsc+j03SEjtD9HrOl8gVFByeM0aJksoUuUVU9TBaZa2rgj0oA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.27.3.tgz", + "integrity": "sha512-s6nPv2QkSupJwLYyfS+gwdirm0ukyTFNl3KTgZEAiJDd+iHZcbTPPcWCcRYH+WlNbwChgH2QkE9NSlNrMT8Gfw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.27.3.tgz", + "integrity": "sha512-sZOuFz/xWnZ4KH3YfFrKCf1WyPZHakVzTiqji3WDc0BCl2kBwiJLCXpzLzUBLgmp4veFZdvN5ChW4Eq/8Fc2Fg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.27.3.tgz", + "integrity": 
"sha512-yGlQYjdxtLdh0a3jHjuwOrxQjOZYD/C9PfdbgJJF3TIZWnm/tMd/RcNiLngiu4iwcBAOezdnSLAwQDPqTmtTYg==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.27.3.tgz", + "integrity": "sha512-WO60Sn8ly3gtzhyjATDgieJNet/KqsDlX5nRC5Y3oTFcS1l0KWba+SEa9Ja1GfDqSF1z6hif/SkpQJbL63cgOA==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.27.3.tgz", + "integrity": "sha512-APsymYA6sGcZ4pD6k+UxbDjOFSvPWyZhjaiPyl/f79xKxwTnrn5QUnXR5prvetuaSMsb4jgeHewIDCIWljrSxw==", + "cpu": [ + "mips64el" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.27.3.tgz", + "integrity": "sha512-eizBnTeBefojtDb9nSh4vvVQ3V9Qf9Df01PfawPcRzJH4gFSgrObw+LveUyDoKU3kxi5+9RJTCWlj4FjYXVPEA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.27.3.tgz", + "integrity": "sha512-3Emwh0r5wmfm3ssTWRQSyVhbOHvqegUDRd0WhmXKX2mkHJe1SFCMJhagUleMq+Uci34wLSipf8Lagt4LlpRFWQ==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.27.3", + 
"resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.27.3.tgz", + "integrity": "sha512-pBHUx9LzXWBc7MFIEEL0yD/ZVtNgLytvx60gES28GcWMqil8ElCYR4kvbV2BDqsHOvVDRrOxGySBM9Fcv744hw==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.27.3.tgz", + "integrity": "sha512-Czi8yzXUWIQYAtL/2y6vogER8pvcsOsk5cpwL4Gk5nJqH5UZiVByIY8Eorm5R13gq+DQKYg0+JyQoytLQas4dA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.27.3.tgz", + "integrity": "sha512-sDpk0RgmTCR/5HguIZa9n9u+HVKf40fbEUt+iTzSnCaGvY9kFP0YKBWZtJaraonFnqef5SlJ8/TiPAxzyS+UoA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.27.3.tgz", + "integrity": "sha512-P14lFKJl/DdaE00LItAukUdZO5iqNH7+PjoBm+fLQjtxfcfFE20Xf5CrLsmZdq5LFFZzb5JMZ9grUwvtVYzjiA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.27.3.tgz", + "integrity": "sha512-AIcMP77AvirGbRl/UZFTq5hjXK+2wC7qFRGoHSDrZ5v5b8DK/GYpXW3CPRL53NkvDqb9D+alBiC/dV0Fb7eJcw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" 
+ } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.27.3.tgz", + "integrity": "sha512-DnW2sRrBzA+YnE70LKqnM3P+z8vehfJWHXECbwBmH/CU51z6FiqTQTHFenPlHmo3a8UgpLyH3PT+87OViOh1AQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openharmony-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/openharmony-arm64/-/openharmony-arm64-0.27.3.tgz", + "integrity": "sha512-NinAEgr/etERPTsZJ7aEZQvvg/A6IsZG/LgZy+81wON2huV7SrK3e63dU0XhyZP4RKGyTm7aOgmQk0bGp0fy2g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.27.3.tgz", + "integrity": "sha512-PanZ+nEz+eWoBJ8/f8HKxTTD172SKwdXebZ0ndd953gt1HRBbhMsaNqjTyYLGLPdoWHy4zLU7bDVJztF5f3BHA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.27.3.tgz", + "integrity": "sha512-B2t59lWWYrbRDw/tjiWOuzSsFh1Y/E95ofKz7rIVYSQkUYBjfSgf6oeYPNWHToFRr2zx52JKApIcAS/D5TUBnA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.27.3.tgz", + "integrity": "sha512-QLKSFeXNS8+tHW7tZpMtjlNb7HKau0QDpwm49u0vUp9y1WOF+PEzkU84y9GqYaAVW8aH8f3GcBck26jh54cX4Q==", + "cpu": [ + "ia32" + ], + "dev": true, + 
"license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.27.3.tgz", + "integrity": "sha512-4uJGhsxuptu3OcpVAzli+/gWusVGwZZHTlS63hh++ehExkVT8SgiEf7/uC/PclrPPkLhZqGgCTjd0VWLo6xMqA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@types/node": { + "version": "24.11.0", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.11.0.tgz", + "integrity": "sha512-fPxQqz4VTgPI/IQ+lj9r0h+fDR66bzoeMGHp8ASee+32OSGIkeASsoZuJixsQoVef1QJbeubcPBxKk22QVoWdw==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "node_modules/esbuild": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.27.3.tgz", + "integrity": "sha512-8VwMnyGCONIs6cWue2IdpHxHnAjzxnw2Zr7MkVxB2vjmQ2ivqGFb4LEG3SMnv0Gb2F/G/2yA8zUaiL1gywDCCg==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.27.3", + "@esbuild/android-arm": "0.27.3", + "@esbuild/android-arm64": "0.27.3", + "@esbuild/android-x64": "0.27.3", + "@esbuild/darwin-arm64": "0.27.3", + "@esbuild/darwin-x64": "0.27.3", + "@esbuild/freebsd-arm64": "0.27.3", + "@esbuild/freebsd-x64": "0.27.3", + "@esbuild/linux-arm": "0.27.3", + "@esbuild/linux-arm64": "0.27.3", + "@esbuild/linux-ia32": "0.27.3", + "@esbuild/linux-loong64": "0.27.3", + "@esbuild/linux-mips64el": "0.27.3", + "@esbuild/linux-ppc64": "0.27.3", + "@esbuild/linux-riscv64": "0.27.3", + "@esbuild/linux-s390x": "0.27.3", + "@esbuild/linux-x64": "0.27.3", + "@esbuild/netbsd-arm64": "0.27.3", + "@esbuild/netbsd-x64": "0.27.3", + "@esbuild/openbsd-arm64": "0.27.3", + 
"@esbuild/openbsd-x64": "0.27.3", + "@esbuild/openharmony-arm64": "0.27.3", + "@esbuild/sunos-x64": "0.27.3", + "@esbuild/win32-arm64": "0.27.3", + "@esbuild/win32-ia32": "0.27.3", + "@esbuild/win32-x64": "0.27.3" + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/get-tsconfig": { + "version": "4.13.6", + "resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.13.6.tgz", + "integrity": "sha512-shZT/QMiSHc/YBLxxOkMtgSid5HFoauqCE3/exfsEcwg1WkeqjG+V40yBbBrsD+jW2HDXcs28xOfcbm2jI8Ddw==", + "dev": true, + "license": "MIT", + "dependencies": { + "resolve-pkg-maps": "^1.0.0" + }, + "funding": { + "url": "https://github.com/privatenumber/get-tsconfig?sponsor=1" + } + }, + "node_modules/resolve-pkg-maps": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz", + "integrity": "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1" + } + }, + "node_modules/tsx": { + "version": "4.21.0", + "resolved": "https://registry.npmjs.org/tsx/-/tsx-4.21.0.tgz", + "integrity": "sha512-5C1sg4USs1lfG0GFb2RLXsdpXqBSEhAaA/0kPL01wxzpMqLILNxIxIOKiILz+cdg/pLnOUxFYOR5yhHU666wbw==", + "dev": true, + "license": "MIT", + "dependencies": { + "esbuild": "~0.27.0", + "get-tsconfig": "^4.7.5" + }, + "bin": { + "tsx": "dist/cli.mjs" + }, + "engines": { + "node": ">=18.0.0" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + } + }, + "node_modules/typescript": { + "version": "5.9.3", + 
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "7.16.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", + "integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==", + "dev": true, + "license": "MIT" + } + } +} diff --git a/skills/package.json b/skills/package.json new file mode 100644 index 00000000..eb33002f --- /dev/null +++ b/skills/package.json @@ -0,0 +1,31 @@ +{ + "name": "plano-agent-skills", + "version": "1.0.0", + "description": "Best practices for building agents and agentic applications with Plano — installable via npx skills add", + "type": "module", + "scripts": { + "typecheck": "tsc --noEmit", + "build": "tsx src/build.ts", + "validate": "tsx src/validate.ts", + "extract-tests": "tsx src/extract-tests.ts", + "dev": "npm run typecheck && npm run validate && npm run build && npm run extract-tests" + }, + "keywords": [ + "plano", + "archgw", + "ai-gateway", + "agent", + "llm", + "skills", + "best-practices" + ], + "license": "Apache-2.0", + "engines": { + "node": ">=18.0.0" + }, + "devDependencies": { + "@types/node": "^24.3.0", + "tsx": "^4.20.5", + "typescript": "^5.9.2" + } +} diff --git a/skills/plano-advanced-patterns/SKILL.md b/skills/plano-advanced-patterns/SKILL.md new file mode 100644 index 00000000..7e2f1b00 --- /dev/null +++ b/skills/plano-advanced-patterns/SKILL.md @@ -0,0 +1,32 @@ +--- +name: plano-advanced-patterns +description: Design advanced Plano architectures. Use for multi-listener systems, prompt target schema quality, and layered orchestration patterns. 
+license: Apache-2.0 +metadata: + author: katanemo + version: "1.0.0" +--- + +# Plano Advanced Patterns + +Use this skill for higher-order architecture decisions once fundamentals are stable. + +## When To Use + +- "Design a multi-listener Plano architecture" +- "Improve prompt target schema precision" +- "Combine model, prompt, and agent listeners" +- "Refine advanced routing/function-calling behavior" + +## Apply These Rules + +- `advanced-multi-listener` +- `advanced-prompt-targets` + +## Execution Checklist + +1. Use multiple listeners only when interfaces are truly distinct. +2. Keep provider/routing definitions shared and consistent. +3. Define prompt target parameters with strict, explicit schemas. +4. Minimize ambiguity that causes malformed tool calls. +5. Provide migration-safe recommendations and test scenarios. diff --git a/skills/plano-agent-orchestration/SKILL.md b/skills/plano-agent-orchestration/SKILL.md new file mode 100644 index 00000000..90f25beb --- /dev/null +++ b/skills/plano-agent-orchestration/SKILL.md @@ -0,0 +1,32 @@ +--- +name: plano-agent-orchestration +description: Improve multi-agent orchestration in Plano. Use for agent registration, agent listener wiring, and capability-focused agent descriptions for accurate routing. +license: Apache-2.0 +metadata: + author: katanemo + version: "1.0.0" +--- + +# Plano Agent Orchestration + +Use this skill for agent listener quality, sub-agent registration, and route accuracy. + +## When To Use + +- "Fix multi-agent routing" +- "Validate agents vs listeners.agents config" +- "Improve agent descriptions" +- "Set up a reliable orchestrator" + +## Apply These Rules + +- `agent-orchestration` +- `agent-descriptions` + +## Execution Checklist + +1. Verify each agent exists in both `agents` and `listeners[].agents`. +2. Ensure one fallback/default agent where appropriate. +3. Rewrite descriptions to be capability-focused and non-overlapping. +4. Keep descriptions specific, concise, and example-driven. +5. 
Provide test prompts to validate routing outcomes. diff --git a/skills/plano-agent-skills/SKILL.md b/skills/plano-agent-skills/SKILL.md new file mode 100644 index 00000000..e6ecbb20 --- /dev/null +++ b/skills/plano-agent-skills/SKILL.md @@ -0,0 +1,53 @@ +--- +name: plano-agent-skills +description: Best practices for building agents and agentic applications with Plano, including configuration, routing, orchestration, guardrails, observability, and deployment. +license: Apache-2.0 +metadata: + author: katanemo + version: "1.0.0" +--- + +# Plano Agent Skills + +Comprehensive Plano guidance for coding agents. Use this umbrella skill when a task spans multiple areas (config, routing, orchestration, filters, observability, CLI, deployment). + +## When To Use + +- Validating or fixing Plano `config.yaml` +- Designing listener architecture (`model`, `prompt`, `agent`) +- Improving model/provider routing quality and fallback behavior +- Hardening filter chains and prompt guardrails +- Debugging routing with traces and CLI workflows +- Preparing deployment and production readiness checks + +## How To Use + +1. Classify the request by scope (single section vs. cross-cutting). +2. For focused work, prefer a section-specific skill (for example `plano-routing-model-selection`). +3. For broad work, apply this umbrella skill and reference section rules from `skills/AGENTS.md`. +4. Produce concrete edits first, then concise reasoning and validation steps. + +## Operating Workflow + +1. Identify the task area first: config, routing, orchestration, filters, observability, CLI, or deployment. +2. Apply the smallest correct change that satisfies the requested behavior. +3. Preserve security and reliability defaults: + - `version: v0.3.0` + - exactly one `default: true` model provider + - secrets via `$ENV_VAR` substitution only + - `host.docker.internal` for host services from inside Docker + - guardrails before enrichment in filter chains +4. 
For debugging, prioritize traces over guesswork (`planoai up --with-tracing`, `planoai trace`). +5. Return concrete diffs and a short validation checklist. + +## Response Style + +- Prefer actionable edits over generic advice. +- Be explicit about why a config choice is correct. +- Call out risky patterns (hardcoded secrets, missing default provider, bad filter ordering). +- Keep examples minimal and production-viable. + +## References + +- Repo: https://github.com/katanemo/plano +- Full rulebook: `skills/AGENTS.md` diff --git a/skills/plano-cli-operations/SKILL.md b/skills/plano-cli-operations/SKILL.md new file mode 100644 index 00000000..da25db58 --- /dev/null +++ b/skills/plano-cli-operations/SKILL.md @@ -0,0 +1,34 @@ +--- +name: plano-cli-operations +description: Apply Plano CLI best practices. Use for startup troubleshooting, cli_agent workflows, prompt target generation, and template-based project bootstrapping. +license: Apache-2.0 +metadata: + author: katanemo + version: "1.0.0" +--- + +# Plano CLI Operations + +Use this skill when the task is primarily operational and CLI-driven. + +## When To Use + +- "Fix `planoai up` failures" +- "Use `planoai cli_agent` with coding agents" +- "Generate prompt targets from Python functions" +- "Bootstrap a project with `planoai init` templates" + +## Apply These Rules + +- `cli-startup` +- `cli-agent` +- `cli-generate` +- `cli-init` + +## Execution Checklist + +1. Follow startup validation order before deep debugging. +2. Use `cli_agent` to route coding-agent traffic through Plano. +3. Generate prompt target schema, then wire endpoint details explicitly. +4. Start from templates for reliable first-time setup. +5. Provide a compact runbook with exact CLI commands. 
diff --git a/skills/plano-config-fundamentals/SKILL.md b/skills/plano-config-fundamentals/SKILL.md new file mode 100644 index 00000000..87b7fbdd --- /dev/null +++ b/skills/plano-config-fundamentals/SKILL.md @@ -0,0 +1,34 @@ +--- +name: plano-config-fundamentals +description: Validate and fix Plano config fundamentals. Use for config versioning, listener types, provider registration, secrets handling, and startup validation failures. +license: Apache-2.0 +metadata: + author: katanemo + version: "1.0.0" +--- + +# Plano Configuration Fundamentals + +Use this skill for foundational `config.yaml` correctness. + +## When To Use + +- "Validate this Plano config" +- "Fix startup config errors" +- "Check listeners/providers/secrets" +- "Why does `planoai up` fail schema validation?" + +## Apply These Rules + +- `config-version` +- `config-listeners` +- `config-providers` +- `config-secrets` + +## Execution Checklist + +1. Ensure `version: v0.3.0` is present. +2. Confirm listener type matches intended architecture. +3. Verify provider names/interfaces and exactly one default provider. +4. Replace hardcoded secrets with `$ENV_VAR` substitution. +5. Return minimal patch and a `planoai up` verification plan. diff --git a/skills/plano-deployment-security/SKILL.md b/skills/plano-deployment-security/SKILL.md new file mode 100644 index 00000000..48256777 --- /dev/null +++ b/skills/plano-deployment-security/SKILL.md @@ -0,0 +1,33 @@ +--- +name: plano-deployment-security +description: Apply Plano deployment and production security practices. Use for Docker networking, state storage choices, readiness checks, and environment-based secret handling. +license: Apache-2.0 +metadata: + author: katanemo + version: "1.0.0" +--- + +# Plano Deployment and Security + +Use this skill to harden production deployments and reduce runtime surprises. 
+
+## When To Use
+
+- "Fix unreachable agents in Docker"
+- "Configure persistent conversation state"
+- "Add readiness and health checks"
+- "Prepare production deployment checklist"
+
+## Apply These Rules
+
+- `deploy-docker`
+- `deploy-state`
+- `deploy-health`
+
+## Execution Checklist
+
+1. Use `host.docker.internal` for host-side services from inside the Plano container.
+2. Prefer PostgreSQL state storage for production multi-turn workloads.
+3. Verify `/healthz` before traffic or CI assertions.
+4. Ensure secrets remain environment-based, never hardcoded.
+5. Return deployment checks with failure-mode diagnostics.
diff --git a/skills/plano-filter-guardrails/SKILL.md b/skills/plano-filter-guardrails/SKILL.md
new file mode 100644
index 00000000..2f19e67b
--- /dev/null
+++ b/skills/plano-filter-guardrails/SKILL.md
@@ -0,0 +1,33 @@
+---
+name: plano-filter-guardrails
+description: Harden Plano filter chains and guardrails. Use for MCP filter setup, prompt guard responses, and safe filter ordering.
+license: Apache-2.0
+metadata:
+  author: katanemo
+  version: "1.0.0"
+---
+
+# Plano Filter Chains and Guardrails
+
+Use this skill when safety controls or filter pipelines need correction.
+
+## When To Use
+
+- "Fix filter chain ordering"
+- "Set up MCP filters correctly"
+- "Improve guardrail rejection behavior"
+- "Harden request processing for safety"
+
+## Apply These Rules
+
+- `filter-mcp`
+- `filter-guardrails`
+- `filter-ordering`
+
+## Execution Checklist
+
+1. Configure filter `type`, `transport`, and `tool` explicitly for MCP.
+2. Ensure rejection messages are clear and actionable.
+3. Order chain as guards -> rewriters -> enrichment -> output checks.
+4. Prevent expensive enrichment on unsafe requests.
+5. Verify with representative blocked and allowed test prompts.
diff --git a/skills/plano-observability-debugging/SKILL.md b/skills/plano-observability-debugging/SKILL.md new file mode 100644 index 00000000..c4039a7f --- /dev/null +++ b/skills/plano-observability-debugging/SKILL.md @@ -0,0 +1,33 @@ +--- +name: plano-observability-debugging +description: Improve Plano tracing and debugging workflows. Use for sampling strategy, span attributes, and trace query-based root-cause analysis. +license: Apache-2.0 +metadata: + author: katanemo + version: "1.0.0" +--- + +# Plano Observability and Debugging + +Use this skill to make routing and latency behavior inspectable and debuggable. + +## When To Use + +- "Enable tracing correctly" +- "Add useful span attributes" +- "Debug why a request routed incorrectly" +- "Inspect filter/model latency from traces" + +## Apply These Rules + +- `observe-tracing` +- `observe-span-attributes` +- `observe-trace-query` + +## Execution Checklist + +1. Enable tracing with environment-appropriate sampling. +2. Add useful static and header-derived span attributes. +3. Use `planoai trace` filters to isolate route and latency issues. +4. Prefer trace evidence over assumptions in recommendations. +5. Return exact commands to reproduce and validate findings. diff --git a/skills/plano-routing-model-selection/SKILL.md b/skills/plano-routing-model-selection/SKILL.md new file mode 100644 index 00000000..083f21c8 --- /dev/null +++ b/skills/plano-routing-model-selection/SKILL.md @@ -0,0 +1,34 @@ +--- +name: plano-routing-model-selection +description: Optimize Plano model routing and selection. Use for provider defaults, model aliases, passthrough auth, and routing preference quality. +license: Apache-2.0 +metadata: + author: katanemo + version: "1.0.0" +--- + +# Plano Routing and Model Selection + +Use this skill when requests are routed to the wrong model, costs are high, or fallback behavior is unclear. 
+ +## When To Use + +- "Improve model routing" +- "Add aliases and defaults" +- "Fix passthrough auth with proxy providers" +- "Tune routing preferences for better classification" + +## Apply These Rules + +- `routing-default` +- `routing-aliases` +- `routing-passthrough` +- `routing-preferences` + +## Execution Checklist + +1. Ensure exactly one `default: true` provider. +2. Add semantic aliases for stable client contracts. +3. Configure passthrough auth only where required. +4. Rewrite vague preference descriptions with concrete task scopes. +5. Validate routing behavior using trace-based checks. diff --git a/skills/rules/_sections.md b/skills/rules/_sections.md new file mode 100644 index 00000000..a74c77f8 --- /dev/null +++ b/skills/rules/_sections.md @@ -0,0 +1,16 @@ +# Section Definitions + +This file defines the sections used to organize Plano agent skills rules. +Files are assigned to sections based on their filename prefix. + + +| Prefix | Section # | Title | Impact | Description | +| ----------- | --------- | -------------------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------- | +| `config-` | 1 | Configuration Fundamentals | CRITICAL | Core config.yaml structure, versioning, listener types, and provider setup — the entry point for every Plano deployment | +| `routing-` | 2 | Routing & Model Selection | HIGH | Intelligent LLM routing using preferences, aliases, and defaults to match tasks to the best model | +| `agent-` | 3 | Agent Orchestration | HIGH | Multi-agent patterns, agent descriptions, and orchestration strategies for building agentic applications | +| `filter-` | 4 | Filter Chains & Guardrails | HIGH | Request/response processing pipelines — ordering, MCP integration, and safety guardrails | +| `observe-` | 5 | Observability & Debugging | MEDIUM-HIGH | OpenTelemetry tracing, log levels, span attributes, and sampling for production visibility | +| `cli-` 
| 6 | CLI Operations | MEDIUM | Using the planoai CLI for startup, tracing, CLI agents, project init, and code generation | +| `deploy-` | 7 | Deployment & Security | HIGH | Docker deployment, environment variable management, health checks, and state storage for production | +| `advanced-` | 8 | Advanced Patterns | MEDIUM | Prompt targets, external API integration, and multi-listener architectures | diff --git a/skills/rules/_template.md b/skills/rules/_template.md new file mode 100644 index 00000000..9566063e --- /dev/null +++ b/skills/rules/_template.md @@ -0,0 +1,26 @@ +--- +title: Rule Title Here +impact: MEDIUM +impactDescription: Optional one-line description of the impact +tags: tag1, tag2, tag3 +--- + +## Rule Title Here + +Brief explanation of what this rule is and why it matters for Plano developers and agents. + +**Incorrect (explain what's wrong):** + +```yaml +# Bad config or CLI example +``` + +**Correct (explain what's right):** + +```yaml +# Good config or CLI example +``` + +Optional explanatory text elaborating on the principle or listing key points. + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/advanced-multi-listener.md b/skills/rules/advanced-multi-listener.md new file mode 100644 index 00000000..81c8d4d9 --- /dev/null +++ b/skills/rules/advanced-multi-listener.md @@ -0,0 +1,139 @@ +--- +title: Combine Multiple Listener Types for Layered Agent Architectures +impact: MEDIUM +impactDescription: Using a single listener type forces all traffic through one gateway pattern — combining types lets you serve different clients with the right interface without running multiple Plano instances +tags: advanced, multi-listener, architecture, agent, model, prompt +--- + +## Combine Multiple Listener Types for Layered Agent Architectures + +A single Plano `config.yaml` can define multiple listeners of different types, each on a separate port. 
This lets you serve different client types simultaneously: an OpenAI-compatible model gateway for direct API clients, a prompt gateway for LLM-callable function applications, and an agent orchestrator for multi-agent workflows — all from one Plano instance sharing the same model providers. + +**Single listener (limited — forces all clients through one interface):** + +```yaml +version: v0.3.0 + +listeners: + - type: model # Only model clients can use this + name: model_gateway + port: 12000 + +# Prompt target clients and agent clients cannot connect +``` + +**Multi-listener architecture (serves all client types):** + +```yaml +version: v0.3.0 + +# --- Shared model providers --- +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + routing_preferences: + - name: quick tasks + description: Short answers, formatting, classification, simple generation + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: complex reasoning + description: Multi-step analysis, code generation, research synthesis + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + routing_preferences: + - name: long documents + description: Summarizing or analyzing very long documents, PDFs, transcripts + +# --- Listener 1: OpenAI-compatible API gateway --- +# For: SDK clients, Claude Code, LangChain, etc. +listeners: + - type: model + name: model_gateway + port: 12000 + timeout: "120s" + +# --- Listener 2: Prompt function gateway --- +# For: Applications that expose LLM-callable APIs + - type: prompt + name: function_gateway + port: 10000 + timeout: "60s" + +# --- Listener 3: Agent orchestration gateway --- +# For: Multi-agent application clients + - type: agent + name: agent_orchestrator + port: 8000 + timeout: "90s" + router: plano_orchestrator_v1 + agents: + - id: research_agent + description: Searches, synthesizes, and summarizes information from multiple sources. 
+ filter_chain: + - input_guards + - context_builder + - id: code_agent + description: Writes, reviews, debugs, and explains code across all languages. + default: true + +# --- Agents --- +agents: + - id: research_agent + url: http://host.docker.internal:8001 + - id: code_agent + url: http://host.docker.internal:8002 + +# --- Filters --- +filters: + - id: input_guards + url: http://host.docker.internal:10500 + type: mcp + transport: streamable-http + - id: context_builder + url: http://host.docker.internal:10501 + type: mcp + transport: streamable-http + +# --- Prompt targets (for function gateway) --- +endpoints: + internal_api: + endpoint: host.docker.internal + protocol: http + +prompt_targets: + - name: search_knowledge_base + description: Search the internal knowledge base for relevant documents and facts. + parameters: + - name: query + type: str + required: true + description: Search query to find relevant information + endpoint: + name: internal_api + path: /kb/search?q={query} + http_method: GET + +# --- Observability --- +model_aliases: + plano.fast.v1: + target: gpt-4o-mini + plano.smart.v1: + target: gpt-4o + +tracing: + random_sampling: 50 + trace_arch_internal: true + span_attributes: + static: + environment: production + header_prefixes: + - x-katanemo- +``` + +This architecture serves: SDK clients on `:12000`, function-calling apps on `:10000`, and multi-agent orchestration on `:8000` — with shared cost-optimized routing across all three. 
+ +Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw) diff --git a/skills/rules/advanced-prompt-targets.md b/skills/rules/advanced-prompt-targets.md new file mode 100644 index 00000000..88f376fd --- /dev/null +++ b/skills/rules/advanced-prompt-targets.md @@ -0,0 +1,128 @@ +--- +title: Design Prompt Targets with Precise Parameter Schemas +impact: HIGH +impactDescription: Imprecise parameter definitions cause the LLM to hallucinate values, skip required fields, or produce malformed API calls — the schema is the contract between the LLM and your API +tags: advanced, prompt-targets, functions, llm, api-integration +--- + +## Design Prompt Targets with Precise Parameter Schemas + +`prompt_targets` define functions that Plano's LLM can call autonomously when it determines a user request matches the function's description. The parameter schema tells the LLM exactly what values to extract from user input — vague schemas lead to hallucinated parameters and failed API calls. + +**Incorrect (too few constraints — LLM must guess):** + +```yaml +prompt_targets: + - name: get_flight_info + description: Get flight information + parameters: + - name: flight # What format? "AA123"? "AA 123"? "American 123"? + type: str + required: true + endpoint: + name: flights_api + path: /flight?id={flight} +``` + +**Correct (fully specified schema with descriptions, formats, and enums):** + +```yaml +version: v0.3.0 + +endpoints: + flights_api: + endpoint: api.flightaware.com + protocol: https + connect_timeout: "5s" + +prompt_targets: + - name: get_flight_status + description: > + Get real-time status, gate information, and delays for a specific flight number. + Use when the user asks about a flight's current status, arrival time, or gate. + parameters: + - name: flight_number + description: > + IATA airline code followed by flight number, e.g., "AA123", "UA456", "DL789". + Extract from user message — do not include spaces. 
+ type: str + required: true + format: "^[A-Z]{2}[0-9]{1,4}$" # Regex hint for validation + + - name: date + description: > + Flight date in YYYY-MM-DD format. Use today's date if not specified. + type: str + required: false + format: date + + endpoint: + name: flights_api + path: /flights/{flight_number}?date={date} + http_method: GET + http_headers: + Authorization: "Bearer $FLIGHTAWARE_API_KEY" + + - name: search_flights + description: > + Search for available flights between two cities or airports. + Use when the user wants to find flights, compare options, or book travel. + parameters: + - name: origin + description: Departure airport IATA code (e.g., "JFK", "LAX", "ORD") + type: str + required: true + - name: destination + description: Arrival airport IATA code (e.g., "LHR", "CDG", "NRT") + type: str + required: true + - name: departure_date + description: Departure date in YYYY-MM-DD format + type: str + required: true + format: date + - name: cabin_class + description: Preferred cabin class + type: str + required: false + default: economy + enum: [economy, premium_economy, business, first] + - name: passengers + description: Number of adult passengers (1-9) + type: int + required: false + default: 1 + + endpoint: + name: flights_api + path: /search?from={origin}&to={destination}&date={departure_date}&class={cabin_class}&pax={passengers} + http_method: GET + http_headers: + Authorization: "Bearer $FLIGHTAWARE_API_KEY" + + system_prompt: | + You are a travel assistant. Present flight search results clearly, + highlighting the best value options. Include price, duration, and + number of stops for each option. 
+ +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + +listeners: + - type: prompt + name: travel_functions + port: 10000 + timeout: "30s" +``` + +**Key principles:** +- `description` on the target tells the LLM when to call it — be specific about trigger conditions +- `description` on each parameter tells the LLM what value to extract — include format examples +- Use `enum` to constrain categorical values — prevents the LLM from inventing categories +- Use `format: date` or regex patterns to hint at expected format +- Use `default` for optional parameters so the API never receives null values +- `system_prompt` on the target customizes how the LLM formats the API response to the user + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/agent-descriptions.md b/skills/rules/agent-descriptions.md new file mode 100644 index 00000000..86728bde --- /dev/null +++ b/skills/rules/agent-descriptions.md @@ -0,0 +1,75 @@ +--- +title: Write Capability-Focused Agent Descriptions for Accurate Routing +impact: HIGH +impactDescription: The orchestrator LLM routes requests purely by reading agent descriptions — poor descriptions cause misroutes to the wrong specialized agent +tags: agent, orchestration, descriptions, routing, multi-agent +--- + +## Write Capability-Focused Agent Descriptions for Accurate Routing + +In an `agent` listener, Plano's orchestrator reads each agent's `description` and routes user requests to the best-matching agent. This is LLM-based intent matching — the description is the entire specification the router sees. Write it as a capability manifest: what can this agent do, what data does it have access to, and what types of requests should it handle? 
+ +**Incorrect (generic, overlapping descriptions):** + +```yaml +listeners: + - type: agent + name: orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: agent_1 + description: Helps users with information # Too generic — matches everything + + - id: agent_2 + description: Also helps users # Indistinguishable from agent_1 +``` + +**Correct (specific capabilities, distinct domains, concrete examples):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + - id: flight_agent + url: http://host.docker.internal:8002 + - id: hotel_agent + url: http://host.docker.internal:8003 + +listeners: + - type: agent + name: travel_orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: > + Provides real-time weather conditions and multi-day forecasts for any city + worldwide. Handles questions about temperature, precipitation, wind, humidity, + sunrise/sunset times, and severe weather alerts. Examples: "What's the weather + in Tokyo?", "Will it rain in London this weekend?", "Sunrise time in New York." + + - id: flight_agent + description: > + Provides live flight status, schedules, gate information, delays, and + aircraft details for any flight number or route between airports. + Handles questions about departures, arrivals, and airline information. + Examples: "Is AA123 on time?", "Flights from JFK to LAX tomorrow." + + - id: hotel_agent + description: > + Searches and books hotel accommodations, compares room types, pricing, + and availability. Handles check-in/check-out dates, amenities, and + cancellation policies. Examples: "Hotels near Times Square for next Friday." 
+``` + +**Description writing checklist:** +- State the primary domain in the first sentence +- List 3–5 specific data types or question categories this agent handles +- Include 2–3 concrete example user queries in quotes +- Avoid capability overlap between agents — if they overlap, the router will split traffic unpredictably +- Keep descriptions under 150 words — the orchestrator reads all descriptions per request + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/agent-orchestration.md b/skills/rules/agent-orchestration.md new file mode 100644 index 00000000..0e6d7bb3 --- /dev/null +++ b/skills/rules/agent-orchestration.md @@ -0,0 +1,88 @@ +--- +title: Register All Sub-Agents in Both `agents` and `listeners.agents` +impact: CRITICAL +impactDescription: An agent registered only in `agents` but not referenced in a listener's agent list is unreachable; an agent listed in a listener but missing from `agents` causes a startup error +tags: agent, orchestration, config, multi-agent +--- + +## Register All Sub-Agents in Both `agents` and `listeners.agents` + +Plano's agent system has two separate concepts: the global `agents` array (defines the agent's ID and backend URL) and the `listeners[].agents` array (controls which agents are available to an orchestrator and provides their routing descriptions). Both must reference the same agent ID. + +**Incorrect (agent defined globally but not referenced in listener):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + - id: news_agent # Defined but never referenced in any listener + url: http://host.docker.internal:8002 + +listeners: + - type: agent + name: orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: Provides weather forecasts and current conditions. 
+ # news_agent is missing here — the orchestrator cannot route to it +``` + +**Incorrect (listener references an agent ID not in the global agents list):** + +```yaml +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + +listeners: + - type: agent + name: orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: Provides weather forecasts. + - id: flights_agent # ID not in global agents[] — startup error + description: Provides flight status information. +``` + +**Correct (every agent ID appears in both places):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + - id: flights_agent + url: http://host.docker.internal:8002 + - id: hotels_agent + url: http://host.docker.internal:8003 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + +listeners: + - type: agent + name: travel_orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: Real-time weather, forecasts, and climate data for any city. + - id: flights_agent + description: Live flight status, schedules, gates, and delays. + - id: hotels_agent + description: Hotel search, availability, pricing, and booking. + default: true # Fallback if no other agent matches +``` + +Set `default: true` on one agent in each listener's agents list to handle unmatched requests. The agent's URL in the global `agents` array is the HTTP endpoint Plano forwards matching requests to — it must be reachable from within the Docker container (use `host.docker.internal` for services on the host). 
+ +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/cli-agent.md b/skills/rules/cli-agent.md new file mode 100644 index 00000000..e311e99e --- /dev/null +++ b/skills/rules/cli-agent.md @@ -0,0 +1,86 @@ +--- +title: Use `planoai cli_agent` to Connect Claude Code Through Plano +impact: MEDIUM-HIGH +impactDescription: Running Claude Code directly against provider APIs bypasses Plano's routing, observability, and guardrails — cli_agent routes all Claude Code traffic through your configured Plano instance +tags: cli, cli-agent, claude, coding-agent, integration +--- + +## Use `planoai cli_agent` to Connect Claude Code Through Plano + +`planoai cli_agent` starts a Claude Code session that routes all LLM traffic through your running Plano instance instead of directly to Anthropic. This gives you routing preferences, model aliases, tracing, and guardrails for your coding agent workflows — making Claude Code a first-class citizen of your Plano configuration. + +**Prerequisites:** + +```bash +# 1. Plano must be running with a model listener +planoai up config.yaml + +# 2. ANTHROPIC_API_KEY must be set (Claude Code uses it for auth) +export ANTHROPIC_API_KEY=sk-ant-... +``` + +**Starting the CLI agent:** + +```bash +# Start CLI agent using config.yaml in current directory +planoai cli_agent claude + +# Use a specific config file +planoai cli_agent claude config.yaml + +# Use a config in a different directory +planoai cli_agent claude --path /path/to/project +``` + +**Recommended config for Claude Code routing:** + +```yaml +version: v0.3.0 + +listeners: + - type: model + name: claude_code_router + port: 12000 + +model_providers: + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + default: true + routing_preferences: + - name: general coding + description: > + Writing code, debugging, code review, explaining concepts, + answering programming questions, general development tasks. 
+
+  - model: anthropic/claude-opus-4-6
+    access_key: $ANTHROPIC_API_KEY
+    routing_preferences:
+      - name: complex architecture
+        description: >
+          System design, complex refactoring across many files,
+          architectural decisions, performance optimization, security audits.
+
+model_aliases:
+  claude.fast.v1:
+    target: claude-sonnet-4-20250514
+  claude.smart.v1:
+    target: claude-opus-4-6
+
+tracing:
+  random_sampling: 100
+  trace_arch_internal: true
+
+overrides:
+  upstream_connect_timeout: "10s"
+```
+
+**What happens when cli_agent runs:**
+
+1. Reads your config.yaml to find the model listener port
+2. Configures Claude Code to use `http://localhost:<listener_port>` as its API endpoint
+3. Starts a Claude Code session in your terminal
+4. All Claude Code LLM calls flow through Plano — routing, tracing, and guardrails apply
+
+After your session, use `planoai trace` to inspect every LLM call Claude Code made, which model was selected, and why.
+
+Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw)
diff --git a/skills/rules/cli-generate.md b/skills/rules/cli-generate.md
new file mode 100644
index 00000000..75ae8e4f
--- /dev/null
+++ b/skills/rules/cli-generate.md
@@ -0,0 +1,91 @@
+---
+title: Generate Prompt Targets from Python Functions with `planoai generate_prompt_targets`
+impact: MEDIUM
+impactDescription: Manually writing prompt_targets YAML for existing Python APIs is error-prone — the generator introspects function signatures and produces correct YAML automatically
+tags: cli, generate, prompt-targets, python, code-generation
+---
+
+## Generate Prompt Targets from Python Functions with `planoai generate_prompt_targets`
+
+`planoai generate_prompt_targets` introspects Python function signatures and docstrings to generate `prompt_targets` YAML for your Plano config. This is the fastest way to expose existing Python APIs as LLM-callable functions without manually writing the YAML schema.
+ +**Python function requirements for generation:** +- Use simple type annotations: `int`, `float`, `bool`, `str`, `list`, `tuple`, `set`, `dict` +- Include a docstring describing what the function does (becomes the `description`) +- Complex Pydantic models must be flattened into primitive typed parameters first + +**Example Python file:** + +```python +# api.py + +def get_stock_quote(symbol: str, exchange: str = "NYSE") -> dict: + """Get the current stock price and trading data for a given stock symbol. + + Returns price, volume, market cap, and 24h change percentage. + """ + # Implementation calls stock API + pass + +def get_weather_forecast(city: str, days: int = 3, units: str = "celsius") -> dict: + """Get the weather forecast for a city. + + Returns temperature, precipitation, and conditions for the specified number of days. + """ + pass + +def search_flights(origin: str, destination: str, date: str, passengers: int = 1) -> list: + """Search for available flights between two airports on a given date. + + Date format: YYYY-MM-DD. Returns list of flight options with prices. + """ + pass +``` + +**Running the generator:** + +```bash +planoai generate_prompt_targets --file api.py +``` + +**Generated output (add to your config.yaml):** + +```yaml +prompt_targets: + - name: get_stock_quote + description: Get the current stock price and trading data for a given stock symbol. + parameters: + - name: symbol + type: str + required: true + - name: exchange + type: str + required: false + default: NYSE + # Add endpoint manually: + endpoint: + name: stock_api + path: /quote?symbol={symbol}&exchange={exchange} + + - name: get_weather_forecast + description: Get the weather forecast for a city. 
+ parameters: + - name: city + type: str + required: true + - name: days + type: int + required: false + default: 3 + - name: units + type: str + required: false + default: celsius + endpoint: + name: weather_api + path: /forecast?city={city}&days={days}&units={units} +``` + +After generation, manually add the `endpoint` blocks pointing to your actual API. The generator produces the schema; you wire in the connectivity. + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/cli-init.md b/skills/rules/cli-init.md new file mode 100644 index 00000000..740396ae --- /dev/null +++ b/skills/rules/cli-init.md @@ -0,0 +1,66 @@ +--- +title: Use `planoai init` Templates to Bootstrap New Projects Correctly +impact: MEDIUM +impactDescription: Starting from a blank config.yaml leads to missing required fields and common structural mistakes — templates provide validated, idiomatic starting points +tags: cli, init, templates, getting-started, project-setup +--- + +## Use `planoai init` Templates to Bootstrap New Projects Correctly + +`planoai init` generates a valid `config.yaml` from built-in templates. Each template demonstrates a specific Plano capability with correct structure, realistic examples, and comments. Use this instead of writing config from scratch — it ensures you start with a valid, working configuration. 
+ +**Available templates:** + +| Template ID | What It Demonstrates | Best For | +|---|---|---| +| `sub_agent_orchestration` | Multi-agent routing with specialized sub-agents | Building agentic applications | +| `coding_agent_routing` | Routing preferences + model aliases for coding workflows | Claude Code and coding assistants | +| `preference_aware_routing` | Automatic LLM routing based on task type | Multi-model cost optimization | +| `filter_chain_guardrails` | Input guards, query rewrite, context builder | RAG + safety pipelines | +| `conversational_state_v1_responses` | Stateful conversations with memory | Chatbots, multi-turn assistants | + +**Usage:** + +```bash +# Initialize with a template +planoai init --template sub_agent_orchestration + +# Initialize coding agent routing setup +planoai init --template coding_agent_routing + +# Initialize a RAG with guardrails project +planoai init --template filter_chain_guardrails +``` + +**Typical project setup workflow:** + +```bash +# 1. Create project directory +mkdir my-plano-agent && cd my-plano-agent + +# 2. Bootstrap with the closest matching template +planoai init --template preference_aware_routing + +# 3. Edit config.yaml to add your specific models, agents, and API keys +# (keys are already using $VAR substitution — just set your env vars) + +# 4. Create .env file for local development +cat > .env << EOF +OPENAI_API_KEY=sk-proj-... +ANTHROPIC_API_KEY=sk-ant-... +EOF + +echo ".env" >> .gitignore + +# 5. Start Plano +planoai up + +# 6. Test your configuration +curl http://localhost:12000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "gpt-4o", "messages": [{"role": "user", "content": "Hello"}]}' +``` + +Start with `preference_aware_routing` for most LLM gateway use cases and `sub_agent_orchestration` for multi-agent applications. Both can be combined after you understand each independently. 
+ +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/cli-startup.md b/skills/rules/cli-startup.md new file mode 100644 index 00000000..2d51927c --- /dev/null +++ b/skills/rules/cli-startup.md @@ -0,0 +1,80 @@ +--- +title: Follow the `planoai up` Validation Workflow Before Debugging Runtime Issues +impact: HIGH +impactDescription: `planoai up` validates config, checks API keys, and health-checks all listeners — skipping this diagnostic information leads to unnecessary debugging of container or network issues +tags: cli, startup, validation, debugging, workflow +--- + +## Follow the `planoai up` Validation Workflow Before Debugging Runtime Issues + +`planoai up` is the entry point for running Plano. It performs sequential checks before the container starts: schema validation, API key presence check, container startup, and health checks on all configured listener ports. Understanding what each failure stage means prevents chasing the wrong root cause. + +**Validation stages and failure signals:** + +``` +Stage 1: Schema validation → "config.yaml: invalid against schema" +Stage 2: API key check → "Missing required environment variables: OPENAI_API_KEY" +Stage 3: Container start → "Docker daemon not running" or image pull errors +Stage 4: Health check (/healthz) → "Listener not healthy after 120s" (timeout) +``` + +**Development startup workflow:** + +```bash +# Standard startup — config.yaml in current directory +planoai up + +# Explicit config file path +planoai up my-config.yaml + +# Start in foreground to see all logs immediately (great for debugging) +planoai up config.yaml --foreground + +# Start with built-in OTEL trace collector +planoai up config.yaml --with-tracing + +# Enable verbose logging for debugging routing decisions +LOG_LEVEL=debug planoai up config.yaml --foreground +``` + +**Checking what's running:** + +```bash +# Stream recent logs (last N lines, then exit) +planoai logs + +# Follow logs in real-time +planoai logs --follow + 
+# Include Envoy/gateway debug messages +planoai logs --debug --follow +``` + +**Stopping and restarting after config changes:** + +```bash +# Stop the current container +planoai down + +# Restart with updated config +planoai up config.yaml +``` + +**Common failure patterns:** + +```bash +# API key missing — check your .env file or shell environment +export OPENAI_API_KEY=sk-proj-... +planoai up config.yaml + +# Health check timeout — listener port may conflict +# Check if another process uses port 12000 +lsof -i :12000 + +# Container fails to start — verify Docker daemon is running +docker ps +``` + +`planoai down` fully stops and removes the Plano container. Always run `planoai down` before `planoai up` when changing config to avoid stale container state. + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/config-listeners.md b/skills/rules/config-listeners.md new file mode 100644 index 00000000..d40a3e30 --- /dev/null +++ b/skills/rules/config-listeners.md @@ -0,0 +1,64 @@ +--- +title: Choose the Right Listener Type for Your Use Case +impact: CRITICAL +impactDescription: The listener type determines the entire request processing pipeline — choosing the wrong type means features like prompt functions or agent routing are unavailable +tags: config, listeners, architecture, routing +--- + +## Choose the Right Listener Type for Your Use Case + +Plano supports three listener types, each serving a distinct purpose. `listeners` is the only required top-level array in a Plano config. Every listener needs at minimum a `type`, `name`, and `port`. 
+ +| Type | Use When | Key Feature | +|------|----------|-------------| +| `model` | You want an OpenAI-compatible LLM gateway | Routes to multiple LLM providers, supports model aliases and routing preferences | +| `prompt` | You want LLM-callable custom functions | Define `prompt_targets` that the LLM dispatches as function calls | +| `agent` | You want multi-agent orchestration | Routes user requests to specialized sub-agents by matching agent descriptions | + +**Incorrect (using `model` when agents need orchestration):** + +```yaml +version: v0.3.0 + +# Wrong: a model listener cannot route to backend agent services +listeners: + - type: model + name: main + port: 12000 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 +``` + +**Correct (use `agent` listener for multi-agent systems):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + - id: travel_agent + url: http://host.docker.internal:8002 + +listeners: + - type: agent + name: orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: Provides real-time weather, forecasts, and conditions for any city. + - id: travel_agent + description: Books flights, hotels, and travel itineraries. + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true +``` + +A single Plano instance can expose multiple listeners on different ports, each with a different type, to serve different clients simultaneously. 
+ +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/config-providers.md b/skills/rules/config-providers.md new file mode 100644 index 00000000..30476cd5 --- /dev/null +++ b/skills/rules/config-providers.md @@ -0,0 +1,64 @@ +--- +title: Register Model Providers with Correct Format Identifiers +impact: CRITICAL +impactDescription: Incorrect provider format causes request translation failures — Plano must know the wire format each provider expects +tags: config, model-providers, llm, api-format +--- + +## Register Model Providers with Correct Format Identifiers + +Plano translates requests between its internal format and each provider's API. The `model` field uses `provider/model-name` syntax which determines both the upstream endpoint and the request/response translation layer. Some providers require an explicit `provider_interface` override. + +**Provider format reference:** + +| Model prefix | Wire format | Example | +|---|---|---| +| `openai/*` | OpenAI | `openai/gpt-4o` | +| `anthropic/*` | Anthropic | `anthropic/claude-sonnet-4-20250514` | +| `gemini/*` | Google Gemini | `gemini/gemini-2.0-flash` | +| `mistral/*` | Mistral | `mistral/mistral-large-latest` | +| `groq/*` | Groq | `groq/llama-3.3-70b-versatile` | +| `deepseek/*` | DeepSeek | `deepseek/deepseek-chat` | +| `xai/*` | Grok (OpenAI-compat) | `xai/grok-2` | +| `together_ai/*` | Together.ai | `together_ai/meta-llama/Llama-3` | +| `custom/*` | Requires `provider_interface` | `custom/my-local-model` | + +**Incorrect (missing provider prefix, ambiguous format):** + +```yaml +model_providers: + - model: gpt-4o # Missing openai/ prefix — Plano cannot route this + access_key: $OPENAI_API_KEY + + - model: claude-3-5-sonnet # Missing anthropic/ prefix + access_key: $ANTHROPIC_API_KEY +``` + +**Correct (explicit provider prefixes):** + +```yaml +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + + - model: anthropic/claude-sonnet-4-20250514 + 
access_key: $ANTHROPIC_API_KEY + + - model: gemini/gemini-2.0-flash + access_key: $GOOGLE_API_KEY +``` + +**For local or self-hosted models (Ollama, LiteLLM, vLLM):** + +```yaml +model_providers: + - model: custom/llama3 + base_url: http://host.docker.internal:11434/v1 # Ollama endpoint + provider_interface: openai # Ollama speaks OpenAI format + default: true +``` + +Always set `default: true` on exactly one provider per listener so Plano has a fallback when routing preferences do not match. + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/config-secrets.md b/skills/rules/config-secrets.md new file mode 100644 index 00000000..5f585c87 --- /dev/null +++ b/skills/rules/config-secrets.md @@ -0,0 +1,72 @@ +--- +title: Use Environment Variable Substitution for All Secrets +impact: CRITICAL +impactDescription: Hardcoded API keys in config.yaml will be committed to version control and exposed in Docker container inspect output +tags: config, security, secrets, api-keys, environment-variables +--- + +## Use Environment Variable Substitution for All Secrets + +Plano supports `$VAR_NAME` substitution in config values. This applies to `access_key` fields, `connection_string` for state storage, and `http_headers` in prompt targets and endpoints. Never hardcode credentials — Plano reads them from environment variables or a `.env` file at startup via `planoai up`. + +**Incorrect (hardcoded secrets):** + +```yaml +version: v0.3.0 + +model_providers: + - model: openai/gpt-4o + access_key: abcdefghijklmnopqrstuvwxyz... 
# Hardcoded — never do this + +state_storage: + type: postgres + connection_string: "postgresql://admin:mysecretpassword@prod-db:5432/plano" + +prompt_targets: + - name: get_data + endpoint: + name: my_api + http_headers: + Authorization: "Bearer abcdefghijklmnopqrstuvwxyz" # Hardcoded token +``` + +**Correct (environment variable substitution):** + +```yaml +version: v0.3.0 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + +state_storage: + type: postgres + connection_string: "postgresql://${DB_USER}:${DB_PASS}@${DB_HOST}:5432/${DB_NAME}" + +prompt_targets: + - name: get_data + endpoint: + name: my_api + http_headers: + Authorization: "Bearer $MY_API_TOKEN" +``` + +**`.env` file pattern (loaded automatically by `planoai up`):** + +```bash +# .env — add to .gitignore +OPENAI_API_KEY=sk-proj-... +ANTHROPIC_API_KEY=sk-ant-... +DB_USER=plano +DB_PASS=secure-password +DB_HOST=localhost +MY_API_TOKEN=tok_live_... +``` + +Plano also accepts keys set directly in the shell environment. Variables referenced in config but not found at startup cause `planoai up` to fail with a clear error listing the missing keys. + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/config-version.md b/skills/rules/config-version.md new file mode 100644 index 00000000..768d7b04 --- /dev/null +++ b/skills/rules/config-version.md @@ -0,0 +1,44 @@ +--- +title: Always Specify a Supported Config Version +impact: CRITICAL +impactDescription: Plano rejects configs with missing or unsupported version fields — the version field gates all other validation +tags: config, versioning, validation +--- + +## Always Specify a Supported Config Version + +Every Plano `config.yaml` must include a `version` field at the top level. 
Plano validates configs against a versioned JSON schema — an unrecognized or missing version will cause `planoai up` to fail immediately with a schema validation error before the container starts. + +**Incorrect (missing or invalid version):** + +```yaml +# No version field — fails schema validation +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY +``` + +**Correct (explicit supported version):** + +```yaml +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true +``` + +Use the latest supported version unless you are targeting a specific deployed Plano image. Current supported versions: `v0.1`, `v0.1.0`, `0.1-beta`, `v0.2.0`, `v0.3.0`. Prefer `v0.3.0` for all new projects. + +Reference: https://github.com/katanemo/archgw/blob/main/config/plano_config_schema.yaml diff --git a/skills/rules/deploy-docker.md b/skills/rules/deploy-docker.md new file mode 100644 index 00000000..ecc23586 --- /dev/null +++ b/skills/rules/deploy-docker.md @@ -0,0 +1,80 @@ +--- +title: Understand Plano's Docker Network Topology for Agent URL Configuration +impact: HIGH +impactDescription: Using `localhost` for agent URLs inside Docker always fails — Plano runs in a container and cannot reach host services via localhost +tags: deployment, docker, networking, agents, urls +--- + +## Understand Plano's Docker Network Topology for Agent URL Configuration + +Plano runs inside a Docker container managed by `planoai up`. Services running on your host machine (agent servers, filter servers, databases) are not accessible as `localhost` from inside the container. Use Docker's special hostname `host.docker.internal` to reach host services. 
+ +**Docker network rules:** +- `localhost` / `127.0.0.1` inside the container → Plano's own container (not your host) +- `host.docker.internal` → Your host machine's loopback interface +- Container name or `docker network` hostname → Other Docker containers +- External domain / IP → Reachable if Docker has network access + +**Incorrect (using localhost — agent unreachable from inside container):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://localhost:8001 # Wrong: this is Plano's own container + + - id: flight_agent + url: http://127.0.0.1:8002 # Wrong: same issue + +filters: + - id: input_guards + url: http://localhost:10500 # Wrong: filter server unreachable +``` + +**Correct (using host.docker.internal for host-side services):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 # Correct: reaches host port 8001 + + - id: flight_agent + url: http://host.docker.internal:8002 # Correct: reaches host port 8002 + +filters: + - id: input_guards + url: http://host.docker.internal:10500 # Correct: reaches filter server on host + +endpoints: + internal_api: + endpoint: host.docker.internal # Correct for internal API on host + protocol: http +``` + +**Production deployment patterns:** + +```yaml +# Kubernetes / Docker Compose — use service names +agents: + - id: weather_agent + url: http://weather-service:8001 # Kubernetes service DNS + +# External cloud services — use full domain +agents: + - id: cloud_agent + url: https://my-agent.us-east-1.amazonaws.com/v1 + +# Custom TLS (self-signed or internal CA) +overrides: + upstream_tls_ca_path: /etc/ssl/certs/internal-ca.pem +``` + +**Ports exposed by Plano's container:** +- All `port` values from your `listeners` blocks are automatically mapped +- `9901` — Envoy admin interface (for advanced debugging) +- `12001` — Plano internal management API + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/deploy-health.md 
b/skills/rules/deploy-health.md new file mode 100644 index 00000000..8e948ee4 --- /dev/null +++ b/skills/rules/deploy-health.md @@ -0,0 +1,90 @@ +--- +title: Verify Listener Health Before Sending Requests +impact: MEDIUM +impactDescription: Sending requests to Plano before listeners are healthy results in connection refused errors that look like application bugs — always confirm health before testing +tags: deployment, health-checks, readiness, debugging +--- + +## Verify Listener Health Before Sending Requests + +Each Plano listener exposes a `/healthz` HTTP endpoint. `planoai up` automatically health-checks all listeners during startup (120s timeout), but in CI/CD pipelines, custom scripts, or when troubleshooting, you may need to check health manually. + +**Health check endpoints:** + +```bash +# Check model listener health (port from your config) +curl -f http://localhost:12000/healthz +# Returns 200 OK when healthy + +# Check prompt listener +curl -f http://localhost:10000/healthz + +# Check agent listener +curl -f http://localhost:8000/healthz +``` + +**Polling health in scripts (CI/CD pattern):** + +```bash +#!/bin/bash +# wait-for-plano.sh + +LISTENER_PORT=${1:-12000} +MAX_WAIT=120 +INTERVAL=2 +elapsed=0 + +echo "Waiting for Plano listener on port $LISTENER_PORT..." 
+ +until curl -sf "http://localhost:$LISTENER_PORT/healthz" > /dev/null; do + if [ $elapsed -ge $MAX_WAIT ]; then + echo "ERROR: Plano listener not healthy after ${MAX_WAIT}s" + planoai logs --debug + exit 1 + fi + sleep $INTERVAL + elapsed=$((elapsed + INTERVAL)) +done + +echo "Plano listener healthy after ${elapsed}s" +``` + +**Docker Compose health check:** + +```yaml +# docker-compose.yml for services that depend on Plano +services: + plano: + image: katanemo/plano:latest + # Plano is managed by planoai, not directly via compose in most setups + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:12000/healthz"] + interval: 5s + timeout: 3s + retries: 24 + start_period: 10s + + my-agent: + image: my-agent:latest + depends_on: + plano: + condition: service_healthy +``` + +**Debug unhealthy listeners:** + +```bash +# See startup logs +planoai logs --debug + +# Check if port is already in use +lsof -i :12000 + +# Check container status +docker ps -a --filter name=plano + +# Restart from scratch +planoai down && planoai up config.yaml --foreground +``` + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/deploy-state.md b/skills/rules/deploy-state.md new file mode 100644 index 00000000..03ce1f3d --- /dev/null +++ b/skills/rules/deploy-state.md @@ -0,0 +1,85 @@ +--- +title: Use PostgreSQL State Storage for Multi-Turn Conversations in Production +impact: HIGH +impactDescription: The default in-memory state storage loses all conversation history when the container restarts — production multi-turn agents require persistent PostgreSQL storage +tags: deployment, state, postgres, memory, multi-turn, production +--- + +## Use PostgreSQL State Storage for Multi-Turn Conversations in Production + +`state_storage` enables Plano to maintain conversation context across requests. Without it, each request is stateless. The `memory` type works for development and testing — all state is lost on container restart. 
Use `postgres` for any production deployment where conversation continuity matters. + +**Incorrect (memory storage in production):** + +```yaml +version: v0.3.0 + +# Memory storage — all conversations lost on planoai down / container restart +state_storage: + type: memory + +listeners: + - type: agent + name: customer_support + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: support_agent + description: Customer support assistant with conversation history. +``` + +**Correct (PostgreSQL for production persistence):** + +```yaml +version: v0.3.0 + +state_storage: + type: postgres + connection_string: "postgresql://${DB_USER}:${DB_PASS}@${DB_HOST}:5432/${DB_NAME}" + +listeners: + - type: agent + name: customer_support + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: support_agent + description: Customer support assistant with access to full conversation history. + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true +``` + +**Setting up PostgreSQL for local development:** + +```bash +# Start PostgreSQL with Docker +docker run -d \ + --name plano-postgres \ + -e POSTGRES_USER=plano \ + -e POSTGRES_PASSWORD=devpassword \ + -e POSTGRES_DB=plano \ + -p 5432:5432 \ + postgres:16 + +# Set environment variables +export DB_USER=plano +export DB_PASS=devpassword +export DB_HOST=host.docker.internal # Use host.docker.internal from inside Plano container +export DB_NAME=plano +``` + +**Production `.env` pattern:** + +```bash +DB_USER=plano_prod +DB_PASS= +DB_HOST=your-rds-endpoint.amazonaws.com +DB_NAME=plano +``` + +Plano automatically creates its state tables on first startup. The `connection_string` supports all standard PostgreSQL connection parameters including SSL: `postgresql://user:pass@host:5432/db?sslmode=require`. 
+ +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/filter-guardrails.md b/skills/rules/filter-guardrails.md new file mode 100644 index 00000000..d60bea65 --- /dev/null +++ b/skills/rules/filter-guardrails.md @@ -0,0 +1,81 @@ +--- +title: Configure Prompt Guards with Actionable Rejection Messages +impact: MEDIUM +impactDescription: A generic or empty rejection message leaves users confused about why their request was blocked and unable to rephrase appropriately +tags: filter, guardrails, jailbreak, security, ux +--- + +## Configure Prompt Guards with Actionable Rejection Messages + +Plano has built-in `prompt_guards` for detecting jailbreak attempts. When triggered, Plano returns the `on_exception.message` instead of forwarding the request. Write messages that explain the restriction and suggest what the user can do instead — both for user experience and to reduce support burden. + +**Incorrect (no message configured — returns a generic error):** + +```yaml +version: v0.3.0 + +prompt_guards: + input_guards: + jailbreak: + on_exception: {} # Empty — returns unhelpful generic error +``` + +**Incorrect (cryptic technical message):** + +```yaml +prompt_guards: + input_guards: + jailbreak: + on_exception: + message: "Error code 403: guard triggered" # Unhelpful to the user +``` + +**Correct (clear, actionable, brand-appropriate message):** + +```yaml +version: v0.3.0 + +prompt_guards: + input_guards: + jailbreak: + on_exception: + message: > + I'm not able to help with that request. This assistant is designed + to help with [your use case, e.g., customer support, coding questions]. + Please rephrase your question or contact support@yourdomain.com + if you believe this is an error. +``` + +**Combining prompt_guards with MCP filter guardrails:** + +```yaml +# Built-in jailbreak detection (fast, no external service needed) +prompt_guards: + input_guards: + jailbreak: + on_exception: + message: "This request cannot be processed. 
Please ask about our products and services." + +# MCP-based custom guards for additional policy enforcement +filters: + - id: topic_restriction + url: http://host.docker.internal:10500 + type: mcp + transport: streamable-http + tool: topic_restriction # Custom filter for domain-specific restrictions + +listeners: + - type: agent + name: customer_support + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: support_agent + description: Customer support assistant for product questions and order issues. + filter_chain: + - topic_restriction # Additional custom topic filtering +``` + +`prompt_guards` applies globally to all listeners. Use `filter_chain` on individual agents for per-agent policies. + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/filter-mcp.md b/skills/rules/filter-mcp.md new file mode 100644 index 00000000..c2d02efd --- /dev/null +++ b/skills/rules/filter-mcp.md @@ -0,0 +1,59 @@ +--- +title: Configure MCP Filters with Explicit Type and Transport +impact: MEDIUM +impactDescription: Omitting type and transport fields relies on defaults that may not match your MCP server's protocol implementation +tags: filter, mcp, integration, configuration +--- + +## Configure MCP Filters with Explicit Type and Transport + +Plano filters integrate with external services via MCP (Model Context Protocol) or plain HTTP. MCP filters call a specific tool on a remote MCP server. Always specify `type`, `transport`, and optionally `tool` (defaults to the filter `id`) to ensure Plano connects correctly to your filter implementation. 
+ +**Incorrect (minimal filter definition relying on all defaults):** + +```yaml +filters: + - id: my_guard # Plano infers type=mcp, transport=streamable-http, tool=my_guard + url: http://localhost:10500 + # If your MCP server uses a different tool name or transport, this silently misroutes +``` + +**Correct (explicit configuration for each filter):** + +```yaml +version: v0.3.0 + +filters: + - id: input_guards + url: http://host.docker.internal:10500 + type: mcp # Explicitly MCP protocol + transport: streamable-http # Streamable HTTP transport + tool: input_guards # MCP tool name (matches MCP server registration) + + - id: query_rewriter + url: http://host.docker.internal:10501 + type: mcp + transport: streamable-http + tool: rewrite_query # Tool name differs from filter ID — explicit is safer + + - id: custom_validator + url: http://host.docker.internal:10503 + type: http # Plain HTTP filter (not MCP) + # No tool field for HTTP filters +``` + +**MCP filter implementation contract:** +Your MCP server must expose a tool matching the `tool` name. 
The tool receives the request payload and must return either: +- A modified request (to pass through with changes) +- A rejection response (to short-circuit the pipeline) + +**HTTP filter alternative** — use `type: http` for simpler request/response interceptors that don't need the MCP protocol: + +```yaml +filters: + - id: auth_validator + url: http://host.docker.internal:9000/validate + type: http # Plano POSTs the request, expects the modified request back +``` + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/filter-ordering.md b/skills/rules/filter-ordering.md new file mode 100644 index 00000000..ad2d0d7b --- /dev/null +++ b/skills/rules/filter-ordering.md @@ -0,0 +1,78 @@ +--- +title: Order Filter Chains with Guards First, Enrichment Last +impact: HIGH +impactDescription: Running context builders before input guards means jailbreak attempts get RAG-enriched context before being blocked — wasting compute and risking data exposure +tags: filter, guardrails, security, pipeline, ordering +--- + +## Order Filter Chains with Guards First, Enrichment Last + +A `filter_chain` is an ordered list of filter IDs applied sequentially to each request. The order is semantically meaningful: each filter receives the output of the previous one. Safety and validation filters must run first to short-circuit bad requests before expensive enrichment filters process them. + +**Recommended filter chain order:** + +1. **Input guards** — jailbreak detection, PII detection, topic restrictions (reject early) +2. **Query rewriting** — normalize or enhance the user query +3. **Context building** — RAG retrieval, tool lookup, knowledge injection (expensive) +4. 
**Output guards** — validate or sanitize LLM response before returning + +**Incorrect (context built before guards — wasteful and potentially unsafe):** + +```yaml +filters: + - id: context_builder + url: http://host.docker.internal:10502 # Runs expensive RAG retrieval first + - id: query_rewriter + url: http://host.docker.internal:10501 + - id: input_guards + url: http://host.docker.internal:10500 # Guards run last — jailbreak gets context + +listeners: + - type: agent + name: rag_orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: rag_agent + filter_chain: + - context_builder # Wrong: expensive enrichment before safety check + - query_rewriter + - input_guards +``` + +**Correct (guards block bad requests before any enrichment):** + +```yaml +version: v0.3.0 + +filters: + - id: input_guards + url: http://host.docker.internal:10500 + type: mcp + transport: streamable-http + - id: query_rewriter + url: http://host.docker.internal:10501 + type: mcp + transport: streamable-http + - id: context_builder + url: http://host.docker.internal:10502 + type: mcp + transport: streamable-http + +listeners: + - type: agent + name: rag_orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: rag_agent + description: Answers questions using internal knowledge base documents. + filter_chain: + - input_guards # 1. Block jailbreaks and policy violations + - query_rewriter # 2. Normalize the safe query + - context_builder # 3. Retrieve relevant context for the clean query +``` + +Different agents within the same listener can have different filter chains — a public-facing agent may need all guards while an internal admin agent may skip them. 
+ +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/observe-span-attributes.md b/skills/rules/observe-span-attributes.md new file mode 100644 index 00000000..a90b3006 --- /dev/null +++ b/skills/rules/observe-span-attributes.md @@ -0,0 +1,80 @@ +--- +title: Add Custom Span Attributes for Correlation and Filtering +impact: MEDIUM +impactDescription: Without custom span attributes, traces cannot be filtered by user, session, or environment — making production debugging significantly harder +tags: observability, tracing, span-attributes, correlation +--- + +## Add Custom Span Attributes for Correlation and Filtering + +Plano can automatically extract HTTP request headers and attach them as span attributes, plus attach static key-value pairs to every span. This enables filtering traces by user, session, tenant, environment, or any other dimension that matters to your application. + +**Incorrect (no span attributes — traces are unfiltered blobs):** + +```yaml +tracing: + random_sampling: 20 + # No span_attributes — cannot filter by user, session, or environment +``` + +**Correct (rich span attributes for production correlation):** + +```yaml +version: v0.3.0 + +tracing: + random_sampling: 20 + trace_arch_internal: true + + span_attributes: + # Match all headers with this prefix, then map to span attributes by: + # 1) stripping the prefix and 2) converting hyphens to dots + header_prefixes: + - x-katanemo- + + # Static attributes added to every span from this Plano instance + static: + environment: production + service.name: plano-gateway + deployment.region: us-east-1 + service.version: "2.1.0" + team: platform-engineering +``` + +**Sending correlation headers from client code:** + +```python +import httpx + +response = httpx.post( + "http://localhost:12000/v1/chat/completions", + headers={ + "x-katanemo-request-id": "req_abc123", + "x-katanemo-user-id": "usr_12", + "x-katanemo-session-id": "sess_xyz456", + "x-katanemo-tenant-id": "acme-corp", + }, 
+ json={"model": "plano.v1", "messages": [...]} +) +``` + +**Querying by custom attribute:** + +```bash +# Find all requests from a specific user +planoai trace --where user.id=usr_12 + +# Find all traces from production environment +planoai trace --where environment=production + +# Find traces from a specific tenant +planoai trace --where tenant.id=acme-corp +``` + +Header prefix matching is a prefix match. With `x-katanemo-`, these mappings apply: + +- `x-katanemo-user-id` -> `user.id` +- `x-katanemo-tenant-id` -> `tenant.id` +- `x-katanemo-request-id` -> `request.id` + +Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw) diff --git a/skills/rules/observe-trace-query.md b/skills/rules/observe-trace-query.md new file mode 100644 index 00000000..a7ef7db7 --- /dev/null +++ b/skills/rules/observe-trace-query.md @@ -0,0 +1,85 @@ +--- +title: Use `planoai trace` to Inspect Routing Decisions +impact: MEDIUM-HIGH +impactDescription: The trace CLI lets you verify which model was selected, why, and how long each step took — without setting up a full OTEL backend +tags: observability, tracing, cli, debugging, routing +--- + +## Use `planoai trace` to Inspect Routing Decisions + +`planoai trace` provides a built-in trace viewer backed by an in-memory OTEL collector. Use it to inspect routing decisions, verify preference matching, measure filter latency, and debug failed requests — all from the CLI without configuring Jaeger, Zipkin, or another backend. + +**Workflow: start collector, run requests, then inspect traces:** + +```bash +# 1. Start Plano with the built-in trace collector (recommended) +planoai up config.yaml --with-tracing + +# 2. Send test requests through Plano +curl http://localhost:12000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "plano.v1", "messages": [{"role": "user", "content": "Write a Python function to sort a list"}]}' + +# 3. 
Show the latest trace +planoai trace +``` + +You can also run the trace listener directly: + +```bash +planoai trace listen # available on a process ID running OTEL collector +``` + +Stop the background trace listener: + +```bash +planoai trace down +``` + +**Useful trace viewer patterns:** + +```bash +# Show latest trace (default target is "last") +planoai trace + +# List available trace IDs +planoai trace --list + +# Show all traces +planoai trace any + +# Show a specific trace (short 8-char or full 32-char ID) +planoai trace 7f4e9a1c +planoai trace 7f4e9a1c0d9d4a0bb9bf5a8a7d13f62a + +# Filter by specific span attributes (AND semantics for repeated --where) +planoai trace any --where llm.model=gpt-4o-mini + +# Filter by user ID (if header prefix is x-katanemo-, x-katanemo-user-id maps to user.id) +planoai trace any --where user.id=user_123 + +# Limit results for a quick sanity check +planoai trace any --limit 5 + +# Time window filter +planoai trace any --since 30m + +# Filter displayed attributes by key pattern +planoai trace any --filter "http.*" + +# Output machine-readable JSON +planoai trace any --json +``` + +**What to look for in traces:** + + +| Span name | What it tells you | +| ------------------- | ------------------------------------------------------------- | +| `plano.routing` | Which routing preference matched and which model was selected | +| `plano.filter.` | How long each filter in the chain took | +| `plano.llm.request` | Time to first token and full response time | +| `plano.agent.route` | Which agent description matched for agent listeners | + + +Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw) diff --git a/skills/rules/observe-tracing.md b/skills/rules/observe-tracing.md new file mode 100644 index 00000000..93b9c003 --- /dev/null +++ b/skills/rules/observe-tracing.md @@ -0,0 +1,80 @@ +--- +title: Enable Tracing with Appropriate Sampling for Your Environment +impact: HIGH +impactDescription: Without tracing 
enabled, debugging routing decisions, latency issues, and model selection is guesswork — traces are the primary observability primitive in Plano +tags: observability, tracing, opentelemetry, otel, debugging +--- + +## Enable Tracing with Appropriate Sampling for Your Environment + +Plano emits OpenTelemetry (OTEL) traces for every request, capturing routing decisions, LLM provider selection, filter chain execution, and response latency. Traces are the best tool for understanding why a request was routed to a particular model and debugging unexpected behavior. + +**Incorrect (no tracing configured — flying blind in production):** + +```yaml +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + +# No tracing block — no visibility into routing, latency, or errors +``` + +**Correct (tracing enabled with environment-appropriate sampling):** + +```yaml +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + +tracing: + random_sampling: 100 # 100% for development/debugging + trace_arch_internal: true # Include Plano's internal routing spans +``` + +**Production configuration (sampled to control volume):** + +```yaml +tracing: + random_sampling: 10 # Sample 10% of requests in production + trace_arch_internal: false # Skip internal spans to reduce noise + span_attributes: + header_prefixes: + - x-katanemo- # Match all x-katanemo-* headers + static: + environment: production + service.name: my-plano-service + version: "1.0.0" +``` + +With `x-katanemo-` configured, Plano maps headers to attributes by stripping the prefix and converting hyphens to dots: + +- `x-katanemo-user-id` -> `user.id` +- `x-katanemo-session-id` -> `session.id` +- `x-katanemo-request-id` -> `request.id` + +**Starting the trace collector:** + +```bash +# Start Plano 
with built-in OTEL collector +planoai up config.yaml --with-tracing +``` + +Sampling rates: 100% for dev/staging, 5–20% for high-traffic production, 100% for low-traffic production. `trace_arch_internal: true` adds spans showing which routing preference matched — essential for debugging preference configuration. + +Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw) diff --git a/skills/rules/routing-aliases.md b/skills/rules/routing-aliases.md new file mode 100644 index 00000000..91f0b31a --- /dev/null +++ b/skills/rules/routing-aliases.md @@ -0,0 +1,77 @@ +--- +title: Use Model Aliases for Semantic, Stable Model References +impact: MEDIUM +impactDescription: Hardcoded model names in client code require code changes when you swap providers; aliases let you update routing in config.yaml alone +tags: routing, model-aliases, maintainability, client-integration +--- + +## Use Model Aliases for Semantic, Stable Model References + +`model_aliases` map human-readable names to specific model identifiers. Client applications reference the alias, not the underlying model. When you want to upgrade from `gpt-4o` to a new model, you change one line in `config.yaml` — not every client calling the API. + +**Incorrect (clients hardcode specific model names):** + +```yaml +# config.yaml — no aliases defined +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true +``` + +```python +# Client code — brittle, must be updated when model changes +client.chat.completions.create(model="gpt-4o", ...) 
+``` + +**Correct (semantic aliases, stable client contracts):** + +```yaml +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + +model_aliases: + plano.fast.v1: + target: gpt-4o-mini # Cheap, fast — for high-volume tasks + + plano.smart.v1: + target: gpt-4o # High capability — for complex reasoning + + plano.creative.v1: + target: claude-sonnet-4-20250514 # Strong creative writing and analysis + + plano.v1: + target: gpt-4o # Default production alias +``` + +```python +# Client code — stable, alias is the contract +client.chat.completions.create(model="plano.smart.v1", ...) +``` + +**Alias naming conventions:** +- `..` — e.g., `plano.fast.v1`, `acme.code.v2` +- Bumping `.v2` → `.v3` lets you run old and new aliases simultaneously during rollouts +- `plano.v1` as a canonical default gives clients a single stable entry point + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/routing-default.md b/skills/rules/routing-default.md new file mode 100644 index 00000000..f23e7357 --- /dev/null +++ b/skills/rules/routing-default.md @@ -0,0 +1,70 @@ +--- +title: Always Set Exactly One Default Model Provider +impact: HIGH +impactDescription: Without a default provider, Plano has no fallback when routing preferences do not match — requests with unclassified intent will fail +tags: routing, defaults, model-providers, reliability +--- + +## Always Set Exactly One Default Model Provider + +When a request does not match any routing preference, Plano forwards it to the `default: true` provider. Without a default, unmatched requests fail. If multiple providers are marked `default: true`, Plano uses the first one — which can produce unexpected behavior. 
+ +**Incorrect (no default provider set):** + +```yaml +version: v0.3.0 + +model_providers: + - model: openai/gpt-4o-mini # No default: true anywhere + access_key: $OPENAI_API_KEY + routing_preferences: + - name: summarization + description: Summarizing documents and extracting key points + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: code_generation + description: Writing new functions and implementing algorithms +``` + +**Incorrect (multiple defaults — ambiguous):** + +```yaml +model_providers: + - model: openai/gpt-4o-mini + default: true # First default + access_key: $OPENAI_API_KEY + + - model: openai/gpt-4o + default: true # Second default — confusing + access_key: $OPENAI_API_KEY +``` + +**Correct (exactly one default, covering unmatched requests):** + +```yaml +version: v0.3.0 + +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true # Handles general/unclassified requests + routing_preferences: + - name: summarization + description: Summarizing documents, articles, and meeting notes + - name: classification + description: Categorizing inputs, labeling, and intent detection + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: code_generation + description: Writing, debugging, and reviewing code + - name: complex_reasoning + description: Multi-step math, logical analysis, research synthesis +``` + +Choose your most cost-effective capable model as the default — it handles all traffic that doesn't match specialized preferences. 
+ +Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw) diff --git a/skills/rules/routing-passthrough.md b/skills/rules/routing-passthrough.md new file mode 100644 index 00000000..ff9fbaf9 --- /dev/null +++ b/skills/rules/routing-passthrough.md @@ -0,0 +1,69 @@ +--- +title: Use Passthrough Auth for Proxy and Multi-Tenant Setups +impact: MEDIUM +impactDescription: Without passthrough auth, self-hosted proxy services (LiteLLM, vLLM, etc.) reject Plano's requests because the wrong Authorization header is sent +tags: routing, authentication, proxy, litellm, multi-tenant +--- + +## Use Passthrough Auth for Proxy and Multi-Tenant Setups + +When routing to a self-hosted LLM proxy (LiteLLM, vLLM, OpenRouter, Azure APIM) or in multi-tenant setups where clients supply their own keys, set `passthrough_auth: true`. This forwards the client's `Authorization` header rather than Plano's configured `access_key`. Combine with a `base_url` pointing to the proxy. + +**Incorrect (Plano sends its own key to a proxy that expects the client's key):** + +```yaml +model_providers: + - model: custom/proxy + base_url: http://host.docker.internal:8000 + access_key: $SOME_KEY # Plano overwrites the client's auth — proxy rejects it +``` + +**Correct (forward client Authorization header to the proxy):** + +```yaml +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: custom/litellm-proxy + base_url: http://host.docker.internal:4000 # LiteLLM server + provider_interface: openai # LiteLLM uses OpenAI format + passthrough_auth: true # Forward client's Bearer token + default: true +``` + +**Multi-tenant pattern (client supplies their own API key):** + +```yaml +model_providers: + # Plano acts as a passthrough gateway; each client has their own OpenAI key + - model: openai/gpt-4o + passthrough_auth: true # No access_key here — client's key is forwarded + default: true +``` + +**Combined: proxy for some models, 
Plano-managed for others:** + +```yaml +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY # Plano manages this key + default: true + routing_preferences: + - name: quick tasks + description: Short answers, simple lookups, fast completions + + - model: custom/vllm-llama + base_url: http://gpu-server:8000 + provider_interface: openai + passthrough_auth: true # vLLM cluster handles its own auth + routing_preferences: + - name: long context + description: Processing very long documents, multi-document analysis +``` + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/routing-preferences.md b/skills/rules/routing-preferences.md new file mode 100644 index 00000000..571a3acd --- /dev/null +++ b/skills/rules/routing-preferences.md @@ -0,0 +1,73 @@ +--- +title: Write Task-Specific Routing Preference Descriptions +impact: HIGH +impactDescription: Vague preference descriptions cause Plano's internal router LLM to misclassify requests, routing expensive tasks to cheap models and vice versa +tags: routing, model-selection, preferences, llm-routing +--- + +## Write Task-Specific Routing Preference Descriptions + +Plano's `plano_orchestrator_v1` router uses a 1.5B preference-aligned LLM to classify incoming requests against your `routing_preferences` descriptions. It routes the request to the first provider whose preferences match. Description quality directly determines routing accuracy. + +**Incorrect (vague, overlapping descriptions):** + +```yaml +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + routing_preferences: + - name: simple + description: easy tasks # Too vague — what is "easy"? 
+ + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: hard + description: hard tasks # Too vague — overlaps with "easy" +``` + +**Correct (specific, distinct task descriptions):** + +```yaml +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + routing_preferences: + - name: summarization + description: > + Summarizing documents, articles, emails, or meeting transcripts. + Extracting key points, generating TL;DR sections, condensing long text. + - name: classification + description: > + Categorizing inputs, sentiment analysis, spam detection, + intent classification, labeling structured data fields. + - name: translation + description: > + Translating text between languages, localization tasks. + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: code_generation + description: > + Writing new functions, classes, or modules from scratch. + Implementing algorithms, boilerplate generation, API integrations. + - name: code_review + description: > + Reviewing code for bugs, security vulnerabilities, performance issues. + Suggesting refactors, explaining complex code, debugging errors. + - name: complex_reasoning + description: > + Multi-step math problems, logical deduction, strategic planning, + research synthesis requiring chain-of-thought reasoning. 
+``` + +**Key principles for good preference descriptions:** +- Use concrete action verbs: "writing", "reviewing", "translating", "summarizing" +- List 3–5 specific sub-tasks or synonyms for each preference +- Ensure preferences across providers are mutually exclusive in scope +- Test with representative queries using `planoai trace` and `--where` filters to verify routing decisions + +Reference: https://github.com/katanemo/archgw diff --git a/skills/src/build.ts b/skills/src/build.ts new file mode 100644 index 00000000..5d4640f1 --- /dev/null +++ b/skills/src/build.ts @@ -0,0 +1,262 @@ +#!/usr/bin/env node + +import { readFileSync, writeFileSync, readdirSync } from "node:fs"; +import { join, dirname } from "node:path"; +import { fileURLToPath } from "node:url"; + +type Section = { + prefix: string; + number: number; + title: string; + description: string; +}; + +type Rule = { + file: string; + title: string; + impact: string; + impactDescription: string; + tags: string[]; + body: string; + section: Section; +}; + +type ParsedFrontmatter = { + frontmatter: Record; + body: string; +}; + +type Metadata = { + abstract: string; + version: string; + organization: string; +}; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const RULES_DIR = join(__dirname, "..", "rules"); +const OUTPUT_FILE = join(__dirname, "..", "AGENTS.md"); +const METADATA_FILE = join(__dirname, "..", "metadata.json"); + +const SECTIONS: Section[] = [ + { + prefix: "config-", + number: 1, + title: "Configuration Fundamentals", + description: + "Core config.yaml structure, versioning, listener types, and provider setup — the entry point for every Plano deployment.", + }, + { + prefix: "routing-", + number: 2, + title: "Routing & Model Selection", + description: + "Intelligent LLM routing using preferences, aliases, and defaults to match tasks to the best model.", + }, + { + prefix: "agent-", + number: 3, + title: "Agent Orchestration", + description: + "Multi-agent patterns, agent 
descriptions, and orchestration strategies for building agentic applications.", + }, + { + prefix: "filter-", + number: 4, + title: "Filter Chains & Guardrails", + description: + "Request/response processing pipelines — ordering, MCP integration, and safety guardrails.", + }, + { + prefix: "observe-", + number: 5, + title: "Observability & Debugging", + description: + "OpenTelemetry tracing, log levels, span attributes, and sampling for production visibility.", + }, + { + prefix: "cli-", + number: 6, + title: "CLI Operations", + description: + "Using the planoai CLI for startup, tracing, CLI agents, project init, and code generation.", + }, + { + prefix: "deploy-", + number: 7, + title: "Deployment & Security", + description: + "Docker deployment, environment variable management, health checks, and state storage for production.", + }, + { + prefix: "advanced-", + number: 8, + title: "Advanced Patterns", + description: + "Prompt targets, external API integration, rate limiting, and multi-listener architectures.", + }, +]; + +function parseFrontmatter(content: string): ParsedFrontmatter | null { + const match = content.match(/^---\n([\s\S]*?)\n---\n([\s\S]*)$/); + if (!match) return null; + + const frontmatter: Record = {}; + const lines = match[1].split("\n"); + for (const line of lines) { + const colonIdx = line.indexOf(":"); + if (colonIdx === -1) continue; + const key = line.slice(0, colonIdx).trim(); + const value = line.slice(colonIdx + 1).trim(); + frontmatter[key] = value; + } + + return { + frontmatter, + body: match[2].trim(), + }; +} + +function inferSection(filename: string): Section | null { + for (const section of SECTIONS) { + if (filename.startsWith(section.prefix)) { + return section; + } + } + return null; +} + +function main(): void { + const metadata = JSON.parse(readFileSync(METADATA_FILE, "utf-8")) as Metadata; + + const files = readdirSync(RULES_DIR) + .filter((f) => f.endsWith(".md") && !f.startsWith("_")) + .sort(); + + const sectionRules = 
new Map(); + for (const section of SECTIONS) { + sectionRules.set(section.number, []); + } + + let parseErrors = 0; + + for (const file of files) { + const content = readFileSync(join(RULES_DIR, file), "utf-8"); + const parsed = parseFrontmatter(content); + + if (!parsed) { + console.error(`ERROR: Could not parse frontmatter in ${file}`); + parseErrors++; + continue; + } + + const section = inferSection(file); + if (!section) { + console.warn(`WARN: No section found for ${file} — skipping`); + continue; + } + + const rule: Rule = { + file, + title: parsed.frontmatter.title ?? file, + impact: parsed.frontmatter.impact ?? "MEDIUM", + impactDescription: parsed.frontmatter.impactDescription ?? "", + tags: parsed.frontmatter.tags + ? parsed.frontmatter.tags.split(",").map((t) => t.trim()) + : [], + body: parsed.body, + section, + }; + sectionRules.get(section.number)?.push(rule); + } + + if (parseErrors > 0) { + console.error(`\nBuild failed: ${parseErrors} file(s) had parse errors.`); + process.exit(1); + } + + for (const [, rules] of sectionRules) { + rules.sort((a, b) => a.title.localeCompare(b.title)); + } + + const lines: string[] = []; + lines.push(`# Plano Agent Skills`); + lines.push(``); + lines.push(`> ${metadata.abstract}`); + lines.push(``); + lines.push( + `**Version:** ${metadata.version} | **Organization:** ${metadata.organization}` + ); + lines.push(``); + lines.push(`---`); + lines.push(``); + + lines.push(`## Table of Contents`); + lines.push(``); + for (const section of SECTIONS) { + const rules = sectionRules.get(section.number) ?? 
[]; + if (rules.length === 0) continue; + lines.push( + `- [Section ${section.number}: ${section.title}](#section-${section.number})` + ); + for (let i = 0; i < rules.length; i++) { + const rule = rules[i]; + const id = `${section.number}.${i + 1}`; + const anchor = rule.title + .toLowerCase() + .replace(/[^a-z0-9\s-]/g, "") + .replace(/\s+/g, "-"); + lines.push(` - [${id} ${rule.title}](#${anchor})`); + } + } + lines.push(``); + lines.push(`---`); + lines.push(``); + + for (const section of SECTIONS) { + const rules = sectionRules.get(section.number) ?? []; + if (rules.length === 0) continue; + + lines.push(`## Section ${section.number}: ${section.title}`); + lines.push(``); + lines.push(`*${section.description}*`); + lines.push(``); + + for (let i = 0; i < rules.length; i++) { + const rule = rules[i]; + const id = `${section.number}.${i + 1}`; + + lines.push(`### ${id} ${rule.title}`); + lines.push(``); + lines.push( + `**Impact:** \`${rule.impact}\`${rule.impactDescription ? ` — ${rule.impactDescription}` : ""}` + ); + if (rule.tags.length > 0) { + lines.push(`**Tags:** ${rule.tags.map((t) => `\`${t}\``).join(", ")}`); + } + lines.push(``); + lines.push(rule.body); + lines.push(``); + lines.push(`---`); + lines.push(``); + } + } + + lines.push(`*Generated from individual rule files in \`rules/\`.*`); + lines.push( + `*To contribute, see [CONTRIBUTING](https://github.com/katanemo/archgw/blob/main/CONTRIBUTING.md).*` + ); + + writeFileSync(OUTPUT_FILE, lines.join("\n"), "utf-8"); + + let totalRules = 0; + for (const section of SECTIONS) { + const rules = sectionRules.get(section.number) ?? 
[]; + if (rules.length > 0) { + console.log(` Section ${section.number}: ${rules.length} rules`); + totalRules += rules.length; + } + } + console.log(`\nBuilt AGENTS.md with ${totalRules} rules.`); +} + +main(); diff --git a/skills/src/extract-tests.ts b/skills/src/extract-tests.ts new file mode 100644 index 00000000..b7d03b61 --- /dev/null +++ b/skills/src/extract-tests.ts @@ -0,0 +1,147 @@ +#!/usr/bin/env node + +import { readFileSync, writeFileSync, readdirSync } from "node:fs"; +import { join, dirname } from "node:path"; +import { fileURLToPath } from "node:url"; + +type ParsedFrontmatter = { + frontmatter: Record; + body: string; +}; + +type SectionPrefix = { + prefix: string; + number: number; + title: string; +}; + +type ExampleExtraction = { + incorrect: string | null; + correct: string | null; +}; + +type TestCaseEntry = { + id: string; + section: number; + sectionTitle: string; + title: string; + impact: string; + tags: string[]; + testCase: { + description: string; + input: string | null; + expected: string | null; + evaluationPrompt: string; + }; +}; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const RULES_DIR = join(__dirname, "..", "rules"); +const OUTPUT_FILE = join(__dirname, "..", "test-cases.json"); + +const SECTION_PREFIXES: SectionPrefix[] = [ + { prefix: "config-", number: 1, title: "Configuration Fundamentals" }, + { prefix: "routing-", number: 2, title: "Routing & Model Selection" }, + { prefix: "agent-", number: 3, title: "Agent Orchestration" }, + { prefix: "filter-", number: 4, title: "Filter Chains & Guardrails" }, + { prefix: "observe-", number: 5, title: "Observability & Debugging" }, + { prefix: "cli-", number: 6, title: "CLI Operations" }, + { prefix: "deploy-", number: 7, title: "Deployment & Security" }, + { prefix: "advanced-", number: 8, title: "Advanced Patterns" }, +]; + +function parseFrontmatter(content: string): ParsedFrontmatter | null { + const match = content.match(/^---\n([\s\S]*?)\n---\n([\s\S]*)$/); + 
if (!match) return null; + + const frontmatter: Record = {}; + const lines = match[1].split("\n"); + for (const line of lines) { + const colonIdx = line.indexOf(":"); + if (colonIdx === -1) continue; + const key = line.slice(0, colonIdx).trim(); + const value = line.slice(colonIdx + 1).trim(); + frontmatter[key] = value; + } + + return { frontmatter, body: match[2].trim() }; +} + +function extractCodeBlocks(text: string): string[] { + const blocks: string[] = []; + const regex = /```(?:yaml|bash|python|typescript|json|sh)?\n([\s\S]*?)```/g; + let match: RegExpExecArray | null; + do { + match = regex.exec(text); + if (match) { + blocks.push(match[1].trim()); + } + } while (match !== null); + return blocks; +} + +function extractExamples(body: string): ExampleExtraction { + const incorrectMatch = body.match( + /\*\*Incorrect[^*]*\*\*[:\s]*([\s\S]*?)(?=\*\*Correct|\*\*Key|$)/ + ); + const correctMatch = body.match( + /\*\*Correct[^*]*\*\*[:\s]*([\s\S]*?)(?=\*\*Incorrect|\*\*Key|\*\*Note|Reference:|$)/ + ); + + return { + incorrect: incorrectMatch + ? extractCodeBlocks(incorrectMatch[1]).join("\n\n") + : null, + correct: correctMatch ? 
extractCodeBlocks(correctMatch[1]).join("\n\n") : null, + }; +} + +function inferSection(filename: string): SectionPrefix | null { + for (const s of SECTION_PREFIXES) { + if (filename.startsWith(s.prefix)) return s; + } + return null; +} + +function main(): void { + const files = readdirSync(RULES_DIR) + .filter((f) => f.endsWith(".md") && !f.startsWith("_")) + .sort(); + + const testCases: TestCaseEntry[] = []; + + for (const file of files) { + const content = readFileSync(join(RULES_DIR, file), "utf-8"); + const parsed = parseFrontmatter(content); + if (!parsed) continue; + + const { frontmatter, body } = parsed; + const section = inferSection(file); + if (!section) continue; + + const { incorrect, correct } = extractExamples(body); + if (!incorrect && !correct) continue; + + testCases.push({ + id: file.replace(".md", ""), + section: section.number, + sectionTitle: section.title, + title: frontmatter.title ?? file, + impact: frontmatter.impact ?? "MEDIUM", + tags: frontmatter.tags + ? 
frontmatter.tags.split(",").map((t) => t.trim()) + : [], + testCase: { + description: `Detect and fix: "${frontmatter.title}"`, + input: incorrect, + expected: correct, + evaluationPrompt: `Given the following Plano config or CLI usage, identify if it violates the rule "${frontmatter.title}" and explain how to fix it.`, + }, + }); + } + + writeFileSync(OUTPUT_FILE, JSON.stringify(testCases, null, 2), "utf-8"); + console.log(`Extracted ${testCases.length} test cases to test-cases.json`); +} + +main(); diff --git a/skills/src/validate.ts b/skills/src/validate.ts new file mode 100644 index 00000000..4fdf46ea --- /dev/null +++ b/skills/src/validate.ts @@ -0,0 +1,156 @@ +#!/usr/bin/env node + +import { readFileSync, readdirSync } from "node:fs"; +import { join, dirname } from "node:path"; +import { fileURLToPath } from "node:url"; + +type ParsedFrontmatter = { + frontmatter: Record; + body: string; +}; + +type ValidationResult = { + errors: string[]; + warnings: string[]; +}; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const RULES_DIR = join(__dirname, "..", "rules"); + +const VALID_IMPACTS = [ + "CRITICAL", + "HIGH", + "MEDIUM-HIGH", + "MEDIUM", + "LOW-MEDIUM", + "LOW", +] as const; + +const SECTION_PREFIXES = [ + "config-", + "routing-", + "agent-", + "filter-", + "observe-", + "cli-", + "deploy-", + "advanced-", +]; + +function parseFrontmatter(content: string): ParsedFrontmatter | null { + const match = content.match(/^---\n([\s\S]*?)\n---\n([\s\S]*)$/); + if (!match) return null; + + const frontmatter: Record = {}; + const lines = match[1].split("\n"); + for (const line of lines) { + const colonIdx = line.indexOf(":"); + if (colonIdx === -1) continue; + const key = line.slice(0, colonIdx).trim(); + const value = line.slice(colonIdx + 1).trim(); + frontmatter[key] = value; + } + + return { frontmatter, body: match[2].trim() }; +} + +function validateFile(file: string, content: string): ValidationResult { + const errors: string[] = []; + const 
warnings: string[] = []; + + const parsed = parseFrontmatter(content); + if (!parsed) { + errors.push("Missing or malformed frontmatter (expected --- ... ---)"); + return { errors, warnings }; + } + + const { frontmatter, body } = parsed; + + if (!frontmatter.title) { + errors.push("Missing required frontmatter field: title"); + } + if (!frontmatter.impact) { + errors.push("Missing required frontmatter field: impact"); + } else if (!VALID_IMPACTS.includes(frontmatter.impact as (typeof VALID_IMPACTS)[number])) { + errors.push( + `Invalid impact value: "${frontmatter.impact}". Valid values: ${VALID_IMPACTS.join(", ")}` + ); + } + if (!frontmatter.tags) { + warnings.push("No tags defined — consider adding relevant tags"); + } + + const hasValidPrefix = SECTION_PREFIXES.some((p) => file.startsWith(p)); + if (!hasValidPrefix) { + errors.push( + `Filename must start with a valid prefix: ${SECTION_PREFIXES.join(", ")}` + ); + } + + if (body.length < 100) { + warnings.push("Rule body seems very short — consider adding more detail"); + } + + if (!body.includes("```")) { + warnings.push( + "No code examples found — rules should include YAML or CLI examples" + ); + } + + if (!body.includes("Incorrect") || !body.includes("Correct")) { + warnings.push( + "Consider adding both Incorrect and Correct examples for clarity" + ); + } + + return { errors, warnings }; +} + +function main(): void { + const files = readdirSync(RULES_DIR) + .filter((f) => f.endsWith(".md") && !f.startsWith("_")) + .sort(); + + let totalErrors = 0; + let totalWarnings = 0; + let filesWithIssues = 0; + + console.log(`Validating ${files.length} rule files...\n`); + + for (const file of files) { + const content = readFileSync(join(RULES_DIR, file), "utf-8"); + const { errors, warnings } = validateFile(file, content); + + if (errors.length > 0 || warnings.length > 0) { + filesWithIssues++; + console.log(`📄 ${file}`); + + for (const error of errors) { + console.log(` ❌ ERROR: ${error}`); + totalErrors++; + } + 
for (const warning of warnings) { + console.log(` ⚠️ WARN: ${warning}`); + totalWarnings++; + } + console.log(); + } else { + console.log(`✅ ${file}`); + } + } + + console.log(`\n--- Validation Summary ---`); + console.log(`Files checked: ${files.length}`); + console.log(`Files with issues: ${filesWithIssues}`); + console.log(`Errors: ${totalErrors}`); + console.log(`Warnings: ${totalWarnings}`); + + if (totalErrors > 0) { + console.log(`\nValidation FAILED with ${totalErrors} error(s).`); + process.exit(1); + } else { + console.log(`\nValidation passed.`); + } +} + +main(); diff --git a/skills/test-cases.json b/skills/test-cases.json new file mode 100644 index 00000000..c8bcfe33 --- /dev/null +++ b/skills/test-cases.json @@ -0,0 +1,353 @@ +[ + { + "id": "advanced-prompt-targets", + "section": 8, + "sectionTitle": "Advanced Patterns", + "title": "Design Prompt Targets with Precise Parameter Schemas", + "impact": "HIGH", + "tags": [ + "advanced", + "prompt-targets", + "functions", + "llm", + "api-integration" + ], + "testCase": { + "description": "Detect and fix: \"Design Prompt Targets with Precise Parameter Schemas\"", + "input": "prompt_targets:\n - name: get_flight_info\n description: Get flight information\n parameters:\n - name: flight # What format? \"AA123\"? \"AA 123\"? 
\"American 123\"?\n type: str\n required: true\n endpoint:\n name: flights_api\n path: /flight?id={flight}", + "expected": "version: v0.3.0\n\nendpoints:\n flights_api:\n endpoint: api.flightaware.com\n protocol: https\n connect_timeout: \"5s\"\n\nprompt_targets:\n - name: get_flight_status\n description: >\n Get real-time status, gate information, and delays for a specific flight number.\n Use when the user asks about a flight's current status, arrival time, or gate.\n parameters:\n - name: flight_number\n description: >\n IATA airline code followed by flight number, e.g., \"AA123\", \"UA456\", \"DL789\".\n Extract from user message — do not include spaces.\n type: str\n required: true\n format: \"^[A-Z]{2}[0-9]{1,4}$\" # Regex hint for validation\n\n - name: date\n description: >\n Flight date in YYYY-MM-DD format. Use today's date if not specified.\n type: str\n required: false\n format: date\n\n endpoint:\n name: flights_api\n path: /flights/{flight_number}?date={date}\n http_method: GET\n http_headers:\n Authorization: \"Bearer $FLIGHTAWARE_API_KEY\"\n\n - name: search_flights\n description: >\n Search for available flights between two cities or airports.\n Use when the user wants to find flights, compare options, or book travel.\n parameters:\n - name: origin\n description: Departure airport IATA code (e.g., \"JFK\", \"LAX\", \"ORD\")\n type: str\n required: true\n - name: destination\n description: Arrival airport IATA code (e.g., \"LHR\", \"CDG\", \"NRT\")\n type: str\n required: true\n - name: departure_date\n description: Departure date in YYYY-MM-DD format\n type: str\n required: true\n format: date\n - name: cabin_class\n description: Preferred cabin class\n type: str\n required: false\n default: economy\n enum: [economy, premium_economy, business, first]\n - name: passengers\n description: Number of adult passengers (1-9)\n type: int\n required: false\n default: 1\n\n endpoint:\n name: flights_api\n path: 
/search?from={origin}&to={destination}&date={departure_date}&class={cabin_class}&pax={passengers}\n http_method: GET\n http_headers:\n Authorization: \"Bearer $FLIGHTAWARE_API_KEY\"\n\n system_prompt: |\n You are a travel assistant. Present flight search results clearly,\n highlighting the best value options. Include price, duration, and\n number of stops for each option.\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true\n\nlisteners:\n - type: prompt\n name: travel_functions\n port: 10000\n timeout: \"30s\"", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Design Prompt Targets with Precise Parameter Schemas\" and explain how to fix it." + } + }, + { + "id": "agent-descriptions", + "section": 3, + "sectionTitle": "Agent Orchestration", + "title": "Write Capability-Focused Agent Descriptions for Accurate Routing", + "impact": "HIGH", + "tags": [ + "agent", + "orchestration", + "descriptions", + "routing", + "multi-agent" + ], + "testCase": { + "description": "Detect and fix: \"Write Capability-Focused Agent Descriptions for Accurate Routing\"", + "input": "listeners:\n - type: agent\n name: orchestrator\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: agent_1\n description: Helps users with information # Too generic — matches everything\n\n - id: agent_2\n description: Also helps users # Indistinguishable from agent_1", + "expected": "version: v0.3.0\n\nagents:\n - id: weather_agent\n url: http://host.docker.internal:8001\n - id: flight_agent\n url: http://host.docker.internal:8002\n - id: hotel_agent\n url: http://host.docker.internal:8003\n\nlisteners:\n - type: agent\n name: travel_orchestrator\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: weather_agent\n description: >\n Provides real-time weather conditions and multi-day forecasts for any city\n worldwide. 
Handles questions about temperature, precipitation, wind, humidity,\n sunrise/sunset times, and severe weather alerts. Examples: \"What's the weather\n in Tokyo?\", \"Will it rain in London this weekend?\", \"Sunrise time in New York.\"\n\n - id: flight_agent\n description: >\n Provides live flight status, schedules, gate information, delays, and\n aircraft details for any flight number or route between airports.\n Handles questions about departures, arrivals, and airline information.\n Examples: \"Is AA123 on time?\", \"Flights from JFK to LAX tomorrow.\"\n\n - id: hotel_agent\n description: >\n Searches and books hotel accommodations, compares room types, pricing,\n and availability. Handles check-in/check-out dates, amenities, and\n cancellation policies. Examples: \"Hotels near Times Square for next Friday.\"", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Write Capability-Focused Agent Descriptions for Accurate Routing\" and explain how to fix it." 
+ } + }, + { + "id": "agent-orchestration", + "section": 3, + "sectionTitle": "Agent Orchestration", + "title": "Register All Sub-Agents in Both `agents` and `listeners.agents`", + "impact": "CRITICAL", + "tags": [ + "agent", + "orchestration", + "config", + "multi-agent" + ], + "testCase": { + "description": "Detect and fix: \"Register All Sub-Agents in Both `agents` and `listeners.agents`\"", + "input": "version: v0.3.0\n\nagents:\n - id: weather_agent\n url: http://host.docker.internal:8001\n - id: news_agent # Defined but never referenced in any listener\n url: http://host.docker.internal:8002\n\nlisteners:\n - type: agent\n name: orchestrator\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: weather_agent\n description: Provides weather forecasts and current conditions.\n # news_agent is missing here — the orchestrator cannot route to it\n\nagents:\n - id: weather_agent\n url: http://host.docker.internal:8001\n\nlisteners:\n - type: agent\n name: orchestrator\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: weather_agent\n description: Provides weather forecasts.\n - id: flights_agent # ID not in global agents[] — startup error\n description: Provides flight status information.", + "expected": "version: v0.3.0\n\nagents:\n - id: weather_agent\n url: http://host.docker.internal:8001\n - id: flights_agent\n url: http://host.docker.internal:8002\n - id: hotels_agent\n url: http://host.docker.internal:8003\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true\n\nlisteners:\n - type: agent\n name: travel_orchestrator\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: weather_agent\n description: Real-time weather, forecasts, and climate data for any city.\n - id: flights_agent\n description: Live flight status, schedules, gates, and delays.\n - id: hotels_agent\n description: Hotel search, availability, pricing, and booking.\n default: true # Fallback if no other agent matches", + 
"evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Register All Sub-Agents in Both `agents` and `listeners.agents`\" and explain how to fix it." + } + }, + { + "id": "config-listeners", + "section": 1, + "sectionTitle": "Configuration Fundamentals", + "title": "Choose the Right Listener Type for Your Use Case", + "impact": "CRITICAL", + "tags": [ + "config", + "listeners", + "architecture", + "routing" + ], + "testCase": { + "description": "Detect and fix: \"Choose the Right Listener Type for Your Use Case\"", + "input": "version: v0.3.0\n\n# Wrong: a model listener cannot route to backend agent services\nlisteners:\n - type: model\n name: main\n port: 12000\n\nagents:\n - id: weather_agent\n url: http://host.docker.internal:8001", + "expected": "version: v0.3.0\n\nagents:\n - id: weather_agent\n url: http://host.docker.internal:8001\n - id: travel_agent\n url: http://host.docker.internal:8002\n\nlisteners:\n - type: agent\n name: orchestrator\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: weather_agent\n description: Provides real-time weather, forecasts, and conditions for any city.\n - id: travel_agent\n description: Books flights, hotels, and travel itineraries.\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Choose the Right Listener Type for Your Use Case\" and explain how to fix it." 
+ } + }, + { + "id": "config-providers", + "section": 1, + "sectionTitle": "Configuration Fundamentals", + "title": "Register Model Providers with Correct Format Identifiers", + "impact": "CRITICAL", + "tags": [ + "config", + "model-providers", + "llm", + "api-format" + ], + "testCase": { + "description": "Detect and fix: \"Register Model Providers with Correct Format Identifiers\"", + "input": "model_providers:\n - model: gpt-4o # Missing openai/ prefix — Plano cannot route this\n access_key: $OPENAI_API_KEY\n\n - model: claude-3-5-sonnet # Missing anthropic/ prefix\n access_key: $ANTHROPIC_API_KEY", + "expected": "model_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true\n\n - model: anthropic/claude-sonnet-4-20250514\n access_key: $ANTHROPIC_API_KEY\n\n - model: gemini/gemini-2.0-flash\n access_key: $GOOGLE_API_KEY\n\nmodel_providers:\n - model: custom/llama3\n base_url: http://host.docker.internal:11434/v1 # Ollama endpoint\n provider_interface: openai # Ollama speaks OpenAI format\n default: true", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Register Model Providers with Correct Format Identifiers\" and explain how to fix it." + } + }, + { + "id": "config-secrets", + "section": 1, + "sectionTitle": "Configuration Fundamentals", + "title": "Use Environment Variable Substitution for All Secrets", + "impact": "CRITICAL", + "tags": [ + "config", + "security", + "secrets", + "api-keys", + "environment-variables" + ], + "testCase": { + "description": "Detect and fix: \"Use Environment Variable Substitution for All Secrets\"", + "input": "version: v0.3.0\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: abcdefghijklmnopqrstuvwxyz... 
# Hardcoded — never do this\n\nstate_storage:\n type: postgres\n connection_string: \"postgresql://admin:mysecretpassword@prod-db:5432/plano\"\n\nprompt_targets:\n - name: get_data\n endpoint:\n name: my_api\n http_headers:\n Authorization: \"Bearer abcdefghijklmnopqrstuvwxyz\" # Hardcoded token", + "expected": "version: v0.3.0\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true\n\n - model: anthropic/claude-sonnet-4-20250514\n access_key: $ANTHROPIC_API_KEY\n\nstate_storage:\n type: postgres\n connection_string: \"postgresql://${DB_USER}:${DB_PASS}@${DB_HOST}:5432/${DB_NAME}\"\n\nprompt_targets:\n - name: get_data\n endpoint:\n name: my_api\n http_headers:\n Authorization: \"Bearer $MY_API_TOKEN\"\n\n# .env — add to .gitignore\nOPENAI_API_KEY=abcdefghijklmnopqrstuvwxyz...\nANTHROPIC_API_KEY=abcdefghijklmnopqrstuvwxyz...\nDB_USER=plano\nDB_PASS=secure-password\nDB_HOST=localhost\nMY_API_TOKEN=abcdefghijklmnopqrstuvwxyz...", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Use Environment Variable Substitution for All Secrets\" and explain how to fix it." 
+ } + }, + { + "id": "config-version", + "section": 1, + "sectionTitle": "Configuration Fundamentals", + "title": "Always Specify a Supported Config Version", + "impact": "CRITICAL", + "tags": [ + "config", + "versioning", + "validation" + ], + "testCase": { + "description": "Detect and fix: \"Always Specify a Supported Config Version\"", + "input": "# No version field — fails schema validation\nlisteners:\n - type: model\n name: model_listener\n port: 12000\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY", + "expected": "version: v0.3.0\n\nlisteners:\n - type: model\n name: model_listener\n port: 12000\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Always Specify a Supported Config Version\" and explain how to fix it." + } + }, + { + "id": "deploy-docker", + "section": 7, + "sectionTitle": "Deployment & Security", + "title": "Understand Plano's Docker Network Topology for Agent URL Configuration", + "impact": "HIGH", + "tags": [ + "deployment", + "docker", + "networking", + "agents", + "urls" + ], + "testCase": { + "description": "Detect and fix: \"Understand Plano's Docker Network Topology for Agent URL Configuration\"", + "input": "version: v0.3.0\n\nagents:\n - id: weather_agent\n url: http://localhost:8001 # Wrong: this is Plano's own container\n\n - id: flight_agent\n url: http://127.0.0.1:8002 # Wrong: same issue\n\nfilters:\n - id: input_guards\n url: http://localhost:10500 # Wrong: filter server unreachable", + "expected": "version: v0.3.0\n\nagents:\n - id: weather_agent\n url: http://host.docker.internal:8001 # Correct: reaches host port 8001\n\n - id: flight_agent\n url: http://host.docker.internal:8002 # Correct: reaches host port 8002\n\nfilters:\n - id: input_guards\n url: http://host.docker.internal:10500 # Correct: reaches filter server on host\n\nendpoints:\n 
internal_api:\n endpoint: host.docker.internal # Correct for internal API on host\n protocol: http\n\n# Kubernetes / Docker Compose — use service names\nagents:\n - id: weather_agent\n url: http://weather-service:8001 # Kubernetes service DNS\n\n# External cloud services — use full domain\nagents:\n - id: cloud_agent\n url: https://my-agent.us-east-1.amazonaws.com/v1\n\n# Custom TLS (self-signed or internal CA)\noverrides:\n upstream_tls_ca_path: /etc/ssl/certs/internal-ca.pem", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Understand Plano's Docker Network Topology for Agent URL Configuration\" and explain how to fix it." + } + }, + { + "id": "deploy-state", + "section": 7, + "sectionTitle": "Deployment & Security", + "title": "Use PostgreSQL State Storage for Multi-Turn Conversations in Production", + "impact": "HIGH", + "tags": [ + "deployment", + "state", + "postgres", + "memory", + "multi-turn", + "production" + ], + "testCase": { + "description": "Detect and fix: \"Use PostgreSQL State Storage for Multi-Turn Conversations in Production\"", + "input": "version: v0.3.0\n\n# Memory storage — all conversations lost on planoai down / container restart\nstate_storage:\n type: memory\n\nlisteners:\n - type: agent\n name: customer_support\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: support_agent\n description: Customer support assistant with conversation history.", + "expected": "version: v0.3.0\n\nstate_storage:\n type: postgres\n connection_string: \"postgresql://${DB_USER}:${DB_PASS}@${DB_HOST}:5432/${DB_NAME}\"\n\nlisteners:\n - type: agent\n name: customer_support\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: support_agent\n description: Customer support assistant with access to full conversation history.\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true\n\n# Start PostgreSQL with Docker\ndocker run -d \\\n --name plano-postgres 
\\\n -e POSTGRES_USER=plano \\\n -e POSTGRES_PASSWORD=devpassword \\\n -e POSTGRES_DB=plano \\\n -p 5432:5432 \\\n postgres:16\n\n# Set environment variables\nexport DB_USER=plano\nexport DB_PASS=devpassword\nexport DB_HOST=host.docker.internal # Use host.docker.internal from inside Plano container\nexport DB_NAME=plano\n\nDB_USER=plano_prod\nDB_PASS=\nDB_HOST=your-rds-endpoint.amazonaws.com\nDB_NAME=plano", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Use PostgreSQL State Storage for Multi-Turn Conversations in Production\" and explain how to fix it." + } + }, + { + "id": "filter-guardrails", + "section": 4, + "sectionTitle": "Filter Chains & Guardrails", + "title": "Configure Prompt Guards with Actionable Rejection Messages", + "impact": "MEDIUM", + "tags": [ + "filter", + "guardrails", + "jailbreak", + "security", + "ux" + ], + "testCase": { + "description": "Detect and fix: \"Configure Prompt Guards with Actionable Rejection Messages\"", + "input": "version: v0.3.0\n\nprompt_guards:\n input_guards:\n jailbreak:\n on_exception: {} # Empty — returns unhelpful generic error\n\nprompt_guards:\n input_guards:\n jailbreak:\n on_exception:\n message: \"Error code 403: guard triggered\" # Unhelpful to the user", + "expected": "version: v0.3.0\n\nprompt_guards:\n input_guards:\n jailbreak:\n on_exception:\n message: >\n I'm not able to help with that request. This assistant is designed\n to help with [your use case, e.g., customer support, coding questions].\n Please rephrase your question or contact support@yourdomain.com\n if you believe this is an error.\n\n# Built-in jailbreak detection (fast, no external service needed)\nprompt_guards:\n input_guards:\n jailbreak:\n on_exception:\n message: \"This request cannot be processed. 
Please ask about our products and services.\"\n\n# MCP-based custom guards for additional policy enforcement\nfilters:\n - id: topic_restriction\n url: http://host.docker.internal:10500\n type: mcp\n transport: streamable-http\n tool: topic_restriction # Custom filter for domain-specific restrictions\n\nlisteners:\n - type: agent\n name: customer_support\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: support_agent\n description: Customer support assistant for product questions and order issues.\n filter_chain:\n - topic_restriction # Additional custom topic filtering", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Configure Prompt Guards with Actionable Rejection Messages\" and explain how to fix it." + } + }, + { + "id": "filter-mcp", + "section": 4, + "sectionTitle": "Filter Chains & Guardrails", + "title": "Configure MCP Filters with Explicit Type and Transport", + "impact": "MEDIUM", + "tags": [ + "filter", + "mcp", + "integration", + "configuration" + ], + "testCase": { + "description": "Detect and fix: \"Configure MCP Filters with Explicit Type and Transport\"", + "input": "filters:\n - id: my_guard # Plano infers type=mcp, transport=streamable-http, tool=my_guard\n url: http://localhost:10500\n # If your MCP server uses a different tool name or transport, this silently misroutes", + "expected": "version: v0.3.0\n\nfilters:\n - id: input_guards\n url: http://host.docker.internal:10500\n type: mcp # Explicitly MCP protocol\n transport: streamable-http # Streamable HTTP transport\n tool: input_guards # MCP tool name (matches MCP server registration)\n\n - id: query_rewriter\n url: http://host.docker.internal:10501\n type: mcp\n transport: streamable-http\n tool: rewrite_query # Tool name differs from filter ID — explicit is safer\n\n - id: custom_validator\n url: http://host.docker.internal:10503\n type: http # Plain HTTP filter (not MCP)\n # No tool field for HTTP filters\n\nfilters:\n 
- id: auth_validator\n url: http://host.docker.internal:9000/validate\n type: http # Plano POSTs the request, expects the modified request back", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Configure MCP Filters with Explicit Type and Transport\" and explain how to fix it." + } + }, + { + "id": "filter-ordering", + "section": 4, + "sectionTitle": "Filter Chains & Guardrails", + "title": "Order Filter Chains with Guards First, Enrichment Last", + "impact": "HIGH", + "tags": [ + "filter", + "guardrails", + "security", + "pipeline", + "ordering" + ], + "testCase": { + "description": "Detect and fix: \"Order Filter Chains with Guards First, Enrichment Last\"", + "input": "filters:\n - id: context_builder\n url: http://host.docker.internal:10502 # Runs expensive RAG retrieval first\n - id: query_rewriter\n url: http://host.docker.internal:10501\n - id: input_guards\n url: http://host.docker.internal:10500 # Guards run last — jailbreak gets context\n\nlisteners:\n - type: agent\n name: rag_orchestrator\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: rag_agent\n filter_chain:\n - context_builder # Wrong: expensive enrichment before safety check\n - query_rewriter\n - input_guards", + "expected": "version: v0.3.0\n\nfilters:\n - id: input_guards\n url: http://host.docker.internal:10500\n type: mcp\n transport: streamable-http\n - id: query_rewriter\n url: http://host.docker.internal:10501\n type: mcp\n transport: streamable-http\n - id: context_builder\n url: http://host.docker.internal:10502\n type: mcp\n transport: streamable-http\n\nlisteners:\n - type: agent\n name: rag_orchestrator\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: rag_agent\n description: Answers questions using internal knowledge base documents.\n filter_chain:\n - input_guards # 1. Block jailbreaks and policy violations\n - query_rewriter # 2. Normalize the safe query\n - context_builder # 3. 
Retrieve relevant context for the clean query", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Order Filter Chains with Guards First, Enrichment Last\" and explain how to fix it." + } + }, + { + "id": "observe-span-attributes", + "section": 5, + "sectionTitle": "Observability & Debugging", + "title": "Add Custom Span Attributes for Correlation and Filtering", + "impact": "MEDIUM", + "tags": [ + "observability", + "tracing", + "span-attributes", + "correlation" + ], + "testCase": { + "description": "Detect and fix: \"Add Custom Span Attributes for Correlation and Filtering\"", + "input": "tracing:\n random_sampling: 20\n # No span_attributes — cannot filter by user, session, or environment", + "expected": "version: v0.3.0\n\ntracing:\n random_sampling: 20\n trace_arch_internal: true\n\n span_attributes:\n # Match all headers with this prefix, then map to span attributes by:\n # 1) stripping the prefix and 2) converting hyphens to dots\n header_prefixes:\n - x-katanemo-\n\n # Static attributes added to every span from this Plano instance\n static:\n environment: production\n service.name: plano-gateway\n deployment.region: us-east-1\n service.version: \"2.1.0\"\n team: platform-engineering\n\nimport httpx\n\nresponse = httpx.post(\n \"http://localhost:12000/v1/chat/completions\",\n headers={\n \"x-katanemo-request-id\": \"req_abc123\",\n \"x-katanemo-user-id\": \"usr_12\",\n \"x-katanemo-session-id\": \"sess_xyz456\",\n \"x-katanemo-tenant-id\": \"acme-corp\",\n },\n json={\"model\": \"plano.v1\", \"messages\": [...]}\n)\n\n# Find all requests from a specific user\nplanoai trace --where user.id=usr_12\n\n# Find all traces from production environment\nplanoai trace --where environment=production\n\n# Find traces from a specific tenant\nplanoai trace --where tenant.id=acme-corp", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Add Custom Span Attributes 
for Correlation and Filtering\" and explain how to fix it." + } + }, + { + "id": "observe-tracing", + "section": 5, + "sectionTitle": "Observability & Debugging", + "title": "Enable Tracing with Appropriate Sampling for Your Environment", + "impact": "HIGH", + "tags": [ + "observability", + "tracing", + "opentelemetry", + "otel", + "debugging" + ], + "testCase": { + "description": "Detect and fix: \"Enable Tracing with Appropriate Sampling for Your Environment\"", + "input": "version: v0.3.0\n\nlisteners:\n - type: model\n name: model_listener\n port: 12000\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true\n\n# No tracing block — no visibility into routing, latency, or errors", + "expected": "version: v0.3.0\n\nlisteners:\n - type: model\n name: model_listener\n port: 12000\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true\n\ntracing:\n random_sampling: 100 # 100% for development/debugging\n trace_arch_internal: true # Include Plano's internal routing spans\n\ntracing:\n random_sampling: 10 # Sample 10% of requests in production\n trace_arch_internal: false # Skip internal spans to reduce noise\n span_attributes:\n header_prefixes:\n - x-katanemo- # Match all x-katanemo-* headers\n static:\n environment: production\n service.name: my-plano-service\n version: \"1.0.0\"\n\n# Start Plano with built-in OTEL collector\nplanoai up config.yaml --with-tracing", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Enable Tracing with Appropriate Sampling for Your Environment\" and explain how to fix it." 
+ } + }, + { + "id": "routing-aliases", + "section": 2, + "sectionTitle": "Routing & Model Selection", + "title": "Use Model Aliases for Semantic, Stable Model References", + "impact": "MEDIUM", + "tags": [ + "routing", + "model-aliases", + "maintainability", + "client-integration" + ], + "testCase": { + "description": "Detect and fix: \"Use Model Aliases for Semantic, Stable Model References\"", + "input": "# config.yaml — no aliases defined\nversion: v0.3.0\n\nlisteners:\n - type: model\n name: model_listener\n port: 12000\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true\n\n# Client code — brittle, must be updated when model changes\nclient.chat.completions.create(model=\"gpt-4o\", ...)", + "expected": "version: v0.3.0\n\nlisteners:\n - type: model\n name: model_listener\n port: 12000\n\nmodel_providers:\n - model: openai/gpt-4o-mini\n access_key: $OPENAI_API_KEY\n default: true\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n - model: anthropic/claude-sonnet-4-20250514\n access_key: $ANTHROPIC_API_KEY\n\nmodel_aliases:\n plano.fast.v1:\n target: gpt-4o-mini # Cheap, fast — for high-volume tasks\n\n plano.smart.v1:\n target: gpt-4o # High capability — for complex reasoning\n\n plano.creative.v1:\n target: claude-sonnet-4-20250514 # Strong creative writing and analysis\n\n plano.v1:\n target: gpt-4o # Default production alias\n\n# Client code — stable, alias is the contract\nclient.chat.completions.create(model=\"plano.smart.v1\", ...)", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Use Model Aliases for Semantic, Stable Model References\" and explain how to fix it." 
+ } + }, + { + "id": "routing-default", + "section": 2, + "sectionTitle": "Routing & Model Selection", + "title": "Always Set Exactly One Default Model Provider", + "impact": "HIGH", + "tags": [ + "routing", + "defaults", + "model-providers", + "reliability" + ], + "testCase": { + "description": "Detect and fix: \"Always Set Exactly One Default Model Provider\"", + "input": "version: v0.3.0\n\nmodel_providers:\n - model: openai/gpt-4o-mini # No default: true anywhere\n access_key: $OPENAI_API_KEY\n routing_preferences:\n - name: summarization\n description: Summarizing documents and extracting key points\n\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n routing_preferences:\n - name: code_generation\n description: Writing new functions and implementing algorithms\n\nmodel_providers:\n - model: openai/gpt-4o-mini\n default: true # First default\n access_key: $OPENAI_API_KEY\n\n - model: openai/gpt-4o\n default: true # Second default — confusing\n access_key: $OPENAI_API_KEY", + "expected": "version: v0.3.0\n\nmodel_providers:\n - model: openai/gpt-4o-mini\n access_key: $OPENAI_API_KEY\n default: true # Handles general/unclassified requests\n routing_preferences:\n - name: summarization\n description: Summarizing documents, articles, and meeting notes\n - name: classification\n description: Categorizing inputs, labeling, and intent detection\n\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n routing_preferences:\n - name: code_generation\n description: Writing, debugging, and reviewing code\n - name: complex_reasoning\n description: Multi-step math, logical analysis, research synthesis", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Always Set Exactly One Default Model Provider\" and explain how to fix it." 
+ } + }, + { + "id": "routing-passthrough", + "section": 2, + "sectionTitle": "Routing & Model Selection", + "title": "Use Passthrough Auth for Proxy and Multi-Tenant Setups", + "impact": "MEDIUM", + "tags": [ + "routing", + "authentication", + "proxy", + "litellm", + "multi-tenant" + ], + "testCase": { + "description": "Detect and fix: \"Use Passthrough Auth for Proxy and Multi-Tenant Setups\"", + "input": "model_providers:\n - model: custom/proxy\n base_url: http://host.docker.internal:8000\n access_key: $SOME_KEY # Plano overwrites the client's auth — proxy rejects it", + "expected": "version: v0.3.0\n\nlisteners:\n - type: model\n name: model_listener\n port: 12000\n\nmodel_providers:\n - model: custom/litellm-proxy\n base_url: http://host.docker.internal:4000 # LiteLLM server\n provider_interface: openai # LiteLLM uses OpenAI format\n passthrough_auth: true # Forward client's Bearer token\n default: true\n\nmodel_providers:\n # Plano acts as a passthrough gateway; each client has their own OpenAI key\n - model: openai/gpt-4o\n passthrough_auth: true # No access_key here — client's key is forwarded\n default: true\n\nmodel_providers:\n - model: openai/gpt-4o-mini\n access_key: $OPENAI_API_KEY # Plano manages this key\n default: true\n routing_preferences:\n - name: quick tasks\n description: Short answers, simple lookups, fast completions\n\n - model: custom/vllm-llama\n base_url: http://gpu-server:8000\n provider_interface: openai\n passthrough_auth: true # vLLM cluster handles its own auth\n routing_preferences:\n - name: long context\n description: Processing very long documents, multi-document analysis", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Use Passthrough Auth for Proxy and Multi-Tenant Setups\" and explain how to fix it." 
+ } + }, + { + "id": "routing-preferences", + "section": 2, + "sectionTitle": "Routing & Model Selection", + "title": "Write Task-Specific Routing Preference Descriptions", + "impact": "HIGH", + "tags": [ + "routing", + "model-selection", + "preferences", + "llm-routing" + ], + "testCase": { + "description": "Detect and fix: \"Write Task-Specific Routing Preference Descriptions\"", + "input": "model_providers:\n - model: openai/gpt-4o-mini\n access_key: $OPENAI_API_KEY\n default: true\n routing_preferences:\n - name: simple\n description: easy tasks # Too vague — what is \"easy\"?\n\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n routing_preferences:\n - name: hard\n description: hard tasks # Too vague — overlaps with \"easy\"", + "expected": "model_providers:\n - model: openai/gpt-4o-mini\n access_key: $OPENAI_API_KEY\n default: true\n routing_preferences:\n - name: summarization\n description: >\n Summarizing documents, articles, emails, or meeting transcripts.\n Extracting key points, generating TL;DR sections, condensing long text.\n - name: classification\n description: >\n Categorizing inputs, sentiment analysis, spam detection,\n intent classification, labeling structured data fields.\n - name: translation\n description: >\n Translating text between languages, localization tasks.\n\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n routing_preferences:\n - name: code_generation\n description: >\n Writing new functions, classes, or modules from scratch.\n Implementing algorithms, boilerplate generation, API integrations.\n - name: code_review\n description: >\n Reviewing code for bugs, security vulnerabilities, performance issues.\n Suggesting refactors, explaining complex code, debugging errors.\n - name: complex_reasoning\n description: >\n Multi-step math problems, logical deduction, strategic planning,\n research synthesis requiring chain-of-thought reasoning.", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify 
if it violates the rule \"Write Task-Specific Routing Preference Descriptions\" and explain how to fix it." + } + } +] diff --git a/skills/tsconfig.json b/skills/tsconfig.json new file mode 100644 index 00000000..83552abb --- /dev/null +++ b/skills/tsconfig.json @@ -0,0 +1,15 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "NodeNext", + "moduleResolution": "NodeNext", + "lib": ["ES2022"], + "strict": true, + "noEmit": true, + "types": ["node"], + "skipLibCheck": true, + "resolveJsonModule": true, + "forceConsistentCasingInFileNames": true + }, + "include": ["src/**/*.ts"] +} From 711e4dd07d7b7d6450a2645b115d16e5f3b56ec4 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Fri, 17 Apr 2026 12:25:34 -0700 Subject: [PATCH 04/16] Add DigitalOcean as a first-class LLM provider (#889) --- cli/planoai/config_generator.py | 1 + cli/uv.lock | 2 +- config/envoy.template.yaml | 27 ++++++++++ crates/hermesllm/src/bin/provider_models.yaml | 52 +++++++++++++++++-- crates/hermesllm/src/providers/id.rs | 12 ++++- 5 files changed, 88 insertions(+), 6 deletions(-) diff --git a/cli/planoai/config_generator.py b/cli/planoai/config_generator.py index 5a3d4f63..d9d76d79 100644 --- a/cli/planoai/config_generator.py +++ b/cli/planoai/config_generator.py @@ -28,6 +28,7 @@ SUPPORTED_PROVIDERS_WITHOUT_BASE_URL = [ "xai", "moonshotai", "zhipu", + "digitalocean", ] SUPPORTED_PROVIDERS = ( diff --git a/cli/uv.lock b/cli/uv.lock index 665ebdb8..e8c85648 100644 --- a/cli/uv.lock +++ b/cli/uv.lock @@ -337,7 +337,7 @@ wheels = [ [[package]] name = "planoai" -version = "0.4.18" +version = "0.4.19" source = { editable = "." 
} dependencies = [ { name = "click" }, diff --git a/config/envoy.template.yaml b/config/envoy.template.yaml index 5669511d..b2b9fb1f 100644 --- a/config/envoy.template.yaml +++ b/config/envoy.template.yaml @@ -901,6 +901,33 @@ static_resources: validation_context: trusted_ca: filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }} + - name: digitalocean + connect_timeout: {{ upstream_connect_timeout | default('5s') }} + type: LOGICAL_DNS + dns_lookup_family: V4_ONLY + lb_policy: ROUND_ROBIN + load_assignment: + cluster_name: digitalocean + endpoints: + - lb_endpoints: + - endpoint: + address: + socket_address: + address: inference.do-ai.run + port_value: 443 + hostname: "inference.do-ai.run" + transport_socket: + name: envoy.transport_sockets.tls + typed_config: + "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext + sni: inference.do-ai.run + common_tls_context: + tls_params: + tls_minimum_protocol_version: TLSv1_2 + tls_maximum_protocol_version: TLSv1_3 + validation_context: + trusted_ca: + filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }} - name: xiaomi connect_timeout: {{ upstream_connect_timeout | default('5s') }} type: LOGICAL_DNS diff --git a/crates/hermesllm/src/bin/provider_models.yaml b/crates/hermesllm/src/bin/provider_models.yaml index 53dac7f4..22f69a7d 100644 --- a/crates/hermesllm/src/bin/provider_models.yaml +++ b/crates/hermesllm/src/bin/provider_models.yaml @@ -328,7 +328,53 @@ providers: - xiaomi/mimo-v2-flash - xiaomi/mimo-v2-omni - xiaomi/mimo-v2-pro + digitalocean: + - digitalocean/openai-gpt-4.1 + - digitalocean/openai-gpt-4o + - digitalocean/openai-gpt-4o-mini + - digitalocean/openai-gpt-5 + - digitalocean/openai-gpt-5-mini + - digitalocean/openai-gpt-5-nano + - digitalocean/openai-gpt-5.1-codex-max + - digitalocean/openai-gpt-5.2 + - digitalocean/openai-gpt-5.2-pro + - digitalocean/openai-gpt-5.3-codex + - digitalocean/openai-gpt-5.4 + - 
digitalocean/openai-gpt-5.4-mini + - digitalocean/openai-gpt-5.4-nano + - digitalocean/openai-gpt-5.4-pro + - digitalocean/openai-gpt-oss-120b + - digitalocean/openai-gpt-oss-20b + - digitalocean/openai-o1 + - digitalocean/openai-o3 + - digitalocean/openai-o3-mini + - digitalocean/anthropic-claude-4.1-opus + - digitalocean/anthropic-claude-4.5-sonnet + - digitalocean/anthropic-claude-4.6-sonnet + - digitalocean/anthropic-claude-haiku-4.5 + - digitalocean/anthropic-claude-opus-4 + - digitalocean/anthropic-claude-opus-4.5 + - digitalocean/anthropic-claude-opus-4.6 + - digitalocean/anthropic-claude-opus-4.7 + - digitalocean/anthropic-claude-sonnet-4 + - digitalocean/alibaba-qwen3-32b + - digitalocean/arcee-trinity-large-thinking + - digitalocean/deepseek-3.2 + - digitalocean/deepseek-r1-distill-llama-70b + - digitalocean/gemma-4-31B-it + - digitalocean/glm-5 + - digitalocean/kimi-k2.5 + - digitalocean/llama3.3-70b-instruct + - digitalocean/minimax-m2.5 + - digitalocean/nvidia-nemotron-3-super-120b + - digitalocean/qwen3-coder-flash + - digitalocean/qwen3.5-397b-a17b + - digitalocean/all-mini-lm-l6-v2 + - digitalocean/gte-large-en-v1.5 + - digitalocean/multi-qa-mpnet-base-dot-v1 + - digitalocean/qwen3-embedding-0.6b + - digitalocean/router:software-engineering metadata: - total_providers: 11 - total_models: 316 - last_updated: 2026-04-03T23:14:46.956158+00:00 + total_providers: 12 + total_models: 361 + last_updated: 2026-04-16T00:00:00.000000+00:00 diff --git a/crates/hermesllm/src/providers/id.rs b/crates/hermesllm/src/providers/id.rs index c410bd78..ee0fcff3 100644 --- a/crates/hermesllm/src/providers/id.rs +++ b/crates/hermesllm/src/providers/id.rs @@ -44,6 +44,7 @@ pub enum ProviderId { Zhipu, Qwen, AmazonBedrock, + DigitalOcean, } impl TryFrom<&str> for ProviderId { @@ -71,6 +72,9 @@ impl TryFrom<&str> for ProviderId { "qwen" => Ok(ProviderId::Qwen), "amazon_bedrock" => Ok(ProviderId::AmazonBedrock), "amazon" => Ok(ProviderId::AmazonBedrock), // alias + 
"digitalocean" => Ok(ProviderId::DigitalOcean), + "do" => Ok(ProviderId::DigitalOcean), // alias + "do_ai" => Ok(ProviderId::DigitalOcean), // alias _ => Err(format!("Unknown provider: {}", value)), } } @@ -95,6 +99,7 @@ impl ProviderId { ProviderId::Moonshotai => "moonshotai", ProviderId::Zhipu => "z-ai", ProviderId::Qwen => "qwen", + ProviderId::DigitalOcean => "digitalocean", _ => return Vec::new(), }; @@ -148,7 +153,8 @@ impl ProviderId { | ProviderId::Ollama | ProviderId::Moonshotai | ProviderId::Zhipu - | ProviderId::Qwen, + | ProviderId::Qwen + | ProviderId::DigitalOcean, SupportedAPIsFromClient::AnthropicMessagesAPI(_), ) => SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions), @@ -167,7 +173,8 @@ impl ProviderId { | ProviderId::Ollama | ProviderId::Moonshotai | ProviderId::Zhipu - | ProviderId::Qwen, + | ProviderId::Qwen + | ProviderId::DigitalOcean, SupportedAPIsFromClient::OpenAIChatCompletions(_), ) => SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions), @@ -234,6 +241,7 @@ impl Display for ProviderId { ProviderId::Zhipu => write!(f, "zhipu"), ProviderId::Qwen => write!(f, "qwen"), ProviderId::AmazonBedrock => write!(f, "amazon_bedrock"), + ProviderId::DigitalOcean => write!(f, "digitalocean"), } } } From 1f701258cb5a00b40b5078a3ad24f0a187735628 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Fri, 17 Apr 2026 13:11:12 -0700 Subject: [PATCH 05/16] Zero-config planoai up: pass-through proxy with auto-detected providers (#890) --- cli/planoai/defaults.py | 163 +++++++++++++++++++++++++++++ cli/planoai/main.py | 41 +++++++- cli/test/test_defaults.py | 86 +++++++++++++++ config/plano_config_schema.yaml | 2 + crates/common/src/configuration.rs | 3 + 5 files changed, 291 insertions(+), 4 deletions(-) create mode 100644 cli/planoai/defaults.py create mode 100644 cli/test/test_defaults.py diff --git a/cli/planoai/defaults.py b/cli/planoai/defaults.py new file mode 100644 index 00000000..110d0f3b --- /dev/null +++ 
b/cli/planoai/defaults.py @@ -0,0 +1,163 @@ +"""Default config synthesizer for zero-config ``planoai up``. + +When the user runs ``planoai up`` in a directory with no ``config.yaml`` / +``plano_config.yaml``, we synthesize a pass-through config that covers the +common LLM providers and auto-wires OTel export to ``localhost:4317`` so +``planoai obs`` works out of the box. + +Auth handling: +- If the provider's env var is set, bind ``access_key: $ENV_VAR``. +- Otherwise set ``passthrough_auth: true`` so the client's own Authorization + header is forwarded. No env var is required to start the proxy. +""" + +from __future__ import annotations + +import os +from dataclasses import dataclass + +DEFAULT_LLM_LISTENER_PORT = 12000 +# plano_config validation requires an http:// scheme on the OTLP endpoint. +DEFAULT_OTLP_ENDPOINT = "http://localhost:4317" + + +@dataclass(frozen=True) +class ProviderDefault: + name: str + env_var: str + base_url: str + model_pattern: str + # Only set for providers whose prefix in the model pattern is NOT one of the + # built-in SUPPORTED_PROVIDERS in cli/planoai/config_generator.py. For + # built-ins, the validator infers the interface from the model prefix and + # rejects configs that set this field explicitly. + provider_interface: str | None = None + + +# Keep ordering stable so synthesized configs diff cleanly across runs. 
+PROVIDER_DEFAULTS: list[ProviderDefault] = [ + ProviderDefault( + name="openai", + env_var="OPENAI_API_KEY", + base_url="https://api.openai.com/v1", + model_pattern="openai/*", + ), + ProviderDefault( + name="anthropic", + env_var="ANTHROPIC_API_KEY", + base_url="https://api.anthropic.com/v1", + model_pattern="anthropic/*", + ), + ProviderDefault( + name="gemini", + env_var="GEMINI_API_KEY", + base_url="https://generativelanguage.googleapis.com/v1beta", + model_pattern="gemini/*", + ), + ProviderDefault( + name="groq", + env_var="GROQ_API_KEY", + base_url="https://api.groq.com/openai/v1", + model_pattern="groq/*", + ), + ProviderDefault( + name="deepseek", + env_var="DEEPSEEK_API_KEY", + base_url="https://api.deepseek.com/v1", + model_pattern="deepseek/*", + ), + ProviderDefault( + name="mistral", + env_var="MISTRAL_API_KEY", + base_url="https://api.mistral.ai/v1", + model_pattern="mistral/*", + ), + # DigitalOcean Gradient is a first-class provider post-#889 — the + # `digitalocean/` model prefix routes to the built-in Envoy cluster, no + # base_url needed at runtime. 
+ ProviderDefault( + name="digitalocean", + env_var="DO_API_KEY", + base_url="https://inference.do-ai.run/v1", + model_pattern="digitalocean/*", + ), +] + + +@dataclass +class DetectionResult: + with_keys: list[ProviderDefault] + passthrough: list[ProviderDefault] + + @property + def summary(self) -> str: + parts = [] + if self.with_keys: + parts.append("env-keyed: " + ", ".join(p.name for p in self.with_keys)) + if self.passthrough: + parts.append("pass-through: " + ", ".join(p.name for p in self.passthrough)) + return " | ".join(parts) if parts else "no providers" + + +def detect_providers(env: dict[str, str] | None = None) -> DetectionResult: + env = env if env is not None else dict(os.environ) + with_keys: list[ProviderDefault] = [] + passthrough: list[ProviderDefault] = [] + for p in PROVIDER_DEFAULTS: + val = env.get(p.env_var) + if val: + with_keys.append(p) + else: + passthrough.append(p) + return DetectionResult(with_keys=with_keys, passthrough=passthrough) + + +def synthesize_default_config( + env: dict[str, str] | None = None, + *, + listener_port: int = DEFAULT_LLM_LISTENER_PORT, + otel_endpoint: str = DEFAULT_OTLP_ENDPOINT, +) -> dict: + """Build a pass-through config dict suitable for validation + envoy rendering. + + The returned dict can be dumped to YAML and handed to the existing `planoai up` + pipeline unchanged. 
+ """ + detection = detect_providers(env) + + def _entry(p: ProviderDefault, base: dict) -> dict: + row: dict = {"name": p.name, "model": p.model_pattern, "base_url": p.base_url} + if p.provider_interface is not None: + row["provider_interface"] = p.provider_interface + row.update(base) + return row + + model_providers: list[dict] = [] + for p in detection.with_keys: + model_providers.append(_entry(p, {"access_key": f"${p.env_var}"})) + for p in detection.passthrough: + model_providers.append(_entry(p, {"passthrough_auth": True})) + + # No explicit `default: true` entry is synthesized: the plano config + # validator rejects wildcard models as defaults, and brightstaff already + # registers bare model names as lookup keys during wildcard expansion + # (crates/common/src/llm_providers.rs), so `{"model": "gpt-4o-mini"}` + # without a prefix resolves via the openai wildcard without needing + # `default: true`. See discussion on #890. + + return { + "version": "v0.4.0", + "listeners": [ + { + "name": "llm", + "type": "model", + "port": listener_port, + "address": "0.0.0.0", + } + ], + "model_providers": model_providers, + "tracing": { + "random_sampling": 100, + "opentracing_grpc_endpoint": otel_endpoint, + }, + } diff --git a/cli/planoai/main.py b/cli/planoai/main.py index c8659a3c..3e094a69 100644 --- a/cli/planoai/main.py +++ b/cli/planoai/main.py @@ -6,7 +6,13 @@ import sys import contextlib import logging import rich_click as click +import yaml from planoai import targets +from planoai.defaults import ( + DEFAULT_LLM_LISTENER_PORT, + detect_providers, + synthesize_default_config, +) # Brand color - Plano purple PLANO_COLOR = "#969FF4" @@ -317,7 +323,23 @@ def build(docker): help="Show detailed startup logs with timestamps.", is_flag=True, ) -def up(file, path, foreground, with_tracing, tracing_port, docker, verbose): +@click.option( + "--listener-port", + default=DEFAULT_LLM_LISTENER_PORT, + type=int, + show_default=True, + help="Override the LLM listener port when 
running without a config file. Ignored when a config file is present.", +) +def up( + file, + path, + foreground, + with_tracing, + tracing_port, + docker, + verbose, + listener_port, +): """Starts Plano.""" from rich.status import Status @@ -328,12 +350,23 @@ def up(file, path, foreground, with_tracing, tracing_port, docker, verbose): # Use the utility function to find config file plano_config_file = find_config_file(path, file) - # Check if the file exists + # Zero-config fallback: when no user config is present, synthesize a + # pass-through config that covers the common LLM providers and + # auto-wires OTel export to ``planoai obs``. See cli/planoai/defaults.py. if not os.path.exists(plano_config_file): + detection = detect_providers() + cfg_dict = synthesize_default_config(listener_port=listener_port) + + default_dir = os.path.expanduser("~/.plano") + os.makedirs(default_dir, exist_ok=True) + synthesized_path = os.path.join(default_dir, "default_config.yaml") + with open(synthesized_path, "w") as fh: + yaml.safe_dump(cfg_dict, fh, sort_keys=False) + plano_config_file = synthesized_path console.print( - f"[red]✗[/red] Config file not found: [dim]{plano_config_file}[/dim]" + f"[dim]No plano config found; using defaults ({detection.summary}). 
" + f"Listening on :{listener_port}, tracing -> http://localhost:4317.[/dim]" ) - sys.exit(1) if not docker: from planoai.native_runner import native_validate_config diff --git a/cli/test/test_defaults.py b/cli/test/test_defaults.py new file mode 100644 index 00000000..bb16a573 --- /dev/null +++ b/cli/test/test_defaults.py @@ -0,0 +1,86 @@ +from pathlib import Path + +import jsonschema +import yaml + +from planoai.defaults import ( + PROVIDER_DEFAULTS, + detect_providers, + synthesize_default_config, +) + +_SCHEMA_PATH = Path(__file__).parents[2] / "config" / "plano_config_schema.yaml" + + +def _schema() -> dict: + return yaml.safe_load(_SCHEMA_PATH.read_text()) + + +def test_zero_env_vars_produces_pure_passthrough(): + cfg = synthesize_default_config(env={}) + assert cfg["version"] == "v0.4.0" + assert cfg["listeners"][0]["port"] == 12000 + for provider in cfg["model_providers"]: + assert provider.get("passthrough_auth") is True + assert "access_key" not in provider + # No provider should be marked default in pure pass-through mode. + assert provider.get("default") is not True + # All known providers should be listed. + names = {p["name"] for p in cfg["model_providers"]} + assert "digitalocean" in names + assert "openai" in names + assert "anthropic" in names + + +def test_env_keys_promote_providers_to_env_keyed(): + cfg = synthesize_default_config( + env={"OPENAI_API_KEY": "sk-1", "DO_API_KEY": "do-1"} + ) + by_name = {p["name"]: p for p in cfg["model_providers"]} + assert by_name["openai"].get("access_key") == "$OPENAI_API_KEY" + assert by_name["openai"].get("passthrough_auth") is None + assert by_name["digitalocean"].get("access_key") == "$DO_API_KEY" + # Unset env keys remain pass-through. + assert by_name["anthropic"].get("passthrough_auth") is True + + +def test_no_default_is_synthesized(): + # Bare model names resolve via brightstaff's wildcard expansion registering + # bare keys, so the synthesizer intentionally never sets `default: true`. 
+ cfg = synthesize_default_config( + env={"OPENAI_API_KEY": "sk-1", "ANTHROPIC_API_KEY": "a-1"} + ) + assert not any(p.get("default") is True for p in cfg["model_providers"]) + + +def test_listener_port_is_configurable(): + cfg = synthesize_default_config(env={}, listener_port=11000) + assert cfg["listeners"][0]["port"] == 11000 + + +def test_detection_summary_strings(): + det = detect_providers(env={"OPENAI_API_KEY": "sk", "DO_API_KEY": "d"}) + summary = det.summary + assert "env-keyed" in summary and "openai" in summary and "digitalocean" in summary + assert "pass-through" in summary + + +def test_tracing_block_points_at_local_console(): + cfg = synthesize_default_config(env={}) + tracing = cfg["tracing"] + assert tracing["opentracing_grpc_endpoint"] == "http://localhost:4317" + # random_sampling is a percentage in the plano config — 100 = every span. + assert tracing["random_sampling"] == 100 + + +def test_synthesized_config_validates_against_schema(): + cfg = synthesize_default_config(env={"OPENAI_API_KEY": "sk"}) + jsonschema.validate(cfg, _schema()) + + +def test_provider_defaults_digitalocean_is_configured(): + by_name = {p.name: p for p in PROVIDER_DEFAULTS} + assert "digitalocean" in by_name + assert by_name["digitalocean"].env_var == "DO_API_KEY" + assert by_name["digitalocean"].base_url == "https://inference.do-ai.run/v1" + assert by_name["digitalocean"].model_pattern == "digitalocean/*" diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml index d3d6a643..3439ebee 100644 --- a/config/plano_config_schema.yaml +++ b/config/plano_config_schema.yaml @@ -190,6 +190,7 @@ properties: - openai - xiaomi - gemini + - digitalocean routing_preferences: type: array items: @@ -238,6 +239,7 @@ properties: - openai - xiaomi - gemini + - digitalocean routing_preferences: type: array items: diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs index 125a986d..028c8046 100644 --- a/crates/common/src/configuration.rs 
+++ b/crates/common/src/configuration.rs @@ -391,6 +391,8 @@ pub enum LlmProviderType { AmazonBedrock, #[serde(rename = "plano")] Plano, + #[serde(rename = "digitalocean")] + DigitalOcean, } impl Display for LlmProviderType { @@ -412,6 +414,7 @@ impl Display for LlmProviderType { LlmProviderType::Qwen => write!(f, "qwen"), LlmProviderType::AmazonBedrock => write!(f, "amazon_bedrock"), LlmProviderType::Plano => write!(f, "plano"), + LlmProviderType::DigitalOcean => write!(f, "digitalocean"), } } } From 0f67b2c8068d81564965b49616cf2e85e6716c73 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Fri, 17 Apr 2026 14:03:47 -0700 Subject: [PATCH 06/16] planoai obs: live LLM observability TUI (#891) --- cli/planoai/main.py | 2 + cli/planoai/obs/__init__.py | 6 + cli/planoai/obs/collector.py | 266 ++++++++++++++ cli/planoai/obs/pricing.py | 255 ++++++++++++++ cli/planoai/obs/render.py | 328 ++++++++++++++++++ cli/planoai/obs_cmd.py | 99 ++++++ cli/planoai/rich_click_config.py | 2 +- cli/test/test_obs_collector.py | 145 ++++++++ cli/test/test_obs_pricing.py | 103 ++++++ cli/test/test_obs_render.py | 106 ++++++ crates/brightstaff/src/handlers/llm/mod.rs | 70 +++- crates/brightstaff/src/streaming.rs | 231 ++++++++++++ crates/brightstaff/src/tracing/constants.rs | 28 ++ crates/brightstaff/src/tracing/mod.rs | 2 +- crates/hermesllm/src/apis/anthropic.rs | 6 + crates/hermesllm/src/apis/openai.rs | 12 + crates/hermesllm/src/apis/openai_responses.rs | 12 + crates/hermesllm/src/providers/response.rs | 37 ++ docs/source/get_started/quickstart.rst | 61 ++++ 19 files changed, 1766 insertions(+), 5 deletions(-) create mode 100644 cli/planoai/obs/__init__.py create mode 100644 cli/planoai/obs/collector.py create mode 100644 cli/planoai/obs/pricing.py create mode 100644 cli/planoai/obs/render.py create mode 100644 cli/planoai/obs_cmd.py create mode 100644 cli/test/test_obs_collector.py create mode 100644 cli/test/test_obs_pricing.py create mode 100644 cli/test/test_obs_render.py diff --git 
a/cli/planoai/main.py b/cli/planoai/main.py index 3e094a69..5686b0ff 100644 --- a/cli/planoai/main.py +++ b/cli/planoai/main.py @@ -37,6 +37,7 @@ from planoai.core import ( ) from planoai.init_cmd import init as init_cmd from planoai.trace_cmd import trace as trace_cmd, start_trace_listener_background +from planoai.obs_cmd import obs as obs_cmd from planoai.consts import ( DEFAULT_OTEL_TRACING_GRPC_ENDPOINT, DEFAULT_NATIVE_OTEL_TRACING_GRPC_ENDPOINT, @@ -714,6 +715,7 @@ main.add_command(cli_agent) main.add_command(generate_prompt_targets) main.add_command(init_cmd, name="init") main.add_command(trace_cmd, name="trace") +main.add_command(obs_cmd, name="obs") if __name__ == "__main__": main() diff --git a/cli/planoai/obs/__init__.py b/cli/planoai/obs/__init__.py new file mode 100644 index 00000000..2f4e14af --- /dev/null +++ b/cli/planoai/obs/__init__.py @@ -0,0 +1,6 @@ +"""Plano observability console: in-memory live view of LLM traffic.""" + +from planoai.obs.collector import LLMCall, LLMCallStore, ObsCollector +from planoai.obs.pricing import PricingCatalog + +__all__ = ["LLMCall", "LLMCallStore", "ObsCollector", "PricingCatalog"] diff --git a/cli/planoai/obs/collector.py b/cli/planoai/obs/collector.py new file mode 100644 index 00000000..7f4cae36 --- /dev/null +++ b/cli/planoai/obs/collector.py @@ -0,0 +1,266 @@ +"""In-memory collector for LLM calls, fed by OTLP/gRPC spans from brightstaff.""" + +from __future__ import annotations + +import threading +from collections import deque +from concurrent import futures +from dataclasses import dataclass, field +from datetime import datetime, timezone +from typing import Any, Iterable + +import grpc +from opentelemetry.proto.collector.trace.v1 import ( + trace_service_pb2, + trace_service_pb2_grpc, +) + +DEFAULT_GRPC_PORT = 4317 +DEFAULT_CAPACITY = 1000 + + +@dataclass +class LLMCall: + """One LLM call as reconstructed from a brightstaff LLM span. 
+ + Fields default to ``None`` when the underlying span attribute was absent. + """ + + request_id: str + timestamp: datetime + model: str + provider: str | None = None + request_model: str | None = None + session_id: str | None = None + route_name: str | None = None + is_streaming: bool | None = None + status_code: int | None = None + prompt_tokens: int | None = None + completion_tokens: int | None = None + total_tokens: int | None = None + cached_input_tokens: int | None = None + cache_creation_tokens: int | None = None + reasoning_tokens: int | None = None + ttft_ms: float | None = None + duration_ms: float | None = None + routing_strategy: str | None = None + routing_reason: str | None = None + cost_usd: float | None = None + + @property + def tpt_ms(self) -> float | None: + if self.duration_ms is None or self.completion_tokens in (None, 0): + return None + ttft = self.ttft_ms or 0.0 + generate_ms = max(0.0, self.duration_ms - ttft) + if generate_ms <= 0: + return None + return generate_ms / self.completion_tokens + + @property + def tokens_per_sec(self) -> float | None: + tpt = self.tpt_ms + if tpt is None or tpt <= 0: + return None + return 1000.0 / tpt + + +class LLMCallStore: + """Thread-safe ring buffer of recent LLM calls.""" + + def __init__(self, capacity: int = DEFAULT_CAPACITY) -> None: + self._capacity = capacity + self._calls: deque[LLMCall] = deque(maxlen=capacity) + self._lock = threading.Lock() + + @property + def capacity(self) -> int: + return self._capacity + + def add(self, call: LLMCall) -> None: + with self._lock: + self._calls.append(call) + + def clear(self) -> None: + with self._lock: + self._calls.clear() + + def snapshot(self) -> list[LLMCall]: + with self._lock: + return list(self._calls) + + def __len__(self) -> int: + with self._lock: + return len(self._calls) + + +# Span attribute keys used below are the canonical OTel / Plano keys emitted by +# brightstaff — see crates/brightstaff/src/tracing/constants.rs for the source +# of 
truth. + + +def _anyvalue_to_python(value: Any) -> Any: # AnyValue from OTLP + kind = value.WhichOneof("value") + if kind == "string_value": + return value.string_value + if kind == "bool_value": + return value.bool_value + if kind == "int_value": + return value.int_value + if kind == "double_value": + return value.double_value + return None + + +def _attrs_to_dict(attrs: Iterable[Any]) -> dict[str, Any]: + out: dict[str, Any] = {} + for kv in attrs: + py = _anyvalue_to_python(kv.value) + if py is not None: + out[kv.key] = py + return out + + +def _maybe_int(value: Any) -> int | None: + if value is None: + return None + try: + return int(value) + except (TypeError, ValueError): + return None + + +def _maybe_float(value: Any) -> float | None: + if value is None: + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def span_to_llm_call( + span: Any, service_name: str, pricing: Any | None = None +) -> LLMCall | None: + """Convert an OTLP span into an LLMCall, or return None if it isn't one. + + A span is considered an LLM call iff it carries the ``llm.model`` attribute. + """ + attrs = _attrs_to_dict(span.attributes) + model = attrs.get("llm.model") + if not model: + return None + + # Prefer explicit span attributes; fall back to likely aliases. 
+ request_id = next( + ( + str(attrs[key]) + for key in ("request_id", "http.request_id") + if key in attrs and attrs[key] is not None + ), + span.span_id.hex() if span.span_id else "", + ) + start_ns = span.start_time_unix_nano or 0 + ts = ( + datetime.fromtimestamp(start_ns / 1_000_000_000, tz=timezone.utc).astimezone() + if start_ns + else datetime.now().astimezone() + ) + + call = LLMCall( + request_id=str(request_id), + timestamp=ts, + model=str(model), + provider=( + str(attrs["llm.provider"]) if "llm.provider" in attrs else service_name + ), + request_model=( + str(attrs["model.requested"]) if "model.requested" in attrs else None + ), + session_id=( + str(attrs["plano.session_id"]) if "plano.session_id" in attrs else None + ), + route_name=( + str(attrs["plano.route.name"]) if "plano.route.name" in attrs else None + ), + is_streaming=( + bool(attrs["llm.is_streaming"]) if "llm.is_streaming" in attrs else None + ), + status_code=_maybe_int(attrs.get("http.status_code")), + prompt_tokens=_maybe_int(attrs.get("llm.usage.prompt_tokens")), + completion_tokens=_maybe_int(attrs.get("llm.usage.completion_tokens")), + total_tokens=_maybe_int(attrs.get("llm.usage.total_tokens")), + cached_input_tokens=_maybe_int(attrs.get("llm.usage.cached_input_tokens")), + cache_creation_tokens=_maybe_int(attrs.get("llm.usage.cache_creation_tokens")), + reasoning_tokens=_maybe_int(attrs.get("llm.usage.reasoning_tokens")), + ttft_ms=_maybe_float(attrs.get("llm.time_to_first_token")), + duration_ms=_maybe_float(attrs.get("llm.duration_ms")), + routing_strategy=( + str(attrs["routing.strategy"]) if "routing.strategy" in attrs else None + ), + routing_reason=( + str(attrs["routing.selection_reason"]) + if "routing.selection_reason" in attrs + else None + ), + ) + + if pricing is not None: + call.cost_usd = pricing.cost_for_call(call) + + return call + + +class _ObsServicer(trace_service_pb2_grpc.TraceServiceServicer): + def __init__(self, store: LLMCallStore, pricing: Any | None) -> 
None: + self._store = store + self._pricing = pricing + + def Export(self, request, context): # noqa: N802 — gRPC generated name + for resource_spans in request.resource_spans: + service_name = "unknown" + for attr in resource_spans.resource.attributes: + if attr.key == "service.name": + val = _anyvalue_to_python(attr.value) + if val is not None: + service_name = str(val) + break + for scope_spans in resource_spans.scope_spans: + for span in scope_spans.spans: + call = span_to_llm_call(span, service_name, self._pricing) + if call is not None: + self._store.add(call) + return trace_service_pb2.ExportTraceServiceResponse() + + +@dataclass +class ObsCollector: + """Owns the OTLP/gRPC server and the in-memory LLMCall ring buffer.""" + + store: LLMCallStore = field(default_factory=LLMCallStore) + pricing: Any | None = None + host: str = "0.0.0.0" + port: int = DEFAULT_GRPC_PORT + _server: grpc.Server | None = field(default=None, init=False, repr=False) + + def start(self) -> None: + if self._server is not None: + return + server = grpc.server(futures.ThreadPoolExecutor(max_workers=4)) + trace_service_pb2_grpc.add_TraceServiceServicer_to_server( + _ObsServicer(self.store, self.pricing), server + ) + address = f"{self.host}:{self.port}" + bound = server.add_insecure_port(address) + if bound == 0: + raise OSError( + f"Failed to bind OTLP listener on {address}: port already in use. " + "Stop tracing via `planoai trace down` or pick another port with --port." + ) + server.start() + self._server = server + + def stop(self, grace: float = 2.0) -> None: + if self._server is not None: + self._server.stop(grace) + self._server = None diff --git a/cli/planoai/obs/pricing.py b/cli/planoai/obs/pricing.py new file mode 100644 index 00000000..19eb1297 --- /dev/null +++ b/cli/planoai/obs/pricing.py @@ -0,0 +1,255 @@ +"""DigitalOcean Gradient pricing catalog for the obs console. + +Ported loosely from ``crates/brightstaff/src/router/model_metrics.rs::fetch_do_pricing``. 
+Single-source: one fetch at startup, cached for the life of the process. +""" + +from __future__ import annotations + +import logging +import threading +from dataclasses import dataclass +from typing import Any + +import requests + +DEFAULT_PRICING_URL = "https://api.digitalocean.com/v2/gen-ai/models/catalog" +FETCH_TIMEOUT_SECS = 5.0 + + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class ModelPrice: + """Input/output $/token rates. Token counts are multiplied by these.""" + + input_per_token_usd: float + output_per_token_usd: float + cached_input_per_token_usd: float | None = None + + +class PricingCatalog: + """In-memory pricing lookup keyed by model id. + + DO's catalog uses ids like ``openai-gpt-5.4``; Plano's resolved model names + may arrive as ``do/openai-gpt-5.4`` or bare ``openai-gpt-5.4``. We strip the + leading provider prefix when looking up. + """ + + def __init__(self, prices: dict[str, ModelPrice] | None = None) -> None: + self._prices: dict[str, ModelPrice] = prices or {} + self._lock = threading.Lock() + + def __len__(self) -> int: + with self._lock: + return len(self._prices) + + def sample_models(self, n: int = 5) -> list[str]: + with self._lock: + return list(self._prices.keys())[:n] + + @classmethod + def fetch(cls, url: str = DEFAULT_PRICING_URL) -> "PricingCatalog": + """Fetch pricing from DO's catalog endpoint. On failure, returns an + empty catalog (cost column will be blank). + + The catalog endpoint is public — no auth required, no signup — so + ``planoai obs`` gets cost data on first run out of the box. 
+ """ + try: + resp = requests.get(url, timeout=FETCH_TIMEOUT_SECS) + resp.raise_for_status() + data = resp.json() + except Exception as exc: # noqa: BLE001 — best-effort; never fatal + logger.warning( + "DO pricing fetch failed: %s; cost column will be blank.", + exc, + ) + return cls() + + prices = _parse_do_pricing(data) + if not prices: + # Dump the first entry's raw shape so we can see which fields DO + # actually returned — helps when the catalog adds new fields or + # the response doesn't match our parser. + import json as _json + + sample_items = _coerce_items(data) + sample = sample_items[0] if sample_items else data + logger.warning( + "DO pricing response had no parseable entries; cost column " + "will be blank. Sample entry: %s", + _json.dumps(sample, default=str)[:400], + ) + return cls(prices) + + def price_for(self, model_name: str | None) -> ModelPrice | None: + if not model_name: + return None + with self._lock: + # Try the full name first, then stripped prefix, then lowercased variants. + for candidate in _model_key_candidates(model_name): + hit = self._prices.get(candidate) + if hit is not None: + return hit + return None + + def cost_for_call(self, call: Any) -> float | None: + """Compute USD cost for an LLMCall. Returns None when pricing is unknown.""" + price = self.price_for(getattr(call, "model", None)) or self.price_for( + getattr(call, "request_model", None) + ) + if price is None: + return None + prompt = int(getattr(call, "prompt_tokens", 0) or 0) + completion = int(getattr(call, "completion_tokens", 0) or 0) + cached = int(getattr(call, "cached_input_tokens", 0) or 0) + + # Cached input tokens are priced separately at the cached rate when known; + # otherwise they're already counted in prompt tokens at the regular rate. 
+ fresh_prompt = prompt + if price.cached_input_per_token_usd is not None and cached: + fresh_prompt = max(0, prompt - cached) + cost_cached = cached * price.cached_input_per_token_usd + else: + cost_cached = 0.0 + + cost = ( + fresh_prompt * price.input_per_token_usd + + completion * price.output_per_token_usd + + cost_cached + ) + return round(cost, 6) + + +def _model_key_candidates(model_name: str) -> list[str]: + base = model_name.strip() + out = [base] + if "/" in base: + out.append(base.split("/", 1)[1]) + out.extend([v.lower() for v in list(out)]) + # Dedup while preserving order. + seen: set[str] = set() + uniq = [] + for key in out: + if key not in seen: + seen.add(key) + uniq.append(key) + return uniq + + +def _parse_do_pricing(data: Any) -> dict[str, ModelPrice]: + """Parse DO catalog response into a ModelPrice map keyed by model id. + + DO's shape (as of 2026-04): + { + "data": [ + {"model_id": "openai-gpt-5.4", + "pricing": {"input_price_per_million": 5.0, + "output_price_per_million": 15.0}}, + ... + ] + } + + Older/alternate shapes are also accepted (flat top-level fields, or the + ``id``/``model``/``name`` key). + """ + prices: dict[str, ModelPrice] = {} + items = _coerce_items(data) + for item in items: + model_id = ( + item.get("model_id") + or item.get("id") + or item.get("model") + or item.get("name") + ) + if not model_id: + continue + + # DO nests rates under `pricing`; try that first, then fall back to + # top-level fields for alternate response shapes. 
+ sources = [item] + if isinstance(item.get("pricing"), dict): + sources.insert(0, item["pricing"]) + + input_rate = _extract_rate_from_sources( + sources, + ["input_per_token", "input_token_price", "price_input"], + ["input_price_per_million", "input_per_million", "input_per_mtok"], + ) + output_rate = _extract_rate_from_sources( + sources, + ["output_per_token", "output_token_price", "price_output"], + ["output_price_per_million", "output_per_million", "output_per_mtok"], + ) + cached_rate = _extract_rate_from_sources( + sources, + [ + "cached_input_per_token", + "cached_input_token_price", + "prompt_cache_read_per_token", + ], + [ + "cached_input_price_per_million", + "cached_input_per_million", + "cached_input_per_mtok", + ], + ) + + if input_rate is None or output_rate is None: + continue + # Treat 0-rate entries as "unknown" so cost falls back to `—` rather + # than showing a misleading $0.0000. DO's catalog sometimes omits + # rates for promo/open-weight models. + if input_rate == 0 and output_rate == 0: + continue + prices[str(model_id)] = ModelPrice( + input_per_token_usd=input_rate, + output_per_token_usd=output_rate, + cached_input_per_token_usd=cached_rate, + ) + return prices + + +def _coerce_items(data: Any) -> list[dict]: + if isinstance(data, list): + return [x for x in data if isinstance(x, dict)] + if isinstance(data, dict): + for key in ("data", "models", "pricing", "items"): + val = data.get(key) + if isinstance(val, list): + return [x for x in val if isinstance(x, dict)] + return [] + + +def _extract_rate_from_sources( + sources: list[dict], + per_token_keys: list[str], + per_million_keys: list[str], +) -> float | None: + """Return a per-token rate in USD, or None if unknown. + + Some DO catalog responses put per-token values under a field whose name + says ``_per_million`` (e.g. ``input_price_per_million: 5E-8`` — that's + $5e-8 per token, not per million). 
Heuristic: values < 1 are already + per-token (real per-million rates are ~0.1 to ~100); values >= 1 are + treated as per-million and divided by 1,000,000. + """ + for src in sources: + for key in per_token_keys: + if key in src and src[key] is not None: + try: + return float(src[key]) + except (TypeError, ValueError): + continue + for key in per_million_keys: + if key in src and src[key] is not None: + try: + v = float(src[key]) + except (TypeError, ValueError): + continue + if v >= 1: + return v / 1_000_000 + return v + return None diff --git a/cli/planoai/obs/render.py b/cli/planoai/obs/render.py new file mode 100644 index 00000000..602b8aed --- /dev/null +++ b/cli/planoai/obs/render.py @@ -0,0 +1,328 @@ +"""Rich TUI renderer for the observability console.""" + +from __future__ import annotations + +from collections import Counter +from dataclasses import dataclass +from datetime import datetime, timezone + +from rich.box import SIMPLE +from rich.columns import Columns +from rich.console import Group +from rich.panel import Panel +from rich.table import Table +from rich.text import Text + +from planoai.obs.collector import LLMCall + + +@dataclass +class AggregateStats: + count: int + total_cost_usd: float + total_input_tokens: int + total_output_tokens: int + distinct_sessions: int + current_session: str | None + + +@dataclass +class ModelRollup: + model: str + requests: int + input_tokens: int + output_tokens: int + cache_write: int + cache_read: int + cost_usd: float + + +def _now() -> datetime: + return datetime.now(tz=timezone.utc).astimezone() + + +def aggregates(calls: list[LLMCall]) -> AggregateStats: + total_cost = sum((c.cost_usd or 0.0) for c in calls) + total_input = sum(int(c.prompt_tokens or 0) for c in calls) + total_output = sum(int(c.completion_tokens or 0) for c in calls) + session_ids = {c.session_id for c in calls if c.session_id} + current = next( + (c.session_id for c in reversed(calls) if c.session_id is not None), None + ) + return 
AggregateStats( + count=len(calls), + total_cost_usd=total_cost, + total_input_tokens=total_input, + total_output_tokens=total_output, + distinct_sessions=len(session_ids), + current_session=current, + ) + + +def model_rollups(calls: list[LLMCall]) -> list[ModelRollup]: + buckets: dict[str, dict[str, float | int]] = {} + for c in calls: + key = c.model + b = buckets.setdefault( + key, + { + "requests": 0, + "input": 0, + "output": 0, + "cache_write": 0, + "cache_read": 0, + "cost": 0.0, + }, + ) + b["requests"] = int(b["requests"]) + 1 + b["input"] = int(b["input"]) + int(c.prompt_tokens or 0) + b["output"] = int(b["output"]) + int(c.completion_tokens or 0) + b["cache_write"] = int(b["cache_write"]) + int(c.cache_creation_tokens or 0) + b["cache_read"] = int(b["cache_read"]) + int(c.cached_input_tokens or 0) + b["cost"] = float(b["cost"]) + (c.cost_usd or 0.0) + + rollups: list[ModelRollup] = [] + for model, b in buckets.items(): + rollups.append( + ModelRollup( + model=model, + requests=int(b["requests"]), + input_tokens=int(b["input"]), + output_tokens=int(b["output"]), + cache_write=int(b["cache_write"]), + cache_read=int(b["cache_read"]), + cost_usd=float(b["cost"]), + ) + ) + rollups.sort(key=lambda r: r.cost_usd, reverse=True) + return rollups + + +def route_hits(calls: list[LLMCall]) -> list[tuple[str, int, float]]: + counts: Counter[str] = Counter() + for c in calls: + if c.route_name: + counts[c.route_name] += 1 + total = sum(counts.values()) + if total == 0: + return [] + return [(r, n, (n / total) * 100.0) for r, n in counts.most_common()] + + +def _fmt_cost(v: float | None) -> str: + if v is None: + return "—" + if v == 0: + return "$0" + # Adaptive precision so tiny costs ($3.8e-5) remain readable. 
+ if abs(v) < 0.0001: + return f"${v:.8f}".rstrip("0").rstrip(".") + if abs(v) < 0.01: + return f"${v:.6f}".rstrip("0").rstrip(".") + return f"${v:.4f}" + + +def _fmt_ms(v: float | None) -> str: + if v is None: + return "—" + if v >= 1000: + return f"{v / 1000:.1f}s" + return f"{v:.0f}ms" + + +def _fmt_int(v: int | None) -> str: + if v is None or v == 0: + return "—" + return f"{v:,}" + + +def _fmt_tokens(v: int | None) -> str: + if v is None: + return "—" + return f"{v:,}" + + +def _request_panel(last: LLMCall | None) -> Panel: + if last is None: + body = Text("no requests yet", style="dim") + else: + t = Table.grid(padding=(0, 1)) + t.add_column(style="bold cyan") + t.add_column() + t.add_row("Endpoint", "chat/completions") + status = "—" if last.status_code is None else str(last.status_code) + t.add_row("Status", status) + t.add_row("Model", last.model) + if last.request_model and last.request_model != last.model: + t.add_row("Req model", last.request_model) + if last.route_name: + t.add_row("Route", last.route_name) + body = t + return Panel(body, title="[bold]Request[/]", border_style="cyan", box=SIMPLE) + + +def _cost_panel(last: LLMCall | None) -> Panel: + if last is None: + body = Text("—", style="dim") + else: + t = Table.grid(padding=(0, 1)) + t.add_column(style="bold green") + t.add_column() + t.add_row("Request", _fmt_cost(last.cost_usd)) + t.add_row("Input", _fmt_tokens(last.prompt_tokens)) + t.add_row("Output", _fmt_tokens(last.completion_tokens)) + if last.cached_input_tokens: + t.add_row("Cached", _fmt_tokens(last.cached_input_tokens)) + body = t + return Panel(body, title="[bold]Cost[/]", border_style="green", box=SIMPLE) + + +def _totals_panel(stats: AggregateStats) -> Panel: + t = Table.grid(padding=(0, 1)) + t.add_column(style="bold magenta") + t.add_column() + t.add_column(style="bold magenta") + t.add_column() + t.add_row( + "Total cost", + _fmt_cost(stats.total_cost_usd), + "Requests", + str(stats.count), + ) + t.add_row( + "Input", + 
_fmt_tokens(stats.total_input_tokens), + "Output", + _fmt_tokens(stats.total_output_tokens), + ) + t.add_row( + "Sessions", + str(stats.distinct_sessions), + "Current session", + stats.current_session or "—", + ) + return Panel(t, title="[bold]Totals[/]", border_style="magenta", box=SIMPLE) + + +def _model_rollup_table(rollups: list[ModelRollup]) -> Table: + table = Table( + title="Totals by model", + box=SIMPLE, + header_style="bold", + expand=True, + ) + table.add_column("Model", style="cyan") + table.add_column("Req", justify="right") + table.add_column("Input", justify="right") + table.add_column("Output", justify="right", style="green") + table.add_column("Cache write", justify="right", style="yellow") + table.add_column("Cache read", justify="right", style="yellow") + table.add_column("Cost", justify="right", style="green") + if not rollups: + table.add_row("—", "—", "—", "—", "—", "—", "—") + for r in rollups: + table.add_row( + r.model, + str(r.requests), + _fmt_tokens(r.input_tokens), + _fmt_tokens(r.output_tokens), + _fmt_int(r.cache_write), + _fmt_int(r.cache_read), + _fmt_cost(r.cost_usd), + ) + return table + + +def _route_hit_table(hits: list[tuple[str, int, float]]) -> Table: + table = Table( + title="Route hit %", + box=SIMPLE, + header_style="bold", + expand=True, + ) + table.add_column("Route", style="cyan") + table.add_column("Hits", justify="right") + table.add_column("%", justify="right") + for route, n, pct in hits: + table.add_row(route, str(n), f"{pct:.1f}") + return table + + +def _recent_table(calls: list[LLMCall], limit: int = 15) -> Table: + show_route = any(c.route_name for c in calls) + table = Table( + title="Recent requests", + box=SIMPLE, + header_style="bold", + expand=True, + ) + table.add_column("time") + table.add_column("model", style="cyan") + if show_route: + table.add_column("route", style="yellow") + table.add_column("in", justify="right") + table.add_column("cache", justify="right", style="yellow") + 
table.add_column("out", justify="right", style="green") + table.add_column("rsn", justify="right") + table.add_column("cost", justify="right", style="green") + table.add_column("TTFT", justify="right") + table.add_column("lat", justify="right") + table.add_column("st") + + recent = list(reversed(calls))[:limit] + for c in recent: + status_cell = ( + "ok" + if c.status_code and 200 <= c.status_code < 400 + else str(c.status_code or "—") + ) + row = [ + c.timestamp.strftime("%H:%M:%S"), + c.model, + ] + if show_route: + row.append(c.route_name or "—") + row.extend( + [ + _fmt_tokens(c.prompt_tokens), + _fmt_int(c.cached_input_tokens), + _fmt_tokens(c.completion_tokens), + _fmt_int(c.reasoning_tokens), + _fmt_cost(c.cost_usd), + _fmt_ms(c.ttft_ms), + _fmt_ms(c.duration_ms), + status_cell, + ] + ) + table.add_row(*row) + if not recent: + table.add_row(*(["no requests yet"] + ["—"] * (10 if show_route else 9))) + return table + + +def render(calls: list[LLMCall]) -> Group: + last = calls[-1] if calls else None + stats = aggregates(calls) + rollups = model_rollups(calls) + hits = route_hits(calls) + + header = Columns( + [_request_panel(last), _cost_panel(last), _totals_panel(stats)], + expand=True, + equal=True, + ) + parts = [ + header, + _model_rollup_table(rollups), + ] + if hits: + parts.append(_route_hit_table(hits)) + parts.append(_recent_table(calls)) + parts.append( + Text( + "q quit · c clear · waiting for spans on OTLP :4317 — brightstaff needs " + "tracing.opentracing_grpc_endpoint=localhost:4317", + style="dim", + ) + ) + return Group(*parts) diff --git a/cli/planoai/obs_cmd.py b/cli/planoai/obs_cmd.py new file mode 100644 index 00000000..6249df30 --- /dev/null +++ b/cli/planoai/obs_cmd.py @@ -0,0 +1,99 @@ +"""`planoai obs` — live observability TUI.""" + +from __future__ import annotations + +import time + +import rich_click as click +from rich.console import Console +from rich.live import Live + +from planoai.consts import PLANO_COLOR +from 
planoai.obs.collector import ( + DEFAULT_CAPACITY, + DEFAULT_GRPC_PORT, + LLMCallStore, + ObsCollector, +) +from planoai.obs.pricing import PricingCatalog +from planoai.obs.render import render + + +@click.command(name="obs", help="Live observability console for Plano LLM traffic.") +@click.option( + "--port", + type=int, + default=DEFAULT_GRPC_PORT, + show_default=True, + help="OTLP/gRPC port to listen on. Must match the brightstaff tracing endpoint.", +) +@click.option( + "--host", + type=str, + default="0.0.0.0", + show_default=True, + help="Host to bind the OTLP listener.", +) +@click.option( + "--capacity", + type=int, + default=DEFAULT_CAPACITY, + show_default=True, + help="Max LLM calls kept in memory; older calls evicted FIFO.", +) +@click.option( + "--refresh-ms", + type=int, + default=500, + show_default=True, + help="TUI refresh interval.", +) +def obs(port: int, host: str, capacity: int, refresh_ms: int) -> None: + console = Console() + console.print( + f"[bold {PLANO_COLOR}]planoai obs[/] — loading DO pricing catalog...", + end="", + ) + pricing = PricingCatalog.fetch() + if len(pricing): + sample = ", ".join(pricing.sample_models(3)) + console.print( + f" [green]{len(pricing)} models loaded[/] [dim]({sample}, ...)[/]" + ) + else: + console.print( + " [yellow]no pricing loaded[/] — " + "[dim]cost column will be blank (DO catalog unreachable)[/]" + ) + + store = LLMCallStore(capacity=capacity) + collector = ObsCollector(store=store, pricing=pricing, host=host, port=port) + try: + collector.start() + except OSError as exc: + console.print(f"[red]{exc}[/]") + raise SystemExit(1) + + console.print( + f"Listening for OTLP spans on [bold]{host}:{port}[/]. " + "Ensure plano config has [cyan]tracing.opentracing_grpc_endpoint: http://localhost:4317[/] " + "and [cyan]tracing.random_sampling: 100[/] (or run [bold]planoai up[/] " + "with no config — it wires this automatically)." 
+ ) + console.print("Press [bold]Ctrl-C[/] to exit.\n") + + refresh = max(0.05, refresh_ms / 1000.0) + try: + with Live( + render(store.snapshot()), + console=console, + refresh_per_second=1.0 / refresh, + screen=False, + ) as live: + while True: + time.sleep(refresh) + live.update(render(store.snapshot())) + except KeyboardInterrupt: + console.print("\n[dim]obs stopped[/]") + finally: + collector.stop() diff --git a/cli/planoai/rich_click_config.py b/cli/planoai/rich_click_config.py index ba75bc23..fe90dcf1 100644 --- a/cli/planoai/rich_click_config.py +++ b/cli/planoai/rich_click_config.py @@ -61,7 +61,7 @@ def configure_rich_click(plano_color: str) -> None: }, { "name": "Observability", - "commands": ["trace"], + "commands": ["trace", "obs"], }, { "name": "Utilities", diff --git a/cli/test/test_obs_collector.py b/cli/test/test_obs_collector.py new file mode 100644 index 00000000..a16506d9 --- /dev/null +++ b/cli/test/test_obs_collector.py @@ -0,0 +1,145 @@ +import time +from datetime import datetime, timezone +from types import SimpleNamespace +from unittest.mock import MagicMock + +import pytest + +from planoai.obs.collector import LLMCall, LLMCallStore, span_to_llm_call + + +def _mk_attr(key: str, value): + v = MagicMock() + if isinstance(value, bool): + v.WhichOneof.return_value = "bool_value" + v.bool_value = value + elif isinstance(value, int): + v.WhichOneof.return_value = "int_value" + v.int_value = value + elif isinstance(value, float): + v.WhichOneof.return_value = "double_value" + v.double_value = value + else: + v.WhichOneof.return_value = "string_value" + v.string_value = str(value) + kv = MagicMock() + kv.key = key + kv.value = v + return kv + + +def _mk_span( + attrs: dict, start_ns: int | None = None, span_id_hex: str = "ab" +) -> MagicMock: + span = MagicMock() + span.attributes = [_mk_attr(k, v) for k, v in attrs.items()] + span.start_time_unix_nano = start_ns or int(time.time() * 1_000_000_000) + span.span_id.hex.return_value = span_id_hex + 
return span + + +def test_span_without_llm_model_is_ignored(): + span = _mk_span({"http.method": "POST"}) + assert span_to_llm_call(span, "plano(llm)") is None + + +def test_span_with_full_llm_attrs_produces_call(): + span = _mk_span( + { + "llm.model": "openai-gpt-5.4", + "model.requested": "router:software-engineering", + "plano.session_id": "sess-abc", + "plano.route.name": "software-engineering", + "llm.is_streaming": False, + "llm.duration_ms": 1234, + "llm.time_to_first_token": 210, + "llm.usage.prompt_tokens": 100, + "llm.usage.completion_tokens": 50, + "llm.usage.total_tokens": 150, + "llm.usage.cached_input_tokens": 30, + "llm.usage.cache_creation_tokens": 5, + "llm.usage.reasoning_tokens": 200, + "http.status_code": 200, + "request_id": "req-42", + } + ) + call = span_to_llm_call(span, "plano(llm)") + assert call is not None + assert call.request_id == "req-42" + assert call.model == "openai-gpt-5.4" + assert call.request_model == "router:software-engineering" + assert call.session_id == "sess-abc" + assert call.route_name == "software-engineering" + assert call.is_streaming is False + assert call.duration_ms == 1234.0 + assert call.ttft_ms == 210.0 + assert call.prompt_tokens == 100 + assert call.completion_tokens == 50 + assert call.total_tokens == 150 + assert call.cached_input_tokens == 30 + assert call.cache_creation_tokens == 5 + assert call.reasoning_tokens == 200 + assert call.status_code == 200 + + +def test_pricing_lookup_attaches_cost(): + class StubPricing: + def cost_for_call(self, call): + # Simple: 2 * prompt + 3 * completion, in cents + return 0.02 * (call.prompt_tokens or 0) + 0.03 * ( + call.completion_tokens or 0 + ) + + span = _mk_span( + { + "llm.model": "do/openai-gpt-5.4", + "llm.usage.prompt_tokens": 10, + "llm.usage.completion_tokens": 2, + } + ) + call = span_to_llm_call(span, "plano(llm)", pricing=StubPricing()) + assert call is not None + assert call.cost_usd == pytest.approx(0.26) + + +def 
test_tpt_and_tokens_per_sec_derived(): + call = LLMCall( + request_id="x", + timestamp=datetime.now(tz=timezone.utc), + model="m", + duration_ms=1000, + ttft_ms=200, + completion_tokens=80, + ) + # (1000 - 200) / 80 = 10ms per token => 100 tokens/sec + assert call.tpt_ms == 10.0 + assert call.tokens_per_sec == 100.0 + + +def test_tpt_returns_none_when_no_completion_tokens(): + call = LLMCall( + request_id="x", + timestamp=datetime.now(tz=timezone.utc), + model="m", + duration_ms=1000, + ttft_ms=200, + completion_tokens=0, + ) + assert call.tpt_ms is None + assert call.tokens_per_sec is None + + +def test_store_evicts_fifo_at_capacity(): + store = LLMCallStore(capacity=3) + now = datetime.now(tz=timezone.utc) + for i in range(5): + store.add( + LLMCall( + request_id=f"r{i}", + timestamp=now, + model="m", + ) + ) + snap = store.snapshot() + assert len(snap) == 3 + assert [c.request_id for c in snap] == ["r2", "r3", "r4"] diff --git a/cli/test/test_obs_pricing.py b/cli/test/test_obs_pricing.py new file mode 100644 index 00000000..95f9a2da --- /dev/null +++ b/cli/test/test_obs_pricing.py @@ -0,0 +1,103 @@ +from datetime import datetime, timezone + +from planoai.obs.collector import LLMCall +from planoai.obs.pricing import ModelPrice, PricingCatalog + + +def _call(model: str, prompt: int, completion: int, cached: int = 0) -> LLMCall: + return LLMCall( + request_id="r", + timestamp=datetime.now(tz=timezone.utc), + model=model, + prompt_tokens=prompt, + completion_tokens=completion, + cached_input_tokens=cached, + ) + + +def test_lookup_matches_bare_and_prefixed(): + prices = { + "openai-gpt-5.4": ModelPrice( + input_per_token_usd=0.000001, output_per_token_usd=0.000002 + ) + } + catalog = PricingCatalog(prices) + assert catalog.price_for("openai-gpt-5.4") is not None + # do/openai-gpt-5.4 should resolve after stripping the provider prefix. 
+ assert catalog.price_for("do/openai-gpt-5.4") is not None + assert catalog.price_for("unknown-model") is None + + +def test_cost_computation_without_cache(): + prices = { + "m": ModelPrice(input_per_token_usd=0.000001, output_per_token_usd=0.000002) + } + cost = PricingCatalog(prices).cost_for_call(_call("m", 1000, 500)) + assert cost == 0.002 # 1000 * 1e-6 + 500 * 2e-6 + + +def test_cost_computation_with_cached_discount(): + prices = { + "m": ModelPrice( + input_per_token_usd=0.000001, + output_per_token_usd=0.000002, + cached_input_per_token_usd=0.0000001, + ) + } + # 800 fresh @ 1e-6 = 8e-4; 200 cached @ 1e-7 = 2e-5; 500 out @ 2e-6 = 1e-3 + cost = PricingCatalog(prices).cost_for_call(_call("m", 1000, 500, cached=200)) + assert cost == round(0.0008 + 0.00002 + 0.001, 6) + + +def test_empty_catalog_returns_none(): + assert PricingCatalog().cost_for_call(_call("m", 100, 50)) is None + + +def test_parse_do_catalog_treats_small_values_as_per_token(): + """DO's real catalog uses per-token values under the `_per_million` key + (e.g. 5E-8 for GPT-oss-20b). We treat values < 1 as already per-token.""" + from planoai.obs.pricing import _parse_do_pricing + + sample = { + "data": [ + { + "model_id": "openai-gpt-oss-20b", + "pricing": { + "input_price_per_million": 5e-8, + "output_price_per_million": 4.5e-7, + }, + }, + { + "model_id": "openai-gpt-oss-120b", + "pricing": { + "input_price_per_million": 1e-7, + "output_price_per_million": 7e-7, + }, + }, + ] + } + prices = _parse_do_pricing(sample) + # Values < 1 are assumed to already be per-token — no extra division. 
+ assert prices["openai-gpt-oss-20b"].input_per_token_usd == 5e-8 + assert prices["openai-gpt-oss-20b"].output_per_token_usd == 4.5e-7 + assert prices["openai-gpt-oss-120b"].input_per_token_usd == 1e-7 + + +def test_parse_do_catalog_divides_large_values_as_per_million(): + """A provider that genuinely reports $5-per-million in that field gets divided.""" + from planoai.obs.pricing import _parse_do_pricing + + sample = { + "data": [ + { + "model_id": "mystery-model", + "pricing": { + "input_price_per_million": 5.0, # > 1 → treated as per-million + "output_price_per_million": 15.0, + }, + }, + ] + } + prices = _parse_do_pricing(sample) + assert prices["mystery-model"].input_per_token_usd == 5.0 / 1_000_000 + assert prices["mystery-model"].output_per_token_usd == 15.0 / 1_000_000 diff --git a/cli/test/test_obs_render.py b/cli/test/test_obs_render.py new file mode 100644 index 00000000..11f4a1fc --- /dev/null +++ b/cli/test/test_obs_render.py @@ -0,0 +1,106 @@ +from datetime import datetime, timedelta, timezone + +from planoai.obs.collector import LLMCall +from planoai.obs.render import aggregates, model_rollups, route_hits + + +def _call( + model: str, + ts: datetime, + prompt=0, + completion=0, + cost=None, + route=None, + session=None, + cache_read=0, + cache_write=0, +): + return LLMCall( + request_id="r", + timestamp=ts, + model=model, + prompt_tokens=prompt, + completion_tokens=completion, + cached_input_tokens=cache_read, + cache_creation_tokens=cache_write, + cost_usd=cost, + route_name=route, + session_id=session, + ) + + +def test_aggregates_sum_and_session_counts(): + now = datetime.now(tz=timezone.utc).astimezone() + calls = [ + _call( + "m1", + now - timedelta(seconds=50), + prompt=10, + completion=5, + cost=0.001, + session="s1", + ), + _call( + "m2", + now - timedelta(seconds=40), + prompt=20, + completion=10, + cost=0.002, + session="s1", + ), + _call( + "m1", + now - timedelta(seconds=30), + prompt=30, + completion=15, + cost=0.003, + session="s2", + 
), + ] + stats = aggregates(calls) + assert stats.count == 3 + assert stats.total_cost_usd == 0.006 + assert stats.total_input_tokens == 60 + assert stats.total_output_tokens == 30 + assert stats.distinct_sessions == 2 + assert stats.current_session == "s2" + + +def test_rollups_split_by_model_and_cache(): + now = datetime.now(tz=timezone.utc).astimezone() + calls = [ + _call( + "m1", now, prompt=10, completion=5, cost=0.001, cache_write=3, cache_read=7 + ), + _call("m1", now, prompt=20, completion=10, cost=0.002, cache_read=1), + _call("m2", now, prompt=30, completion=15, cost=0.004), + ] + rollups = model_rollups(calls) + by_model = {r.model: r for r in rollups} + assert by_model["m1"].requests == 2 + assert by_model["m1"].input_tokens == 30 + assert by_model["m1"].cache_write == 3 + assert by_model["m1"].cache_read == 8 + assert by_model["m2"].input_tokens == 30 + + +def test_route_hits_only_for_routed_calls(): + now = datetime.now(tz=timezone.utc).astimezone() + calls = [ + _call("m", now, route="code"), + _call("m", now, route="code"), + _call("m", now, route="summarization"), + _call("m", now), # no route + ] + hits = route_hits(calls) + # Only calls with route names are counted. 
+ assert sum(n for _, n, _ in hits) == 3 + hits_by_name = {name: (n, pct) for name, n, pct in hits} + assert hits_by_name["code"][0] == 2 + assert hits_by_name["summarization"][0] == 1 + + +def test_route_hits_empty_when_no_routes(): + now = datetime.now(tz=timezone.utc).astimezone() + calls = [_call("m", now), _call("m", now)] + assert route_hits(calls) == [] diff --git a/crates/brightstaff/src/handlers/llm/mod.rs b/crates/brightstaff/src/handlers/llm/mod.rs index 8f00e4b6..719c048d 100644 --- a/crates/brightstaff/src/handlers/llm/mod.rs +++ b/crates/brightstaff/src/handlers/llm/mod.rs @@ -33,7 +33,8 @@ use crate::streaming::{ ObservableStreamProcessor, StreamProcessor, }; use crate::tracing::{ - collect_custom_trace_attributes, llm as tracing_llm, operation_component, set_service_name, + collect_custom_trace_attributes, llm as tracing_llm, operation_component, + plano as tracing_plano, set_service_name, }; use model_selection::router_chat_get_upstream_model; @@ -102,15 +103,36 @@ async fn llm_chat_inner( .and_then(|hdr| request_headers.get(hdr)) .and_then(|v| v.to_str().ok()) .map(|s| s.to_string()); - let pinned_model: Option = if let Some(ref sid) = session_id { + let cached_route = if let Some(ref sid) = session_id { state .orchestrator_service .get_cached_route(sid, tenant_id.as_deref()) .await - .map(|c| c.model_name) } else { None }; + let (pinned_model, pinned_route_name): (Option, Option) = match cached_route { + Some(c) => (Some(c.model_name), c.route_name), + None => (None, None), + }; + + // Record session id on the LLM span for the observability console. 
+ if let Some(ref sid) = session_id { + get_active_span(|span| { + span.set_attribute(opentelemetry::KeyValue::new( + tracing_plano::SESSION_ID, + sid.clone(), + )); + }); + } + if let Some(ref route_name) = pinned_route_name { + get_active_span(|span| { + span.set_attribute(opentelemetry::KeyValue::new( + tracing_plano::ROUTE_NAME, + route_name.clone(), + )); + }); + } let full_qualified_llm_provider_url = format!("{}{}", state.llm_provider_url, request_path); @@ -311,6 +333,18 @@ async fn llm_chat_inner( alias_resolved_model.clone() }; + // Record route name on the LLM span (only when the orchestrator produced one). + if let Some(ref rn) = route_name { + if !rn.is_empty() && rn != "none" { + get_active_span(|span| { + span.set_attribute(opentelemetry::KeyValue::new( + tracing_plano::ROUTE_NAME, + rn.clone(), + )); + }); + } + } + if let Some(ref sid) = session_id { state .orchestrator_service @@ -671,6 +705,36 @@ async fn send_upstream( // Propagate upstream headers and status let response_headers = llm_response.headers().clone(); let upstream_status = llm_response.status(); + + // Upstream routers (e.g. DigitalOcean Gradient) may return an + // `x-model-router-selected-route` header indicating which task-level + // route the request was classified into (e.g. "Code Generation"). Surface + // it as `plano.route.name` so the obs console's Route hit % panel can + // show the breakdown even when Plano's own orchestrator wasn't in the + // routing path. Any value from Plano's orchestrator already set earlier + // takes precedence — this only fires when the span doesn't already have + // a route name. 
+ if let Some(upstream_route) = response_headers + .get("x-model-router-selected-route") + .and_then(|v| v.to_str().ok()) + { + if !upstream_route.is_empty() { + get_active_span(|span| { + span.set_attribute(opentelemetry::KeyValue::new( + crate::tracing::plano::ROUTE_NAME, + upstream_route.to_string(), + )); + }); + } + } + // Record the upstream HTTP status on the span for the obs console. + get_active_span(|span| { + span.set_attribute(opentelemetry::KeyValue::new( + crate::tracing::http::STATUS_CODE, + upstream_status.as_u16() as i64, + )); + }); + let mut response = Response::builder().status(upstream_status); if let Some(headers) = response.headers_mut() { for (name, value) in response_headers.iter() { diff --git a/crates/brightstaff/src/streaming.rs b/crates/brightstaff/src/streaming.rs index f7af8ae0..40cbbe7c 100644 --- a/crates/brightstaff/src/streaming.rs +++ b/crates/brightstaff/src/streaming.rs @@ -16,10 +16,131 @@ use tracing_opentelemetry::OpenTelemetrySpanExt; use crate::handlers::agents::pipeline::{PipelineError, PipelineProcessor}; const STREAM_BUFFER_SIZE: usize = 16; +/// Cap on accumulated response bytes kept for usage extraction. +/// Most chat responses are well under this; pathological ones are dropped without +/// affecting pass-through streaming to the client. +const USAGE_BUFFER_MAX: usize = 2 * 1024 * 1024; use crate::signals::{InteractionQuality, SignalAnalyzer, TextBasedSignalAnalyzer, FLAG_MARKER}; use crate::tracing::{llm, set_service_name, signals as signal_constants}; use hermesllm::apis::openai::Message; +/// Parsed usage + resolved-model details from a provider response. +#[derive(Debug, Default, Clone)] +struct ExtractedUsage { + prompt_tokens: Option, + completion_tokens: Option, + total_tokens: Option, + cached_input_tokens: Option, + cache_creation_tokens: Option, + reasoning_tokens: Option, + /// The model the upstream actually used. For router aliases (e.g. 
+ /// `router:software-engineering`), this differs from the request model. + resolved_model: Option, +} + +impl ExtractedUsage { + fn is_empty(&self) -> bool { + self.prompt_tokens.is_none() + && self.completion_tokens.is_none() + && self.total_tokens.is_none() + && self.resolved_model.is_none() + } + + fn from_json(value: &serde_json::Value) -> Self { + let mut out = Self::default(); + if let Some(model) = value.get("model").and_then(|v| v.as_str()) { + if !model.is_empty() { + out.resolved_model = Some(model.to_string()); + } + } + if let Some(u) = value.get("usage") { + // OpenAI-shape usage + out.prompt_tokens = u.get("prompt_tokens").and_then(|v| v.as_i64()); + out.completion_tokens = u.get("completion_tokens").and_then(|v| v.as_i64()); + out.total_tokens = u.get("total_tokens").and_then(|v| v.as_i64()); + out.cached_input_tokens = u + .get("prompt_tokens_details") + .and_then(|d| d.get("cached_tokens")) + .and_then(|v| v.as_i64()); + out.reasoning_tokens = u + .get("completion_tokens_details") + .and_then(|d| d.get("reasoning_tokens")) + .and_then(|v| v.as_i64()); + + // Anthropic-shape fallbacks + if out.prompt_tokens.is_none() { + out.prompt_tokens = u.get("input_tokens").and_then(|v| v.as_i64()); + } + if out.completion_tokens.is_none() { + out.completion_tokens = u.get("output_tokens").and_then(|v| v.as_i64()); + } + if out.total_tokens.is_none() { + if let (Some(p), Some(c)) = (out.prompt_tokens, out.completion_tokens) { + out.total_tokens = Some(p + c); + } + } + if out.cached_input_tokens.is_none() { + out.cached_input_tokens = u.get("cache_read_input_tokens").and_then(|v| v.as_i64()); + } + if out.cached_input_tokens.is_none() { + out.cached_input_tokens = + u.get("cached_content_token_count").and_then(|v| v.as_i64()); + } + out.cache_creation_tokens = u + .get("cache_creation_input_tokens") + .and_then(|v| v.as_i64()); + if out.reasoning_tokens.is_none() { + out.reasoning_tokens = u.get("thoughts_token_count").and_then(|v| v.as_i64()); + } + } + out 
+ } +} + +/// Try to pull usage out of an accumulated response body. +/// Handles both a single JSON object (non-streaming) and SSE streams where the +/// final `data: {...}` event carries the `usage` field. +fn extract_usage_from_bytes(buf: &[u8]) -> ExtractedUsage { + if buf.is_empty() { + return ExtractedUsage::default(); + } + + // Fast path: full-body JSON (non-streaming). + if let Ok(value) = serde_json::from_slice::(buf) { + let u = ExtractedUsage::from_json(&value); + if !u.is_empty() { + return u; + } + } + + // SSE path: scan from the end for a `data:` line containing a usage object. + let text = match std::str::from_utf8(buf) { + Ok(t) => t, + Err(_) => return ExtractedUsage::default(), + }; + for line in text.lines().rev() { + let trimmed = line.trim_start(); + let payload = match trimmed.strip_prefix("data:") { + Some(p) => p.trim_start(), + None => continue, + }; + if payload == "[DONE]" || payload.is_empty() { + continue; + } + if !payload.contains("\"usage\"") { + continue; + } + if let Ok(value) = serde_json::from_str::(payload) { + let u = ExtractedUsage::from_json(&value); + if !u.is_empty() { + return u; + } + } + } + + ExtractedUsage::default() +} + /// Trait for processing streaming chunks /// Implementors can inject custom logic during streaming (e.g., hallucination detection, logging) pub trait StreamProcessor: Send + 'static { @@ -60,6 +181,10 @@ pub struct ObservableStreamProcessor { start_time: Instant, time_to_first_token: Option, messages: Option>, + /// Accumulated response bytes used only for best-effort usage extraction + /// on `on_complete`. Capped at `USAGE_BUFFER_MAX`; excess chunks are dropped + /// from the buffer (they still pass through to the client). 
+ response_buffer: Vec, } impl ObservableStreamProcessor { @@ -93,6 +218,7 @@ impl ObservableStreamProcessor { start_time, time_to_first_token: None, messages, + response_buffer: Vec::new(), } } } @@ -101,6 +227,13 @@ impl StreamProcessor for ObservableStreamProcessor { fn process_chunk(&mut self, chunk: Bytes) -> Result, String> { self.total_bytes += chunk.len(); self.chunk_count += 1; + // Accumulate for best-effort usage extraction; drop further chunks once + // the cap is reached so we don't retain huge response bodies in memory. + if self.response_buffer.len() < USAGE_BUFFER_MAX { + let remaining = USAGE_BUFFER_MAX - self.response_buffer.len(); + let take = chunk.len().min(remaining); + self.response_buffer.extend_from_slice(&chunk[..take]); + } Ok(Some(chunk)) } @@ -124,6 +257,52 @@ impl StreamProcessor for ObservableStreamProcessor { ); } + // Record total duration on the span for the observability console. + let duration_ms = self.start_time.elapsed().as_millis() as i64; + { + let span = tracing::Span::current(); + let otel_context = span.context(); + let otel_span = otel_context.span(); + otel_span.set_attribute(KeyValue::new(llm::DURATION_MS, duration_ms)); + otel_span.set_attribute(KeyValue::new(llm::RESPONSE_BYTES, self.total_bytes as i64)); + } + + // Best-effort usage extraction + emission (works for both streaming + // SSE and non-streaming JSON responses that include a `usage` object). 
+ let usage = extract_usage_from_bytes(&self.response_buffer); + if !usage.is_empty() { + let span = tracing::Span::current(); + let otel_context = span.context(); + let otel_span = otel_context.span(); + if let Some(v) = usage.prompt_tokens { + otel_span.set_attribute(KeyValue::new(llm::PROMPT_TOKENS, v)); + } + if let Some(v) = usage.completion_tokens { + otel_span.set_attribute(KeyValue::new(llm::COMPLETION_TOKENS, v)); + } + if let Some(v) = usage.total_tokens { + otel_span.set_attribute(KeyValue::new(llm::TOTAL_TOKENS, v)); + } + if let Some(v) = usage.cached_input_tokens { + otel_span.set_attribute(KeyValue::new(llm::CACHED_INPUT_TOKENS, v)); + } + if let Some(v) = usage.cache_creation_tokens { + otel_span.set_attribute(KeyValue::new(llm::CACHE_CREATION_TOKENS, v)); + } + if let Some(v) = usage.reasoning_tokens { + otel_span.set_attribute(KeyValue::new(llm::REASONING_TOKENS, v)); + } + // Override `llm.model` with the model the upstream actually ran + // (e.g. `openai-gpt-5.4` resolved from `router:software-engineering`). + // Cost lookup keys off the real model, not the alias. + if let Some(resolved) = usage.resolved_model.clone() { + otel_span.set_attribute(KeyValue::new(llm::MODEL_NAME, resolved)); + } + } + // Release the buffered bytes early; nothing downstream needs them. 
+ self.response_buffer.clear(); + self.response_buffer.shrink_to_fit(); + // Analyze signals if messages are available and record as span attributes if let Some(ref messages) = self.messages { let analyzer: Box = Box::new(TextBasedSignalAnalyzer::new()); @@ -404,3 +583,55 @@ pub fn truncate_message(message: &str, max_length: usize) -> String { message.to_string() } } + +#[cfg(test)] +mod usage_extraction_tests { + use super::*; + + #[test] + fn non_streaming_openai_with_cached() { + let body = br#"{"id":"x","model":"gpt-4o","choices":[],"usage":{"prompt_tokens":12,"completion_tokens":34,"total_tokens":46,"prompt_tokens_details":{"cached_tokens":5}}}"#; + let u = extract_usage_from_bytes(body); + assert_eq!(u.prompt_tokens, Some(12)); + assert_eq!(u.completion_tokens, Some(34)); + assert_eq!(u.total_tokens, Some(46)); + assert_eq!(u.cached_input_tokens, Some(5)); + assert_eq!(u.reasoning_tokens, None); + } + + #[test] + fn non_streaming_anthropic_with_cache_creation() { + let body = br#"{"id":"x","model":"claude","usage":{"input_tokens":100,"output_tokens":50,"cache_creation_input_tokens":20,"cache_read_input_tokens":30}}"#; + let u = extract_usage_from_bytes(body); + assert_eq!(u.prompt_tokens, Some(100)); + assert_eq!(u.completion_tokens, Some(50)); + assert_eq!(u.total_tokens, Some(150)); + assert_eq!(u.cached_input_tokens, Some(30)); + assert_eq!(u.cache_creation_tokens, Some(20)); + } + + #[test] + fn streaming_openai_final_chunk_has_usage() { + let sse = b"data: {\"choices\":[{\"delta\":{\"content\":\"hi\"}}]} + +data: {\"choices\":[{\"delta\":{}, \"finish_reason\":\"stop\"}],\"usage\":{\"prompt_tokens\":7,\"completion_tokens\":3,\"total_tokens\":10}} + +data: [DONE] + +"; + let u = extract_usage_from_bytes(sse); + assert_eq!(u.prompt_tokens, Some(7)); + assert_eq!(u.completion_tokens, Some(3)); + assert_eq!(u.total_tokens, Some(10)); + } + + #[test] + fn empty_returns_default() { + assert!(extract_usage_from_bytes(b"").is_empty()); + } + + #[test] + fn 
no_usage_in_body_returns_default() { + assert!(extract_usage_from_bytes(br#"{"ok":true}"#).is_empty()); + } +} diff --git a/crates/brightstaff/src/tracing/constants.rs b/crates/brightstaff/src/tracing/constants.rs index 15e3cf57..79a40401 100644 --- a/crates/brightstaff/src/tracing/constants.rs +++ b/crates/brightstaff/src/tracing/constants.rs @@ -80,6 +80,18 @@ pub mod llm { /// Total tokens used (prompt + completion) pub const TOTAL_TOKENS: &str = "llm.usage.total_tokens"; + /// Tokens served from a prompt cache read + /// (OpenAI `prompt_tokens_details.cached_tokens`, Anthropic `cache_read_input_tokens`, + /// Google `cached_content_token_count`) + pub const CACHED_INPUT_TOKENS: &str = "llm.usage.cached_input_tokens"; + + /// Tokens used to write a prompt cache entry (Anthropic `cache_creation_input_tokens`) + pub const CACHE_CREATION_TOKENS: &str = "llm.usage.cache_creation_tokens"; + + /// Reasoning tokens for reasoning models + /// (OpenAI `completion_tokens_details.reasoning_tokens`, Google `thoughts_token_count`) + pub const REASONING_TOKENS: &str = "llm.usage.reasoning_tokens"; + /// Temperature parameter used pub const TEMPERATURE: &str = "llm.temperature"; @@ -119,6 +131,22 @@ pub mod routing { pub const SELECTION_REASON: &str = "routing.selection_reason"; } +// ============================================================================= +// Span Attributes - Plano-specific +// ============================================================================= + +/// Attributes specific to Plano (session affinity, routing decisions). +pub mod plano { + /// Session identifier propagated via the `x-model-affinity` header. + /// Absent when the client did not send the header. + pub const SESSION_ID: &str = "plano.session_id"; + + /// Matched route name from routing (e.g. "code", "summarization", + /// "software-engineering"). Absent when the client routed directly + /// to a concrete model. 
+ pub const ROUTE_NAME: &str = "plano.route.name"; +} + // ============================================================================= // Span Attributes - Error Handling // ============================================================================= diff --git a/crates/brightstaff/src/tracing/mod.rs b/crates/brightstaff/src/tracing/mod.rs index 644db31a..8e09a21c 100644 --- a/crates/brightstaff/src/tracing/mod.rs +++ b/crates/brightstaff/src/tracing/mod.rs @@ -4,7 +4,7 @@ mod init; mod service_name_exporter; pub use constants::{ - error, http, llm, operation_component, routing, signals, OperationNameBuilder, + error, http, llm, operation_component, plano, routing, signals, OperationNameBuilder, }; pub use custom_attributes::collect_custom_trace_attributes; pub use init::init_tracer; diff --git a/crates/hermesllm/src/apis/anthropic.rs b/crates/hermesllm/src/apis/anthropic.rs index 4df4bb00..ee572268 100644 --- a/crates/hermesllm/src/apis/anthropic.rs +++ b/crates/hermesllm/src/apis/anthropic.rs @@ -435,6 +435,12 @@ impl TokenUsage for MessagesResponse { fn total_tokens(&self) -> usize { (self.usage.input_tokens + self.usage.output_tokens) as usize } + fn cached_input_tokens(&self) -> Option { + self.usage.cache_read_input_tokens.map(|t| t as usize) + } + fn cache_creation_tokens(&self) -> Option { + self.usage.cache_creation_input_tokens.map(|t| t as usize) + } } impl ProviderResponse for MessagesResponse { diff --git a/crates/hermesllm/src/apis/openai.rs b/crates/hermesllm/src/apis/openai.rs index d22ff756..bb93fd34 100644 --- a/crates/hermesllm/src/apis/openai.rs +++ b/crates/hermesllm/src/apis/openai.rs @@ -596,6 +596,18 @@ impl TokenUsage for Usage { fn total_tokens(&self) -> usize { self.total_tokens as usize } + + fn cached_input_tokens(&self) -> Option { + self.prompt_tokens_details + .as_ref() + .and_then(|d| d.cached_tokens.map(|t| t as usize)) + } + + fn reasoning_tokens(&self) -> Option { + self.completion_tokens_details + .as_ref() + .and_then(|d| 
d.reasoning_tokens.map(|t| t as usize)) + } } /// Implementation of ProviderRequest for ChatCompletionsRequest diff --git a/crates/hermesllm/src/apis/openai_responses.rs b/crates/hermesllm/src/apis/openai_responses.rs index eac8a452..92d362b2 100644 --- a/crates/hermesllm/src/apis/openai_responses.rs +++ b/crates/hermesllm/src/apis/openai_responses.rs @@ -710,6 +710,18 @@ impl crate::providers::response::TokenUsage for ResponseUsage { fn total_tokens(&self) -> usize { self.total_tokens as usize } + + fn cached_input_tokens(&self) -> Option { + self.input_tokens_details + .as_ref() + .map(|d| d.cached_tokens.max(0) as usize) + } + + fn reasoning_tokens(&self) -> Option { + self.output_tokens_details + .as_ref() + .map(|d| d.reasoning_tokens.max(0) as usize) + } } /// Token details diff --git a/crates/hermesllm/src/providers/response.rs b/crates/hermesllm/src/providers/response.rs index 5f46f97b..b8565ddf 100644 --- a/crates/hermesllm/src/providers/response.rs +++ b/crates/hermesllm/src/providers/response.rs @@ -23,6 +23,31 @@ pub trait TokenUsage { fn completion_tokens(&self) -> usize; fn prompt_tokens(&self) -> usize; fn total_tokens(&self) -> usize; + /// Tokens served from a prompt cache read (OpenAI `prompt_tokens_details.cached_tokens`, + /// Anthropic `cache_read_input_tokens`, Google `cached_content_token_count`). + fn cached_input_tokens(&self) -> Option { + None + } + /// Tokens used to write a cache entry (Anthropic `cache_creation_input_tokens`). + fn cache_creation_tokens(&self) -> Option { + None + } + /// Reasoning tokens for reasoning models (OpenAI `completion_tokens_details.reasoning_tokens`, + /// Google `thoughts_token_count`). + fn reasoning_tokens(&self) -> Option { + None + } +} + +/// Rich usage breakdown extracted from a provider response. 
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct UsageDetails { + pub prompt_tokens: usize, + pub completion_tokens: usize, + pub total_tokens: usize, + pub cached_input_tokens: Option, + pub cache_creation_tokens: Option, + pub reasoning_tokens: Option, } pub trait ProviderResponse: Send + Sync { @@ -34,6 +59,18 @@ pub trait ProviderResponse: Send + Sync { self.usage() .map(|u| (u.prompt_tokens(), u.completion_tokens(), u.total_tokens())) } + + /// Extract a rich usage breakdown including cached/cache-creation/reasoning tokens. + fn extract_usage_details(&self) -> Option { + self.usage().map(|u| UsageDetails { + prompt_tokens: u.prompt_tokens(), + completion_tokens: u.completion_tokens(), + total_tokens: u.total_tokens(), + cached_input_tokens: u.cached_input_tokens(), + cache_creation_tokens: u.cache_creation_tokens(), + reasoning_tokens: u.reasoning_tokens(), + }) + } } impl ProviderResponse for ProviderResponseType { diff --git a/docs/source/get_started/quickstart.rst b/docs/source/get_started/quickstart.rst index 6f1a86ac..50916eae 100644 --- a/docs/source/get_started/quickstart.rst +++ b/docs/source/get_started/quickstart.rst @@ -340,6 +340,67 @@ And to get the list of supported currencies: "Here is a list of the currencies that are supported for conversion from USD, along with their symbols:\n\n1. AUD - Australian Dollar\n2. BGN - Bulgarian Lev\n3. BRL - Brazilian Real\n4. CAD - Canadian Dollar\n5. CHF - Swiss Franc\n6. CNY - Chinese Renminbi Yuan\n7. CZK - Czech Koruna\n8. DKK - Danish Krone\n9. EUR - Euro\n10. GBP - British Pound\n11. HKD - Hong Kong Dollar\n12. HUF - Hungarian Forint\n13. IDR - Indonesian Rupiah\n14. ILS - Israeli New Sheqel\n15. INR - Indian Rupee\n16. ISK - Icelandic Króna\n17. JPY - Japanese Yen\n18. KRW - South Korean Won\n19. MXN - Mexican Peso\n20. MYR - Malaysian Ringgit\n21. NOK - Norwegian Krone\n22. NZD - New Zealand Dollar\n23. PHP - Philippine Peso\n24. PLN - Polish Złoty\n25. RON - Romanian Leu\n26. 
SEK - Swedish Krona\n27. SGD - Singapore Dollar\n28. THB - Thai Baht\n29. TRY - Turkish Lira\n30. USD - United States Dollar\n31. ZAR - South African Rand\n\nIf you want to convert USD to any of these currencies, you can select the one you are interested in." +Observability +------------- + +Plano ships two CLI tools for visibility into LLM traffic. Both consume the same OTLP/gRPC span stream from brightstaff; they just slice it differently — use whichever (or both) fits the question you're answering. + +===================== ============================================ ============================================================= +Command When to use Shows +===================== ============================================ ============================================================= +``planoai obs`` Live view while you drive traffic Per-request rows + aggregates: tokens (prompt / completion / cached / cache-creation / reasoning), TTFT, latency, cost, session id, route name, totals by model +``planoai trace`` Deep-dive into a single request after the fact Full span tree for a trace id: brightstaff → routing → upstream LLM, attributes on every span, status codes, errors +===================== ============================================ ============================================================= + +Both require brightstaff to be exporting spans. If you're running the zero-config path (``planoai up`` with no config file), tracing is auto-wired to ``http://localhost:4317``. If you have your own ``plano_config.yaml``, add: + +.. code-block:: yaml + + tracing: + random_sampling: 100 + opentracing_grpc_endpoint: http://localhost:4317 + +Live console — ``planoai obs`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: console + + $ planoai obs + # In another terminal: + $ planoai up + +Cost is populated automatically from DigitalOcean's public pricing catalog — no signup or token required. 
+ +With no API keys set, every provider runs in pass-through mode — supply the ``Authorization`` header yourself on each request: + +.. code-block:: console + + $ curl localhost:12000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $DO_API_KEY" \ + -d '{"model":"digitalocean/router:software-engineering", + "messages":[{"role":"user","content":"write code to print prime numbers in python"}], + "stream":false}' + +When you export ``OPENAI_API_KEY`` / ``ANTHROPIC_API_KEY`` / ``DO_API_KEY`` / etc. before ``planoai up``, Plano picks them up and clients no longer need to send ``Authorization``. + +Press ``Ctrl-C`` in the obs terminal to exit. Data lives in memory only — nothing is persisted to disk. + +Single-request traces — ``planoai trace`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When you need to understand what happened on one specific request (which model was picked, how long each hop took, what an upstream returned), use ``trace``: + +.. code-block:: console + + $ planoai trace listen # start the OTLP listener (daemon) + # drive some traffic through localhost:12000 ... + $ planoai trace # show the most recent trace + $ planoai trace # show a specific trace by id + $ planoai trace --list # list the last 50 trace ids + +Use ``obs`` to spot that p95 latency spiked for ``openai-gpt-5.4``; switch to ``trace`` on one of those slow request ids to see which hop burned the time. 
+ Next Steps ========== From 37600fd07a0efb17c40b0577239f7f7d26d7a38b Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Fri, 17 Apr 2026 17:23:05 -0700 Subject: [PATCH 07/16] fix: passthrough_auth accepts Anthropic x-api-key and normalizes to upstream format (#892) --- crates/llm_gateway/src/stream_context.rs | 145 +++++++++++++++++++---- 1 file changed, 120 insertions(+), 25 deletions(-) diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index afb0b050..e7763ee0 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -177,24 +177,33 @@ impl StreamContext { } fn modify_auth_headers(&mut self) -> Result<(), ServerError> { - if self.llm_provider().passthrough_auth == Some(true) { - // Check if client provided an Authorization header - if self.get_http_request_header("Authorization").is_none() { - warn!( - "request_id={}: passthrough_auth enabled but no authorization header present in client request", - self.request_identifier() - ); - } else { - debug!( - "request_id={}: preserving client authorization header for provider '{}'", - self.request_identifier(), - self.llm_provider().name - ); + // Determine the credential to forward upstream. Either the client + // supplied one (passthrough_auth) or it's configured on the provider. + let credential: String = if self.llm_provider().passthrough_auth == Some(true) { + // Client auth may arrive in either Anthropic-style (`x-api-key`) + // or OpenAI-style (`Authorization: Bearer ...`). Accept both so + // clients using Anthropic SDKs (which default to `x-api-key`) + // work when the upstream is OpenAI-compatible, and vice versa. 
+ let authorization = self.get_http_request_header("Authorization"); + let x_api_key = self.get_http_request_header("x-api-key"); + match extract_client_credential(authorization.as_deref(), x_api_key.as_deref()) { + Some(key) => { + debug!( + "request_id={}: forwarding client credential to provider '{}'", + self.request_identifier(), + self.llm_provider().name + ); + key + } + None => { + warn!( + "request_id={}: passthrough_auth enabled but no Authorization / x-api-key header present in client request", + self.request_identifier() + ); + return Ok(()); + } } - return Ok(()); - } - - let llm_provider_api_key_value = + } else { self.llm_provider() .access_key .as_ref() @@ -203,15 +212,19 @@ impl StreamContext { "No access key configured for selected LLM Provider \"{}\"", self.llm_provider() ), - })?; + })? + .clone() + }; - // Set API-specific headers based on the resolved upstream API + // Normalize the credential into whichever header the upstream expects. + // This lets an Anthropic-SDK client reach an OpenAI-compatible upstream + // (and vice versa) without the caller needing to know what format the + // upstream uses. match self.resolved_api.as_ref() { Some(SupportedUpstreamAPIs::AnthropicMessagesAPI(_)) => { - // Anthropic API requires x-api-key and anthropic-version headers - // Remove any existing Authorization header since Anthropic doesn't use it + // Anthropic expects `x-api-key` + `anthropic-version`. self.remove_http_request_header("Authorization"); - self.set_http_request_header("x-api-key", Some(llm_provider_api_key_value)); + self.set_http_request_header("x-api-key", Some(&credential)); self.set_http_request_header("anthropic-version", Some("2023-06-01")); } Some( @@ -221,10 +234,9 @@ impl StreamContext { | SupportedUpstreamAPIs::OpenAIResponsesAPI(_), ) | None => { - // OpenAI and default: use Authorization Bearer token - // Remove any existing x-api-key header since OpenAI doesn't use it + // OpenAI (and default): `Authorization: Bearer ...`. 
self.remove_http_request_header("x-api-key"); - let authorization_header_value = format!("Bearer {}", llm_provider_api_key_value); + let authorization_header_value = format!("Bearer {}", credential); self.set_http_request_header("Authorization", Some(&authorization_header_value)); } } @@ -1235,3 +1247,86 @@ fn current_time_ns() -> u128 { } impl Context for StreamContext {} + +/// Extract the credential a client sent in either an OpenAI-style +/// `Authorization` header or an Anthropic-style `x-api-key` header. +/// +/// Returns `None` when neither header is present or both are empty/whitespace. +/// The `Bearer ` prefix on the `Authorization` value is stripped if present; +/// otherwise the value is taken verbatim (some clients send a raw token). +fn extract_client_credential( + authorization: Option<&str>, + x_api_key: Option<&str>, +) -> Option { + // Strip the optional "Bearer " / "Bearer" prefix (case-sensitive, matches + // OpenAI SDK behavior) and trim surrounding whitespace before validating + // non-empty. + let from_authorization = authorization + .map(|v| { + v.strip_prefix("Bearer ") + .or_else(|| v.strip_prefix("Bearer")) + .unwrap_or(v) + .trim() + .to_string() + }) + .filter(|s| !s.is_empty()); + if from_authorization.is_some() { + return from_authorization; + } + x_api_key + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(|s| s.to_string()) +} + +#[cfg(test)] +mod tests { + use super::extract_client_credential; + + #[test] + fn authorization_bearer_strips_prefix() { + assert_eq!( + extract_client_credential(Some("Bearer sk-abc"), None), + Some("sk-abc".to_string()) + ); + } + + #[test] + fn authorization_raw_token_preserved() { + // Some clients send the raw token without "Bearer " — accept it. 
+ assert_eq!( + extract_client_credential(Some("sk-abc"), None), + Some("sk-abc".to_string()) + ); + } + + #[test] + fn x_api_key_used_when_authorization_absent() { + assert_eq!( + extract_client_credential(None, Some("sk-ant-api-key")), + Some("sk-ant-api-key".to_string()) + ); + } + + #[test] + fn authorization_wins_when_both_present() { + // If a client is particularly exotic and sends both, prefer the + // OpenAI-style Authorization header. + assert_eq!( + extract_client_credential(Some("Bearer openai-key"), Some("anthropic-key")), + Some("openai-key".to_string()) + ); + } + + #[test] + fn returns_none_when_neither_present() { + assert!(extract_client_credential(None, None).is_none()); + } + + #[test] + fn empty_and_whitespace_headers_are_ignored() { + assert!(extract_client_credential(Some(""), None).is_none()); + assert!(extract_client_credential(Some("Bearer "), None).is_none()); + assert!(extract_client_credential(Some(" "), Some(" ")).is_none()); + } +} From 95a7beaab3e8c1974aa3f43e81a668c1e4cedd0a Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Fri, 17 Apr 2026 21:01:30 -0700 Subject: [PATCH 08/16] fix: truncate oversized user messages in orchestrator routing prompt (#895) --- crates/brightstaff/src/router/http.rs | 133 ++++++- .../src/router/orchestrator_model_v1.rs | 329 ++++++++++++++++-- 2 files changed, 423 insertions(+), 39 deletions(-) diff --git a/crates/brightstaff/src/router/http.rs b/crates/brightstaff/src/router/http.rs index ad1b711c..e1f2be1e 100644 --- a/crates/brightstaff/src/router/http.rs +++ b/crates/brightstaff/src/router/http.rs @@ -1,8 +1,14 @@ use hermesllm::apis::openai::ChatCompletionsResponse; use hyper::header; +use serde::Deserialize; use thiserror::Error; use tracing::warn; +/// Max bytes of raw upstream body we include in a log message or error text +/// when the body is not a recognizable error envelope. Keeps logs from being +/// flooded by huge HTML error pages. 
+const RAW_BODY_LOG_LIMIT: usize = 512; + #[derive(Debug, Error)] pub enum HttpError { #[error("Failed to send request: {0}")] @@ -10,13 +16,64 @@ pub enum HttpError { #[error("Failed to parse JSON response: {0}")] Json(serde_json::Error, String), + + #[error("Upstream returned {status}: {message}")] + Upstream { status: u16, message: String }, +} + +/// Shape of an OpenAI-style error response body, e.g. +/// `{"error": {"message": "...", "type": "...", "param": "...", "code": ...}}`. +#[derive(Debug, Deserialize)] +struct UpstreamErrorEnvelope { + error: UpstreamErrorBody, +} + +#[derive(Debug, Deserialize)] +struct UpstreamErrorBody { + message: String, + #[serde(default, rename = "type")] + err_type: Option, + #[serde(default)] + param: Option, +} + +/// Extract a human-readable error message from an upstream response body. +/// Tries to parse an OpenAI-style `{"error": {"message": ...}}` envelope; if +/// that fails, falls back to the first `RAW_BODY_LOG_LIMIT` bytes of the raw +/// body (UTF-8 safe). +fn extract_upstream_error_message(body: &str) -> String { + if let Ok(env) = serde_json::from_str::(body) { + let mut msg = env.error.message; + if let Some(param) = env.error.param { + msg.push_str(&format!(" (param={param})")); + } + if let Some(err_type) = env.error.err_type { + msg.push_str(&format!(" [type={err_type}]")); + } + return msg; + } + truncate_for_log(body).to_string() +} + +fn truncate_for_log(s: &str) -> &str { + if s.len() <= RAW_BODY_LOG_LIMIT { + return s; + } + let mut end = RAW_BODY_LOG_LIMIT; + while end > 0 && !s.is_char_boundary(end) { + end -= 1; + } + &s[..end] } /// Sends a POST request to the given URL and extracts the text content /// from the first choice of the `ChatCompletionsResponse`. /// -/// Returns `Some((content, elapsed))` on success, or `None` if the response -/// had no choices or the first choice had no content. 
+/// Returns `Some((content, elapsed))` on success, `None` if the response +/// had no choices or the first choice had no content. Returns +/// `HttpError::Upstream` for any non-2xx status, carrying a message +/// extracted from the OpenAI-style error envelope (or a truncated raw body +/// if the body is not in that shape). pub async fn post_and_extract_content( client: &reqwest::Client, url: &str, @@ -26,17 +83,36 @@ pub async fn post_and_extract_content( let start_time = std::time::Instant::now(); let res = client.post(url).headers(headers).body(body).send().await?; + let status = res.status(); let body = res.text().await?; let elapsed = start_time.elapsed(); + if !status.is_success() { + let message = extract_upstream_error_message(&body); + warn!( + status = status.as_u16(), + message = %message, + body_size = body.len(), + "upstream returned error response" + ); + return Err(HttpError::Upstream { + status: status.as_u16(), + message, + }); + } + let response: ChatCompletionsResponse = serde_json::from_str(&body).map_err(|err| { - warn!(error = %err, body = %body, "failed to parse json response"); + warn!( + error = %err, + body = %truncate_for_log(&body), + "failed to parse json response", + ); HttpError::Json(err, format!("Failed to parse JSON: {}", body)) })?; if response.choices.is_empty() { - warn!(body = %body, "no choices in response"); + warn!(body = %truncate_for_log(&body), "no choices in response"); return Ok(None); } @@ -46,3 +122,52 @@ pub async fn post_and_extract_content( .as_ref() .map(|c| (c.clone(), elapsed))) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn extracts_message_from_openai_style_error_envelope() { + let body = r#"{"error":{"code":400,"message":"This model's maximum context length is 32768 tokens. 
However, you requested 0 output tokens and your prompt contains at least 32769 input tokens, for a total of at least 32769 tokens.","param":"input_tokens","type":"BadRequestError"}}"#; + let msg = extract_upstream_error_message(body); + assert!( + msg.starts_with("This model's maximum context length is 32768 tokens."), + "unexpected message: {msg}" + ); + assert!(msg.contains("(param=input_tokens)")); + assert!(msg.contains("[type=BadRequestError]")); + } + + #[test] + fn extracts_message_without_optional_fields() { + let body = r#"{"error":{"message":"something broke"}}"#; + let msg = extract_upstream_error_message(body); + assert_eq!(msg, "something broke"); + } + + #[test] + fn falls_back_to_raw_body_when_not_error_envelope() { + let body = "502 Bad Gateway"; + let msg = extract_upstream_error_message(body); + assert_eq!(msg, body); + } + + #[test] + fn truncates_non_envelope_bodies_in_logs() { + let body = "x".repeat(RAW_BODY_LOG_LIMIT * 3); + let msg = extract_upstream_error_message(&body); + assert_eq!(msg.len(), RAW_BODY_LOG_LIMIT); + } + + #[test] + fn truncate_for_log_respects_utf8_boundaries() { + // 2-byte characters; picking a length that would split mid-char. + let body = "é".repeat(RAW_BODY_LOG_LIMIT); + let out = truncate_for_log(&body); + // Should be a valid &str (implicit — would panic if we returned + // a non-boundary slice) and at most RAW_BODY_LOG_LIMIT bytes. 
+ assert!(out.len() <= RAW_BODY_LOG_LIMIT); + assert!(out.chars().all(|c| c == 'é')); + } +} diff --git a/crates/brightstaff/src/router/orchestrator_model_v1.rs b/crates/brightstaff/src/router/orchestrator_model_v1.rs index 75e5c586..693aacc2 100644 --- a/crates/brightstaff/src/router/orchestrator_model_v1.rs +++ b/crates/brightstaff/src/router/orchestrator_model_v1.rs @@ -10,6 +10,18 @@ use super::orchestrator_model::{OrchestratorModel, OrchestratorModelError}; pub const MAX_TOKEN_LEN: usize = 8192; // Default max token length for the orchestration model +/// Hard cap on the number of recent messages considered when building the +/// routing prompt. Bounds prompt growth for long-running conversations and +/// acts as an outer guardrail before the token-budget loop runs. The most +/// recent `MAX_ROUTING_TURNS` filtered messages are kept; older turns are +/// dropped entirely. +pub const MAX_ROUTING_TURNS: usize = 16; + +/// Unicode ellipsis used to mark where content was trimmed out of a long +/// message. Helps signal to the downstream router model that the message was +/// truncated. +const TRIM_MARKER: &str = "…"; + /// Custom JSON formatter that produces spaced JSON (space after colons and commas), same as JSON in python struct SpacedJsonFormatter; @@ -176,10 +188,9 @@ impl OrchestratorModel for OrchestratorModelV1 { messages: &[Message], usage_preferences_from_request: &Option>, ) -> ChatCompletionsRequest { - // remove system prompt, tool calls, tool call response and messages without content - // if content is empty its likely a tool call - // when role == tool its tool call response - let messages_vec = messages + // Remove system/developer/tool messages and messages without extractable + // text (tool calls have no text content we can classify against). 
+ let filtered: Vec<&Message> = messages .iter() .filter(|m| { m.role != Role::System @@ -187,37 +198,72 @@ impl OrchestratorModel for OrchestratorModelV1 { && m.role != Role::Tool && !m.content.extract_text().is_empty() }) - .collect::>(); + .collect(); - // Following code is to ensure that the conversation does not exceed max token length - // Note: we use a simple heuristic to estimate token count based on character length to optimize for performance + // Outer guardrail: only consider the last `MAX_ROUTING_TURNS` filtered + // messages when building the routing prompt. Keeps prompt growth + // predictable for long conversations regardless of per-message size. + let start = filtered.len().saturating_sub(MAX_ROUTING_TURNS); + let messages_vec: &[&Message] = &filtered[start..]; + + // Ensure the conversation does not exceed the configured token budget. + // We use `len() / TOKEN_LENGTH_DIVISOR` as a cheap token estimate to + // avoid running a real tokenizer on the hot path. let mut token_count = ARCH_ORCHESTRATOR_V1_SYSTEM_PROMPT.len() / TOKEN_LENGTH_DIVISOR; - let mut selected_messages_list_reversed: Vec<&Message> = vec![]; + let mut selected_messages_list_reversed: Vec = vec![]; for (selected_messsage_count, message) in messages_vec.iter().rev().enumerate() { - let message_token_count = message.content.extract_text().len() / TOKEN_LENGTH_DIVISOR; - token_count += message_token_count; - if token_count > self.max_token_length { + let message_text = message.content.extract_text(); + let message_token_count = message_text.len() / TOKEN_LENGTH_DIVISOR; + if token_count + message_token_count > self.max_token_length { + let remaining_tokens = self.max_token_length.saturating_sub(token_count); debug!( - token_count = token_count, + attempted_total_tokens = token_count + message_token_count, max_tokens = self.max_token_length, + remaining_tokens, selected = selected_messsage_count, total = messages_vec.len(), "token count exceeds max, truncating conversation" ); - if 
message.role == Role::User { - // If message that exceeds max token length is from user, we need to keep it - selected_messages_list_reversed.push(message); + // If the overflow message is from the user we need to keep + // some of it so the orchestrator still sees the latest user + // intent. Use a middle-trim (head + ellipsis + tail): users + // often frame the task at the start AND put the actual ask + // at the end of a long pasted block, so preserving both is + // better than a head-only cut. The ellipsis also signals to + // the router model that content was dropped. + if message.role == Role::User && remaining_tokens > 0 { + let max_bytes = remaining_tokens.saturating_mul(TOKEN_LENGTH_DIVISOR); + let truncated = trim_middle_utf8(&message_text, max_bytes); + selected_messages_list_reversed.push(Message { + role: Role::User, + content: Some(MessageContent::Text(truncated)), + name: None, + tool_calls: None, + tool_call_id: None, + }); } break; } - // If we are here, it means that the message is within the max token length - selected_messages_list_reversed.push(message); + token_count += message_token_count; + selected_messages_list_reversed.push(Message { + role: message.role.clone(), + content: Some(MessageContent::Text(message_text)), + name: None, + tool_calls: None, + tool_call_id: None, + }); } if selected_messages_list_reversed.is_empty() { debug!("no messages selected, using last message"); if let Some(last_message) = messages_vec.last() { - selected_messages_list_reversed.push(last_message); + selected_messages_list_reversed.push(Message { + role: last_message.role.clone(), + content: Some(MessageContent::Text(last_message.content.extract_text())), + name: None, + tool_calls: None, + tool_call_id: None, + }); } } @@ -237,22 +283,8 @@ impl OrchestratorModel for OrchestratorModelV1 { } // Reverse the selected messages to maintain the conversation order - let selected_conversation_list = selected_messages_list_reversed - .iter() - .rev() - .map(|message| 
Message { - role: message.role.clone(), - content: Some(MessageContent::Text( - message - .content - .as_ref() - .map_or(String::new(), |c| c.to_string()), - )), - name: None, - tool_calls: None, - tool_call_id: None, - }) - .collect::>(); + let selected_conversation_list: Vec = + selected_messages_list_reversed.into_iter().rev().collect(); // Generate the orchestrator request message based on the usage preferences. // If preferences are passed in request then we use them; @@ -405,6 +437,45 @@ fn fix_json_response(body: &str) -> String { body.replace("'", "\"").replace("\\n", "") } +/// Truncate `s` so the result is at most `max_bytes` bytes long, keeping +/// roughly 60% from the start and 40% from the end, with a Unicode ellipsis +/// separating the two. All splits respect UTF-8 character boundaries. When +/// `max_bytes` is too small to fit the marker at all, falls back to a +/// head-only truncation. +fn trim_middle_utf8(s: &str, max_bytes: usize) -> String { + if s.len() <= max_bytes { + return s.to_string(); + } + if max_bytes <= TRIM_MARKER.len() { + // Not enough room even for the marker — just keep the start. + let mut end = max_bytes; + while end > 0 && !s.is_char_boundary(end) { + end -= 1; + } + return s[..end].to_string(); + } + + let available = max_bytes - TRIM_MARKER.len(); + // Bias toward the start (60%) where task framing typically lives, while + // still preserving ~40% of the tail where the user's actual ask often + // appears after a long paste. 
+ let mut start_len = available * 3 / 5; + while start_len > 0 && !s.is_char_boundary(start_len) { + start_len -= 1; + } + let end_len = available - start_len; + let mut end_start = s.len().saturating_sub(end_len); + while end_start < s.len() && !s.is_char_boundary(end_start) { + end_start += 1; + } + + let mut out = String::with_capacity(start_len + TRIM_MARKER.len() + (s.len() - end_start)); + out.push_str(&s[..start_len]); + out.push_str(TRIM_MARKER); + out.push_str(&s[end_start..]); + out +} + impl std::fmt::Debug for dyn OrchestratorModel { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "OrchestratorModel") @@ -777,6 +848,10 @@ If no routes are needed, return an empty list for `route`. #[test] fn test_conversation_trim_upto_user_message() { + // With max_token_length=230, the older user message "given the image + // In style of Andy Warhol" overflows the remaining budget and gets + // middle-trimmed (head + ellipsis + tail) until it fits. Newer turns + // are kept in full. let expected_prompt = r#" You are a helpful assistant that selects the most suitable routes based on user intent. You are provided with a list of available routes enclosed within XML tags: @@ -789,7 +864,7 @@ You are also given the conversation context enclosed within >, + >(orchestrations_str) + .unwrap(); + + let max_token_length = 2048; + let orchestrator = OrchestratorModelV1::new( + agent_orchestrations, + "test-model".to_string(), + max_token_length, + ); + + // ~500KB of content — same scale as the real payload that triggered + // the production upstream 400. 
+ let head = "HEAD_MARKER_START "; + let tail = " TAIL_MARKER_END"; + let filler = "A".repeat(500_000); + let huge_user_content = format!("{head}{filler}{tail}"); + + let conversation = vec![Message { + role: Role::User, + content: Some(MessageContent::Text(huge_user_content.clone())), + name: None, + tool_calls: None, + tool_call_id: None, + }]; + + let req = orchestrator.generate_request(&conversation, &None); + let prompt = req.messages[0].content.extract_text(); + + // Prompt must stay bounded. Generous ceiling = budget-in-bytes + + // scaffolding + slack. Real result should be well under this. + let byte_ceiling = max_token_length * TOKEN_LENGTH_DIVISOR + + ARCH_ORCHESTRATOR_V1_SYSTEM_PROMPT.len() + + 1024; + assert!( + prompt.len() < byte_ceiling, + "prompt length {} exceeded ceiling {} — truncation did not apply", + prompt.len(), + byte_ceiling, + ); + + // Not all 500k filler chars survive. + let a_count = prompt.chars().filter(|c| *c == 'A').count(); + assert!( + a_count < filler.len(), + "expected user message to be truncated; all {} 'A's survived", + a_count + ); + assert!( + a_count > 0, + "expected some of the user message to survive truncation" + ); + + // Head and tail of the message must both be preserved (that's the + // whole point of middle-trim over head-only). + assert!( + prompt.contains(head), + "head marker missing — head was not preserved" + ); + assert!( + prompt.contains(tail), + "tail marker missing — tail was not preserved" + ); + + // Trim marker must be present so the router model can see that + // content was omitted. + assert!( + prompt.contains(TRIM_MARKER), + "ellipsis trim marker missing from truncated prompt" + ); + + // Routing prompt scaffolding remains intact. 
+ assert!(prompt.contains("")); + assert!(prompt.contains("")); + } + + #[test] + fn test_turn_cap_limits_routing_history() { + // The outer turn-cap guardrail should keep only the last + // `MAX_ROUTING_TURNS` filtered messages regardless of how long the + // conversation is. We build a conversation with alternating + // user/assistant turns tagged with their index and verify that only + // the tail of the conversation makes it into the prompt. + let orchestrations_str = r#" + { + "gpt-4o": [ + {"name": "Image generation", "description": "generating image"} + ] + } + "#; + let agent_orchestrations = serde_json::from_str::< + HashMap>, + >(orchestrations_str) + .unwrap(); + + let orchestrator = + OrchestratorModelV1::new(agent_orchestrations, "test-model".to_string(), usize::MAX); + + let mut conversation: Vec = Vec::new(); + let total_turns = MAX_ROUTING_TURNS * 2; // well past the cap + for i in 0..total_turns { + let role = if i % 2 == 0 { + Role::User + } else { + Role::Assistant + }; + conversation.push(Message { + role, + content: Some(MessageContent::Text(format!("turn-{i:03}"))), + name: None, + tool_calls: None, + tool_call_id: None, + }); + } + + let req = orchestrator.generate_request(&conversation, &None); + let prompt = req.messages[0].content.extract_text(); + + // The last MAX_ROUTING_TURNS messages (indexes total-cap..total) + // must all appear. + for i in (total_turns - MAX_ROUTING_TURNS)..total_turns { + let tag = format!("turn-{i:03}"); + assert!( + prompt.contains(&tag), + "expected recent turn tag {tag} to be present" + ); + } + + // And earlier turns (indexes 0..total-cap) must all be dropped. + for i in 0..(total_turns - MAX_ROUTING_TURNS) { + let tag = format!("turn-{i:03}"); + assert!( + !prompt.contains(&tag), + "old turn tag {tag} leaked past turn cap into the prompt" + ); + } + } + + #[test] + fn test_trim_middle_utf8_helper() { + // No-op when already small enough. 
+ assert_eq!(trim_middle_utf8("hello", 100), "hello"); + assert_eq!(trim_middle_utf8("hello", 5), "hello"); + + // 60/40 split with ellipsis when too long. + let long = "a".repeat(20); + let out = trim_middle_utf8(&long, 10); + assert!(out.len() <= 10); + assert!(out.contains(TRIM_MARKER)); + // Exactly one ellipsis, rest are 'a's. + assert_eq!(out.matches(TRIM_MARKER).count(), 1); + assert!(out.chars().filter(|c| *c == 'a').count() > 0); + + // When max_bytes is smaller than the marker, falls back to + // head-only truncation (no marker). + let out = trim_middle_utf8("abcdefgh", 2); + assert_eq!(out, "ab"); + + // UTF-8 boundary safety: 2-byte chars. + let s = "é".repeat(50); // 100 bytes + let out = trim_middle_utf8(&s, 25); + assert!(out.len() <= 25); + // Must still be valid UTF-8 that only contains 'é' and the marker. + let ok = out.chars().all(|c| c == 'é' || c == '…'); + assert!(ok, "unexpected char in trimmed output: {out:?}"); + } + #[test] fn test_non_text_input() { let expected_prompt = r#" From 254d2b03bcbad838eeb12f70385e6f71d028fe35 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Fri, 17 Apr 2026 21:16:12 -0700 Subject: [PATCH 09/16] release: bump version to 0.4.20 (#897) --- .github/workflows/ci.yml | 4 ++-- apps/www/src/components/Hero.tsx | 2 +- build_filter_image.sh | 2 +- cli/planoai/__init__.py | 2 +- cli/planoai/consts.py | 2 +- cli/pyproject.toml | 2 +- cli/uv.lock | 2 +- demos/llm_routing/preference_based_routing/README.md | 2 +- docs/source/conf.py | 2 +- docs/source/get_started/quickstart.rst | 4 ++-- docs/source/resources/deployment.rst | 4 ++-- 11 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9e8d3223..3c5f9372 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -133,13 +133,13 @@ jobs: load: true tags: | ${{ env.PLANO_DOCKER_IMAGE }} - ${{ env.DOCKER_IMAGE }}:0.4.19 + ${{ env.DOCKER_IMAGE }}:0.4.20 ${{ env.DOCKER_IMAGE }}:latest cache-from: 
type=gha cache-to: type=gha,mode=max - name: Save image as artifact - run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.19 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar + run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.20 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar - name: Upload image artifact uses: actions/upload-artifact@v6 diff --git a/apps/www/src/components/Hero.tsx b/apps/www/src/components/Hero.tsx index 05e615b9..bf243ce9 100644 --- a/apps/www/src/components/Hero.tsx +++ b/apps/www/src/components/Hero.tsx @@ -24,7 +24,7 @@ export function Hero() { >
- v0.4.19 + v0.4.20 — diff --git a/build_filter_image.sh b/build_filter_image.sh index 73e51b61..2fbee244 100644 --- a/build_filter_image.sh +++ b/build_filter_image.sh @@ -1 +1 @@ -docker build -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.19 +docker build -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.20 diff --git a/cli/planoai/__init__.py b/cli/planoai/__init__.py index 2492d40c..3ae1c625 100644 --- a/cli/planoai/__init__.py +++ b/cli/planoai/__init__.py @@ -1,3 +1,3 @@ """Plano CLI - Intelligent Prompt Gateway.""" -__version__ = "0.4.19" +__version__ = "0.4.20" diff --git a/cli/planoai/consts.py b/cli/planoai/consts.py index af76d7cf..05c213e3 100644 --- a/cli/planoai/consts.py +++ b/cli/planoai/consts.py @@ -5,7 +5,7 @@ PLANO_COLOR = "#969FF4" SERVICE_NAME_ARCHGW = "plano" PLANO_DOCKER_NAME = "plano" -PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.19") +PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.20") DEFAULT_OTEL_TRACING_GRPC_ENDPOINT = "http://localhost:4317" # Native mode constants diff --git a/cli/pyproject.toml b/cli/pyproject.toml index 1864a915..da297d70 100644 --- a/cli/pyproject.toml +++ b/cli/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "planoai" -version = "0.4.19" +version = "0.4.20" description = "Python-based CLI tool to manage Plano." authors = [{name = "Katanemo Labs, Inc."}] readme = "README.md" diff --git a/cli/uv.lock b/cli/uv.lock index e8c85648..75105275 100644 --- a/cli/uv.lock +++ b/cli/uv.lock @@ -337,7 +337,7 @@ wheels = [ [[package]] name = "planoai" -version = "0.4.19" +version = "0.4.20" source = { editable = "." 
} dependencies = [ { name = "click" }, diff --git a/demos/llm_routing/preference_based_routing/README.md b/demos/llm_routing/preference_based_routing/README.md index f04fcf06..89ea00bb 100644 --- a/demos/llm_routing/preference_based_routing/README.md +++ b/demos/llm_routing/preference_based_routing/README.md @@ -3,7 +3,7 @@ This demo shows how you can use user preferences to route user prompts to approp ## How to start the demo -Make sure you have Plano CLI installed (`pip install planoai==0.4.19` or `uv tool install planoai==0.4.19`). +Make sure you have Plano CLI installed (`pip install planoai==0.4.20` or `uv tool install planoai==0.4.20`). ```bash cd demos/llm_routing/preference_based_routing diff --git a/docs/source/conf.py b/docs/source/conf.py index 401f6cff..a32e1383 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -17,7 +17,7 @@ from sphinxawesome_theme.postprocess import Icons project = "Plano Docs" copyright = "2026, Katanemo Labs, a DigitalOcean Company" author = "Katanemo Labs, Inc" -release = " v0.4.19" +release = " v0.4.20" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/docs/source/get_started/quickstart.rst b/docs/source/get_started/quickstart.rst index 50916eae..92abef23 100644 --- a/docs/source/get_started/quickstart.rst +++ b/docs/source/get_started/quickstart.rst @@ -43,7 +43,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins .. code-block:: console - $ uv tool install planoai==0.4.19 + $ uv tool install planoai==0.4.20 **Option 2: Install with pip (Traditional)** @@ -51,7 +51,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins $ python -m venv venv $ source venv/bin/activate # On Windows, use: venv\Scripts\activate - $ pip install planoai==0.4.19 + $ pip install planoai==0.4.20 .. 
_llm_routing_quickstart: diff --git a/docs/source/resources/deployment.rst b/docs/source/resources/deployment.rst index 1aab49c9..fd2a3c7e 100644 --- a/docs/source/resources/deployment.rst +++ b/docs/source/resources/deployment.rst @@ -65,7 +65,7 @@ Create a ``docker-compose.yml`` file with the following configuration: # docker-compose.yml services: plano: - image: katanemo/plano:0.4.19 + image: katanemo/plano:0.4.20 container_name: plano ports: - "10000:10000" # ingress (client -> plano) @@ -153,7 +153,7 @@ Create a ``plano-deployment.yaml``: spec: containers: - name: plano - image: katanemo/plano:0.4.19 + image: katanemo/plano:0.4.20 ports: - containerPort: 12000 # LLM gateway (chat completions, model routing) name: llm-gateway From e7464b817ab5d413ccdb4881b700ad95dde2e1a9 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Sat, 18 Apr 2026 15:57:34 -0700 Subject: [PATCH 10/16] fix(anthropic-stream): avoid bare/duplicate message_stop on OpenAI upstream (#898) --- .../anthropic_streaming_buffer.rs | 326 ++++++++++++++++-- 1 file changed, 289 insertions(+), 37 deletions(-) diff --git a/crates/hermesllm/src/apis/streaming_shapes/anthropic_streaming_buffer.rs b/crates/hermesllm/src/apis/streaming_shapes/anthropic_streaming_buffer.rs index eb9ec5b1..d3e3bbff 100644 --- a/crates/hermesllm/src/apis/streaming_shapes/anthropic_streaming_buffer.rs +++ b/crates/hermesllm/src/apis/streaming_shapes/anthropic_streaming_buffer.rs @@ -1,6 +1,9 @@ -use crate::apis::anthropic::MessagesStreamEvent; +use crate::apis::anthropic::{ + MessagesMessageDelta, MessagesStopReason, MessagesStreamEvent, MessagesUsage, +}; use crate::apis::streaming_shapes::sse::{SseEvent, SseStreamBufferTrait}; use crate::providers::streaming_response::ProviderStreamResponseType; +use log::warn; use std::collections::HashSet; /// SSE Stream Buffer for Anthropic Messages API streaming. 
@@ -11,13 +14,24 @@ use std::collections::HashSet; /// /// When converting from OpenAI to Anthropic format, this buffer injects the required /// ContentBlockStart and ContentBlockStop events to maintain proper Anthropic protocol. +/// +/// Guarantees (Anthropic Messages API contract): +/// 1. `message_stop` is never emitted unless a matching `message_start` was emitted first. +/// 2. `message_stop` is emitted at most once per stream (no double-close). +/// 3. If upstream terminates with no content (empty/filtered/errored response), a +/// minimal but well-formed envelope is synthesized so the client's state machine +/// stays consistent. pub struct AnthropicMessagesStreamBuffer { /// Buffered SSE events ready to be written to wire buffered_events: Vec, - /// Track if we've seen a message_start event + /// Track if we've emitted a message_start event message_started: bool, + /// Track if we've emitted a terminal message_stop event (for idempotency / + /// double-close protection). + message_stopped: bool, + /// Track content block indices that have received ContentBlockStart events content_block_start_indices: HashSet, @@ -42,6 +56,7 @@ impl AnthropicMessagesStreamBuffer { Self { buffered_events: Vec::new(), message_started: false, + message_stopped: false, content_block_start_indices: HashSet::new(), needs_content_block_stop: false, seen_message_delta: false, @@ -49,6 +64,66 @@ impl AnthropicMessagesStreamBuffer { } } + /// Inject a `message_start` event into the buffer if one hasn't been emitted yet. + /// This is the single source of truth for opening a message — every handler + /// that can legitimately be the first event on the wire must call this before + /// pushing its own event. 
+ fn ensure_message_started(&mut self) { + if self.message_started { + return; + } + let model = self.model.as_deref().unwrap_or("unknown"); + let message_start = AnthropicMessagesStreamBuffer::create_message_start_event(model); + self.buffered_events.push(message_start); + self.message_started = true; + } + + /// Inject a synthetic `message_delta` with `end_turn` / zero usage. + /// Used when we must close a message but upstream never produced a terminal + /// event (e.g. `[DONE]` arrives with no prior `finish_reason`). + fn push_synthetic_message_delta(&mut self) { + let event = MessagesStreamEvent::MessageDelta { + delta: MessagesMessageDelta { + stop_reason: MessagesStopReason::EndTurn, + stop_sequence: None, + }, + usage: MessagesUsage { + input_tokens: 0, + output_tokens: 0, + cache_creation_input_tokens: None, + cache_read_input_tokens: None, + }, + }; + let sse_string: String = event.clone().into(); + self.buffered_events.push(SseEvent { + data: None, + event: Some("message_delta".to_string()), + raw_line: sse_string.clone(), + sse_transformed_lines: sse_string, + provider_stream_response: Some(ProviderStreamResponseType::MessagesStreamEvent(event)), + }); + self.seen_message_delta = true; + } + + /// Inject a `message_stop` event into the buffer, marking the stream as closed. + /// Idempotent — subsequent calls are no-ops. 
+ fn push_message_stop(&mut self) { + if self.message_stopped { + return; + } + let message_stop = MessagesStreamEvent::MessageStop; + let sse_string: String = message_stop.into(); + self.buffered_events.push(SseEvent { + data: None, + event: Some("message_stop".to_string()), + raw_line: sse_string.clone(), + sse_transformed_lines: sse_string, + provider_stream_response: None, + }); + self.message_stopped = true; + self.seen_message_delta = false; + } + /// Check if a content_block_start event has been sent for the given index fn has_content_block_start_been_sent(&self, index: i32) -> bool { self.content_block_start_indices.contains(&index) @@ -149,6 +224,27 @@ impl SseStreamBufferTrait for AnthropicMessagesStreamBuffer { // We match on a reference first to determine the type, then move the event match &event.provider_stream_response { Some(ProviderStreamResponseType::MessagesStreamEvent(evt)) => { + // If the message has already been closed, drop any trailing events + // to avoid emitting data after `message_stop` (protocol violation). + // This typically indicates a duplicate `[DONE]` from upstream or a + // replay of previously-buffered bytes — worth surfacing so we can + // spot misbehaving providers. + if self.message_stopped { + warn!( + "anthropic stream buffer: dropping event after message_stop (variant={})", + match evt { + MessagesStreamEvent::MessageStart { .. } => "message_start", + MessagesStreamEvent::ContentBlockStart { .. } => "content_block_start", + MessagesStreamEvent::ContentBlockDelta { .. } => "content_block_delta", + MessagesStreamEvent::ContentBlockStop { .. } => "content_block_stop", + MessagesStreamEvent::MessageDelta { .. } => "message_delta", + MessagesStreamEvent::MessageStop => "message_stop", + MessagesStreamEvent::Ping => "ping", + } + ); + return; + } + match evt { MessagesStreamEvent::MessageStart { .. 
} => { // Add the message_start event @@ -157,14 +253,7 @@ impl SseStreamBufferTrait for AnthropicMessagesStreamBuffer { } MessagesStreamEvent::ContentBlockStart { index, .. } => { let index = *index as i32; - // Inject message_start if needed - if !self.message_started { - let model = self.model.as_deref().unwrap_or("unknown"); - let message_start = - AnthropicMessagesStreamBuffer::create_message_start_event(model); - self.buffered_events.push(message_start); - self.message_started = true; - } + self.ensure_message_started(); // Add the content_block_start event (from tool calls or other sources) self.buffered_events.push(event); @@ -173,14 +262,7 @@ impl SseStreamBufferTrait for AnthropicMessagesStreamBuffer { } MessagesStreamEvent::ContentBlockDelta { index, .. } => { let index = *index as i32; - // Inject message_start if needed - if !self.message_started { - let model = self.model.as_deref().unwrap_or("unknown"); - let message_start = - AnthropicMessagesStreamBuffer::create_message_start_event(model); - self.buffered_events.push(message_start); - self.message_started = true; - } + self.ensure_message_started(); // Check if ContentBlockStart was sent for this index if !self.has_content_block_start_been_sent(index) { @@ -196,6 +278,11 @@ impl SseStreamBufferTrait for AnthropicMessagesStreamBuffer { self.buffered_events.push(event); } MessagesStreamEvent::MessageDelta { usage, .. } => { + // `message_delta` is only meaningful inside an open message. + // Upstream can send it with no prior content (empty completion, + // content filter, etc.), so we must open a message first. + self.ensure_message_started(); + // Inject ContentBlockStop before message_delta if self.needs_content_block_stop { let content_block_stop = @@ -230,15 +317,52 @@ impl SseStreamBufferTrait for AnthropicMessagesStreamBuffer { } MessagesStreamEvent::ContentBlockStop { .. 
} => { // ContentBlockStop received from upstream (e.g., Bedrock) + self.ensure_message_started(); // Clear the flag so we don't inject another one self.needs_content_block_stop = false; self.buffered_events.push(event); } MessagesStreamEvent::MessageStop => { - // MessageStop received from upstream (e.g., OpenAI via [DONE]) - // Clear the flag so we don't inject another one - self.seen_message_delta = false; + // MessageStop received from upstream (e.g., OpenAI via [DONE]). + // + // The Anthropic protocol requires the full envelope + // message_start → [content blocks] → message_delta → message_stop + // so we must not emit a bare `message_stop`. Synthesize whatever + // is missing to keep the client's state machine consistent. + self.ensure_message_started(); + + if self.needs_content_block_stop { + let content_block_stop = + AnthropicMessagesStreamBuffer::create_content_block_stop_event(); + self.buffered_events.push(content_block_stop); + self.needs_content_block_stop = false; + } + + // If no message_delta has been emitted yet (empty/filtered upstream + // response), synthesize a minimal one carrying `end_turn`. + if !self.seen_message_delta { + // If we also never opened a content block, open and close one + // so clients that expect at least one block are happy. + if self.content_block_start_indices.is_empty() { + let content_block_start = + AnthropicMessagesStreamBuffer::create_content_block_start_event( + ); + self.buffered_events.push(content_block_start); + self.set_content_block_start_sent(0); + let content_block_stop = + AnthropicMessagesStreamBuffer::create_content_block_stop_event( + ); + self.buffered_events.push(content_block_stop); + } + self.push_synthetic_message_delta(); + } + + // Push the upstream-provided message_stop and mark closed. + // `push_message_stop` is idempotent but we want to reuse the + // original SseEvent so raw passthrough semantics are preserved. 
self.buffered_events.push(event); + self.message_stopped = true; + self.seen_message_delta = false; } _ => { // Other Anthropic event types (Ping, etc.), just accumulate @@ -254,24 +378,23 @@ impl SseStreamBufferTrait for AnthropicMessagesStreamBuffer { } fn to_bytes(&mut self) -> Vec { - // Convert all accumulated events to bytes and clear buffer + // Convert all accumulated events to bytes and clear buffer. + // // NOTE: We do NOT inject ContentBlockStop here because it's injected when we see MessageDelta // or MessageStop. Injecting it here causes premature ContentBlockStop in the middle of streaming. - - // Inject MessageStop after MessageDelta if we've seen one - // This completes the Anthropic Messages API event sequence - if self.seen_message_delta { - let message_stop = MessagesStreamEvent::MessageStop; - let sse_string: String = message_stop.into(); - let message_stop_event = SseEvent { - data: None, - event: Some("message_stop".to_string()), - raw_line: sse_string.clone(), - sse_transformed_lines: sse_string, - provider_stream_response: None, - }; - self.buffered_events.push(message_stop_event); - self.seen_message_delta = false; + // + // Inject a synthetic `message_stop` only when: + // 1. A `message_delta` has been seen (otherwise we'd violate the Anthropic + // protocol by emitting `message_stop` without a preceding `message_delta`), AND + // 2. We haven't already emitted `message_stop` (either synthetic from a + // previous flush, or real from an upstream `[DONE]`). + // + // Without the `!message_stopped` guard, a stream whose `finish_reason` chunk + // and `[DONE]` marker land in separate HTTP body chunks would receive two + // `message_stop` events, triggering Claude Code's "Received message_stop + // without a current message" error. 
+ if self.seen_message_delta && !self.message_stopped { + self.push_message_stop(); } let mut buffer = Vec::new(); @@ -615,4 +738,133 @@ data: [DONE]"#; println!("✓ Stop reason: tool_use"); println!("✓ Proper Anthropic tool_use protocol\n"); } + + /// Regression test for: + /// Claude Code CLI error: "Received message_stop without a current message" + /// + /// Reproduces the *double-close* scenario: OpenAI's final `finish_reason` + /// chunk and the `[DONE]` marker arrive in **separate** HTTP body chunks, so + /// `to_bytes()` is called between them. Before the fix, this produced two + /// `message_stop` events on the wire (one synthetic, one from `[DONE]`). + #[test] + fn test_openai_to_anthropic_emits_single_message_stop_across_chunk_boundary() { + let client_api = SupportedAPIsFromClient::AnthropicMessagesAPI(AnthropicApi::Messages); + let upstream_api = SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions); + let mut buffer = AnthropicMessagesStreamBuffer::new(); + + // --- HTTP chunk 1: content + finish_reason (no [DONE] yet) ----------- + let chunk_1 = r#"data: {"id":"c1","object":"chat.completion.chunk","created":1,"model":"gpt-4o","choices":[{"index":0,"delta":{"role":"assistant","content":"Hi"},"finish_reason":null}]} + +data: {"id":"c1","object":"chat.completion.chunk","created":1,"model":"gpt-4o","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}"#; + + for raw in SseStreamIter::try_from(chunk_1.as_bytes()).unwrap() { + let e = SseEvent::try_from((raw, &client_api, &upstream_api)).unwrap(); + buffer.add_transformed_event(e); + } + let out_1 = String::from_utf8(buffer.to_bytes()).unwrap(); + + // --- HTTP chunk 2: just the [DONE] marker ---------------------------- + let chunk_2 = "data: [DONE]"; + for raw in SseStreamIter::try_from(chunk_2.as_bytes()).unwrap() { + let e = SseEvent::try_from((raw, &client_api, &upstream_api)).unwrap(); + buffer.add_transformed_event(e); + } + let out_2 = 
String::from_utf8(buffer.to_bytes()).unwrap(); + + let combined = format!("{}{}", out_1, out_2); + let start_count = combined.matches("event: message_start").count(); + let stop_count = combined.matches("event: message_stop").count(); + + assert_eq!( + start_count, 1, + "Must emit exactly one message_start across chunks, got {start_count}. Output:\n{combined}" + ); + assert_eq!( + stop_count, 1, + "Must emit exactly one message_stop across chunks (no double-close), got {stop_count}. Output:\n{combined}" + ); + // Every message_stop must be preceded by a message_start earlier in the stream. + let start_pos = combined.find("event: message_start").unwrap(); + let stop_pos = combined.find("event: message_stop").unwrap(); + assert!( + start_pos < stop_pos, + "message_start must come before message_stop. Output:\n{combined}" + ); + } + + /// Regression test for: + /// "Received message_stop without a current message" on empty upstream responses. + /// + /// OpenAI returns only `[DONE]` with no content deltas and no `finish_reason` + /// (this happens with content filters, truncated upstream streams, and some + /// 5xx recoveries). Before the fix, the buffer emitted a bare `message_stop` + /// with no preceding `message_start`. After the fix, it synthesizes a + /// minimal but well-formed envelope. 
+ #[test] + fn test_openai_done_only_stream_synthesizes_valid_envelope() { + let client_api = SupportedAPIsFromClient::AnthropicMessagesAPI(AnthropicApi::Messages); + let upstream_api = SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions); + let mut buffer = AnthropicMessagesStreamBuffer::new(); + + let raw_input = "data: [DONE]"; + for raw in SseStreamIter::try_from(raw_input.as_bytes()).unwrap() { + let e = SseEvent::try_from((raw, &client_api, &upstream_api)).unwrap(); + buffer.add_transformed_event(e); + } + let out = String::from_utf8(buffer.to_bytes()).unwrap(); + + assert!( + out.contains("event: message_start"), + "Empty upstream must still produce message_start. Output:\n{out}" + ); + assert!( + out.contains("event: message_delta"), + "Empty upstream must produce a synthesized message_delta. Output:\n{out}" + ); + assert_eq!( + out.matches("event: message_stop").count(), + 1, + "Empty upstream must produce exactly one message_stop. Output:\n{out}" + ); + + // Protocol ordering: start < delta < stop. + let p_start = out.find("event: message_start").unwrap(); + let p_delta = out.find("event: message_delta").unwrap(); + let p_stop = out.find("event: message_stop").unwrap(); + assert!( + p_start < p_delta && p_delta < p_stop, + "Bad ordering. Output:\n{out}" + ); + } + + /// Regression test: events arriving after `message_stop` (e.g. a stray `[DONE]` + /// echo, or late-arriving deltas from a racing upstream) must be dropped + /// rather than written after the terminal frame. 
+ #[test] + fn test_events_after_message_stop_are_dropped() { + let client_api = SupportedAPIsFromClient::AnthropicMessagesAPI(AnthropicApi::Messages); + let upstream_api = SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions); + let mut buffer = AnthropicMessagesStreamBuffer::new(); + + let first = r#"data: {"id":"c1","object":"chat.completion.chunk","created":1,"model":"gpt-4o","choices":[{"index":0,"delta":{"content":"ok"},"finish_reason":"stop"}]} + +data: [DONE]"#; + for raw in SseStreamIter::try_from(first.as_bytes()).unwrap() { + let e = SseEvent::try_from((raw, &client_api, &upstream_api)).unwrap(); + buffer.add_transformed_event(e); + } + let _ = buffer.to_bytes(); + + // Simulate a duplicate / late `[DONE]` after the stream was already closed. + let late = "data: [DONE]"; + for raw in SseStreamIter::try_from(late.as_bytes()).unwrap() { + let e = SseEvent::try_from((raw, &client_api, &upstream_api)).unwrap(); + buffer.add_transformed_event(e); + } + let tail = String::from_utf8(buffer.to_bytes()).unwrap(); + assert!( + tail.is_empty(), + "No bytes should be emitted after message_stop, got: {tail:?}" + ); + } } From ffea891dbaa08303da7d2636040488324aa9b014 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Sat, 18 Apr 2026 16:24:02 -0700 Subject: [PATCH 11/16] fix: prevent index-out-of-bounds panic in signal analyzer follow-up (#896) --- crates/brightstaff/src/signals/analyzer.rs | 75 ++++++++++++++++++++-- 1 file changed, 70 insertions(+), 5 deletions(-) diff --git a/crates/brightstaff/src/signals/analyzer.rs b/crates/brightstaff/src/signals/analyzer.rs index 5ee3c7d9..8dffdd96 100644 --- a/crates/brightstaff/src/signals/analyzer.rs +++ b/crates/brightstaff/src/signals/analyzer.rs @@ -1250,7 +1250,7 @@ impl TextBasedSignalAnalyzer { let mut repair_phrases = Vec::new(); let mut user_turn_count = 0; - for (i, role, norm_msg) in normalized_messages { + for (pos, (i, role, norm_msg)) in normalized_messages.iter().enumerate() { if *role != 
Role::User { continue; } @@ -1274,10 +1274,13 @@ impl TextBasedSignalAnalyzer { } } - // Only check for semantic similarity if no pattern matched - if !found_in_turn && *i >= 2 { - // Find previous user message - for j in (0..*i).rev() { + // Only check for semantic similarity if no pattern matched. Walk + // backwards through the *normalized* list (not the original + // conversation indices, which may be non-contiguous because + // messages without extractable text are filtered out) to find the + // most recent prior user message. + if !found_in_turn && pos >= 1 { + for j in (0..pos).rev() { let (_, prev_role, prev_norm_msg) = &normalized_messages[j]; if *prev_role == Role::User { if self.is_similar_rephrase(norm_msg, prev_norm_msg) { @@ -2199,6 +2202,68 @@ mod tests { println!("test_follow_up_detection took: {:?}", start.elapsed()); } + #[test] + fn test_follow_up_does_not_panic_with_filtered_messages() { + // Regression test: the preprocessing pipeline filters out messages + // without extractable text (tool calls, tool results, empty content). + // The stored tuple index `i` is the ORIGINAL-conversation index, so + // once anything is filtered out, `i` no longer matches the position + // inside `normalized_messages`. The old code used `*i` to index into + // `normalized_messages`, which panicked with "index out of bounds" + // when a user message appeared after filtered entries. + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + Message { + role: Role::User, + content: Some(hermesllm::apis::openai::MessageContent::Text( + "first question".to_string(), + )), + name: None, + tool_calls: None, + tool_call_id: None, + }, + // Assistant message with no text content (e.g. tool call) — filtered out. + Message { + role: Role::Assistant, + content: None, + name: None, + tool_calls: None, + tool_call_id: None, + }, + // Tool-role message with no extractable text — filtered out. 
+ Message {
+ role: Role::Tool,
+ content: None,
+ name: None,
+ tool_calls: None,
+ tool_call_id: None,
+ },
+ Message {
+ role: Role::Assistant,
+ content: Some(hermesllm::apis::openai::MessageContent::Text(
+ "some answer".to_string(),
+ )),
+ name: None,
+ tool_calls: None,
+ tool_call_id: None,
+ },
+ // Rephrased user turn — original index 4, but after filtering
+ // only 2 messages precede it in `normalized_messages`.
+ Message {
+ role: Role::User,
+ content: Some(hermesllm::apis::openai::MessageContent::Text(
+ "first question please".to_string(),
+ )),
+ name: None,
+ tool_calls: None,
+ tool_call_id: None,
+ },
+ ];
+
+ // Must not panic — exercises the full analyze pipeline.
+ let _report = analyzer.analyze(&messages);
+ }
+
 #[test]
 fn test_frustration_detection() {
 let start = Instant::now();
From 78d8c90184088ad779f439ecb839592018b71d93 Mon Sep 17 00:00:00 2001
From: Adil Hafeez
Date: Sat, 18 Apr 2026 19:10:57 -0700
Subject: [PATCH 12/16] Add claude-opus-4-7 to anthropic provider models (#901)

---
 crates/hermesllm/src/bin/provider_models.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crates/hermesllm/src/bin/provider_models.yaml b/crates/hermesllm/src/bin/provider_models.yaml
index 22f69a7d..d07e265d 100644
--- a/crates/hermesllm/src/bin/provider_models.yaml
+++ b/crates/hermesllm/src/bin/provider_models.yaml
@@ -95,6 +95,7 @@ providers:
 anthropic:
 - anthropic/claude-sonnet-4-6
 - anthropic/claude-opus-4-6
+ - anthropic/claude-opus-4-7
 - anthropic/claude-opus-4-5-20251101
 - anthropic/claude-opus-4-5
 - anthropic/claude-haiku-4-5-20251001
From c3c213b2fdad44171c2b5ede026689201fee56cd Mon Sep 17 00:00:00 2001
From: Adil Hafeez
Date: Sat, 18 Apr 2026 21:20:34 -0700
Subject: [PATCH 13/16] Fix request closures during long-running streaming (#899)

---
 cli/planoai/utils.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/cli/planoai/utils.py b/cli/planoai/utils.py
index 8f73bf18..214fd0a3 100644
---
a/cli/planoai/utils.py +++ b/cli/planoai/utils.py @@ -91,7 +91,12 @@ def convert_legacy_listeners( "type": "model", "port": 12000, "address": "0.0.0.0", - "timeout": "30s", + # LLM streaming responses routinely exceed 30s (extended thinking, + # long tool reasoning, large completions). Match the 300s ceiling + # used by the direct upstream-provider routes so Envoy doesn't + # abort streams with UT mid-response. Users can override via their + # plano_config.yaml `listeners.timeout` field. + "timeout": "300s", "model_providers": model_providers or [], } @@ -100,7 +105,7 @@ def convert_legacy_listeners( "type": "prompt", "port": 10000, "address": "0.0.0.0", - "timeout": "30s", + "timeout": "300s", } # Handle None case From 98125406025a76c320a059eb5087eb75c9f66ac8 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Sat, 18 Apr 2026 21:21:15 -0700 Subject: [PATCH 14/16] Improve obs model name matching, latency metrics, and error reporting (#900) --- cli/planoai/obs/pricing.py | 70 ++++- cli/planoai/obs/render.py | 576 +++++++++++++++++++++++++++-------- cli/test/test_obs_pricing.py | 43 +++ cli/test/test_obs_render.py | 8 +- 4 files changed, 556 insertions(+), 141 deletions(-) diff --git a/cli/planoai/obs/pricing.py b/cli/planoai/obs/pricing.py index 19eb1297..6f2ce5b4 100644 --- a/cli/planoai/obs/pricing.py +++ b/cli/planoai/obs/pricing.py @@ -7,6 +7,7 @@ Single-source: one fetch at startup, cached for the life of the process. from __future__ import annotations import logging +import re import threading from dataclasses import dataclass from typing import Any @@ -123,13 +124,28 @@ class PricingCatalog: return round(cost, 6) +_DATE_SUFFIX_RE = re.compile(r"-\d{8}$") +_PROVIDER_PREFIXES = ("anthropic", "openai", "google", "meta", "cohere", "mistral") +_ANTHROPIC_FAMILIES = {"opus", "sonnet", "haiku"} + + def _model_key_candidates(model_name: str) -> list[str]: + """Lookup-side variants of a Plano-emitted model name. 
+ + Plano resolves names like ``claude-haiku-4-5-20251001``; the catalog stores + them as ``anthropic-claude-haiku-4.5``. We strip the date suffix and the + ``provider/`` prefix here; the catalog itself registers the dash/dot and + family-order aliases at parse time (see :func:`_expand_aliases`). + """ base = model_name.strip() out = [base] if "/" in base: out.append(base.split("/", 1)[1]) + for k in list(out): + stripped = _DATE_SUFFIX_RE.sub("", k) + if stripped != k: + out.append(stripped) out.extend([v.lower() for v in list(out)]) - # Dedup while preserving order. seen: set[str] = set() uniq = [] for key in out: @@ -139,6 +155,54 @@ def _model_key_candidates(model_name: str) -> list[str]: return uniq +def _expand_aliases(model_id: str) -> set[str]: + """Catalog-side variants of a DO model id. + + DO publishes Anthropic models under ids like ``anthropic-claude-opus-4.7`` + or ``anthropic-claude-4.6-sonnet`` while Plano emits ``claude-opus-4-7`` / + ``claude-sonnet-4-6``. Generate a set covering provider-prefix stripping, + dash↔dot in version segments, and family↔version word order so a single + catalog entry matches every name shape we'll see at lookup. + """ + aliases: set[str] = set() + + def add(name: str) -> None: + if not name: + return + aliases.add(name) + aliases.add(name.lower()) + + add(model_id) + + base = model_id + head, _, rest = base.partition("-") + if head.lower() in _PROVIDER_PREFIXES and rest: + add(rest) + base = rest + + for key in list(aliases): + if "." 
in key: + add(key.replace(".", "-")) + + parts = base.split("-") + if len(parts) >= 3 and parts[0].lower() == "claude": + rest_parts = parts[1:] + for i, p in enumerate(rest_parts): + if p.lower() in _ANTHROPIC_FAMILIES: + others = rest_parts[:i] + rest_parts[i + 1 :] + if not others: + break + family_last = "claude-" + "-".join(others) + "-" + p + family_first = "claude-" + p + "-" + "-".join(others) + add(family_last) + add(family_first) + add(family_last.replace(".", "-")) + add(family_first.replace(".", "-")) + break + + return aliases + + def _parse_do_pricing(data: Any) -> dict[str, ModelPrice]: """Parse DO catalog response into a ModelPrice map keyed by model id. @@ -204,11 +268,13 @@ def _parse_do_pricing(data: Any) -> dict[str, ModelPrice]: # rates for promo/open-weight models. if input_rate == 0 and output_rate == 0: continue - prices[str(model_id)] = ModelPrice( + price = ModelPrice( input_per_token_usd=input_rate, output_per_token_usd=output_rate, cached_input_per_token_usd=cached_rate, ) + for alias in _expand_aliases(str(model_id)): + prices.setdefault(alias, price) return prices diff --git a/cli/planoai/obs/render.py b/cli/planoai/obs/render.py index 602b8aed..e3583747 100644 --- a/cli/planoai/obs/render.py +++ b/cli/planoai/obs/render.py @@ -4,15 +4,18 @@ from __future__ import annotations from collections import Counter from dataclasses import dataclass -from datetime import datetime, timezone +from datetime import datetime +from http import HTTPStatus -from rich.box import SIMPLE -from rich.columns import Columns +from rich.align import Align +from rich.box import SIMPLE, SIMPLE_HEAVY from rich.console import Group from rich.panel import Panel from rich.table import Table from rich.text import Text +MAX_WIDTH = 160 + from planoai.obs.collector import LLMCall @@ -24,6 +27,16 @@ class AggregateStats: total_output_tokens: int distinct_sessions: int current_session: str | None + p50_latency_ms: float | None = None + p95_latency_ms: float | None = None 
+ p99_latency_ms: float | None = None + p50_ttft_ms: float | None = None + p95_ttft_ms: float | None = None + p99_ttft_ms: float | None = None + error_count: int = 0 + errors_4xx: int = 0 + errors_5xx: int = 0 + has_cost: bool = False @dataclass @@ -35,10 +48,16 @@ class ModelRollup: cache_write: int cache_read: int cost_usd: float + has_cost: bool = False + avg_tokens_per_sec: float | None = None -def _now() -> datetime: - return datetime.now(tz=timezone.utc).astimezone() +def _percentile(values: list[float], pct: float) -> float | None: + if not values: + return None + s = sorted(values) + k = max(0, min(len(s) - 1, int(round((pct / 100.0) * (len(s) - 1))))) + return s[k] def aggregates(calls: list[LLMCall]) -> AggregateStats: @@ -49,6 +68,15 @@ def aggregates(calls: list[LLMCall]) -> AggregateStats: current = next( (c.session_id for c in reversed(calls) if c.session_id is not None), None ) + durations = [c.duration_ms for c in calls if c.duration_ms is not None] + ttfts = [c.ttft_ms for c in calls if c.ttft_ms is not None] + errors_4xx = sum( + 1 for c in calls if c.status_code is not None and 400 <= c.status_code < 500 + ) + errors_5xx = sum( + 1 for c in calls if c.status_code is not None and c.status_code >= 500 + ) + has_cost = any(c.cost_usd is not None for c in calls) return AggregateStats( count=len(calls), total_cost_usd=total_cost, @@ -56,11 +84,22 @@ def aggregates(calls: list[LLMCall]) -> AggregateStats: total_output_tokens=total_output, distinct_sessions=len(session_ids), current_session=current, + p50_latency_ms=_percentile(durations, 50), + p95_latency_ms=_percentile(durations, 95), + p99_latency_ms=_percentile(durations, 99), + p50_ttft_ms=_percentile(ttfts, 50), + p95_ttft_ms=_percentile(ttfts, 95), + p99_ttft_ms=_percentile(ttfts, 99), + error_count=errors_4xx + errors_5xx, + errors_4xx=errors_4xx, + errors_5xx=errors_5xx, + has_cost=has_cost, ) def model_rollups(calls: list[LLMCall]) -> list[ModelRollup]: - buckets: dict[str, dict[str, float | 
int]] = {} + buckets: dict[str, dict[str, float | int | bool]] = {} + tps_samples: dict[str, list[float]] = {} for c in calls: key = c.model b = buckets.setdefault( @@ -72,6 +111,7 @@ def model_rollups(calls: list[LLMCall]) -> list[ModelRollup]: "cache_write": 0, "cache_read": 0, "cost": 0.0, + "has_cost": False, }, ) b["requests"] = int(b["requests"]) + 1 @@ -80,9 +120,16 @@ def model_rollups(calls: list[LLMCall]) -> list[ModelRollup]: b["cache_write"] = int(b["cache_write"]) + int(c.cache_creation_tokens or 0) b["cache_read"] = int(b["cache_read"]) + int(c.cached_input_tokens or 0) b["cost"] = float(b["cost"]) + (c.cost_usd or 0.0) + if c.cost_usd is not None: + b["has_cost"] = True + tps = c.tokens_per_sec + if tps is not None: + tps_samples.setdefault(key, []).append(tps) rollups: list[ModelRollup] = [] for model, b in buckets.items(): + samples = tps_samples.get(model) + avg_tps = (sum(samples) / len(samples)) if samples else None rollups.append( ModelRollup( model=model, @@ -92,34 +139,62 @@ def model_rollups(calls: list[LLMCall]) -> list[ModelRollup]: cache_write=int(b["cache_write"]), cache_read=int(b["cache_read"]), cost_usd=float(b["cost"]), + has_cost=bool(b["has_cost"]), + avg_tokens_per_sec=avg_tps, ) ) - rollups.sort(key=lambda r: r.cost_usd, reverse=True) + rollups.sort(key=lambda r: (r.cost_usd, r.requests), reverse=True) return rollups -def route_hits(calls: list[LLMCall]) -> list[tuple[str, int, float]]: +@dataclass +class RouteHit: + route: str + hits: int + pct: float + p95_latency_ms: float | None + error_count: int + + +def route_hits(calls: list[LLMCall]) -> list[RouteHit]: counts: Counter[str] = Counter() + per_route_latency: dict[str, list[float]] = {} + per_route_errors: dict[str, int] = {} for c in calls: - if c.route_name: - counts[c.route_name] += 1 + if not c.route_name: + continue + counts[c.route_name] += 1 + if c.duration_ms is not None: + per_route_latency.setdefault(c.route_name, []).append(c.duration_ms) + if c.status_code is not 
None and c.status_code >= 400: + per_route_errors[c.route_name] = per_route_errors.get(c.route_name, 0) + 1 total = sum(counts.values()) if total == 0: return [] - return [(r, n, (n / total) * 100.0) for r, n in counts.most_common()] + return [ + RouteHit( + route=r, + hits=n, + pct=(n / total) * 100.0, + p95_latency_ms=_percentile(per_route_latency.get(r, []), 95), + error_count=per_route_errors.get(r, 0), + ) + for r, n in counts.most_common() + ] -def _fmt_cost(v: float | None) -> str: +def _fmt_cost(v: float | None, *, zero: str = "—") -> str: if v is None: return "—" if v == 0: - return "$0" - # Adaptive precision so tiny costs ($3.8e-5) remain readable. + return zero if abs(v) < 0.0001: return f"${v:.8f}".rstrip("0").rstrip(".") if abs(v) < 0.01: return f"${v:.6f}".rstrip("0").rstrip(".") - return f"${v:.4f}" + if abs(v) < 1: + return f"${v:.4f}" + return f"${v:,.2f}" def _fmt_ms(v: float | None) -> str: @@ -142,187 +217,418 @@ def _fmt_tokens(v: int | None) -> str: return f"{v:,}" -def _request_panel(last: LLMCall | None) -> Panel: +def _fmt_tps(v: float | None) -> str: + if v is None or v <= 0: + return "—" + if v >= 100: + return f"{v:.0f}/s" + return f"{v:.1f}/s" + + +def _latency_style(v: float | None) -> str: + if v is None: + return "dim" + if v < 500: + return "green" + if v < 2000: + return "yellow" + return "red" + + +def _ttft_style(v: float | None) -> str: + if v is None: + return "dim" + if v < 300: + return "green" + if v < 1000: + return "yellow" + return "red" + + +def _truncate_model(name: str, limit: int = 32) -> str: + if len(name) <= limit: + return name + return name[: limit - 1] + "…" + + +def _status_text(code: int | None) -> Text: + if code is None: + return Text("—", style="dim") + if 200 <= code < 300: + return Text("● ok", style="green") + if 300 <= code < 400: + return Text(f"● {code}", style="yellow") + if 400 <= code < 500: + return Text(f"● {code}", style="yellow bold") + return Text(f"● {code}", style="red bold") + + +def 
_summary_panel(last: LLMCall | None, stats: AggregateStats) -> Panel: + # Content-sized columns with a fixed gutter keep the two blocks close + # together instead of stretching across the full terminal on wide screens. + grid = Table.grid(padding=(0, 4)) + grid.add_column(no_wrap=True) + grid.add_column(no_wrap=True) + + # Left: latest request snapshot. + left = Table.grid(padding=(0, 1)) + left.add_column(style="dim", no_wrap=True) + left.add_column(no_wrap=True) if last is None: - body = Text("no requests yet", style="dim") + left.add_row("latest", Text("waiting for spans…", style="dim italic")) else: - t = Table.grid(padding=(0, 1)) - t.add_column(style="bold cyan") - t.add_column() - t.add_row("Endpoint", "chat/completions") - status = "—" if last.status_code is None else str(last.status_code) - t.add_row("Status", status) - t.add_row("Model", last.model) + model_text = Text(_truncate_model(last.model, 48), style="bold cyan") + if last.is_streaming: + model_text.append(" ⟳ stream", style="dim") + left.add_row("model", model_text) if last.request_model and last.request_model != last.model: - t.add_row("Req model", last.request_model) + left.add_row( + "requested", Text(_truncate_model(last.request_model, 48), style="cyan") + ) if last.route_name: - t.add_row("Route", last.route_name) - body = t - return Panel(body, title="[bold]Request[/]", border_style="cyan", box=SIMPLE) - - -def _cost_panel(last: LLMCall | None) -> Panel: - if last is None: - body = Text("—", style="dim") - else: - t = Table.grid(padding=(0, 1)) - t.add_column(style="bold green") - t.add_column() - t.add_row("Request", _fmt_cost(last.cost_usd)) - t.add_row("Input", _fmt_tokens(last.prompt_tokens)) - t.add_row("Output", _fmt_tokens(last.completion_tokens)) + left.add_row("route", Text(last.route_name, style="yellow")) + left.add_row("status", _status_text(last.status_code)) + tokens = Text() + tokens.append(_fmt_tokens(last.prompt_tokens)) + tokens.append(" in", style="dim") + tokens.append(" 
· ", style="dim") + tokens.append(_fmt_tokens(last.completion_tokens), style="green") + tokens.append(" out", style="dim") if last.cached_input_tokens: - t.add_row("Cached", _fmt_tokens(last.cached_input_tokens)) - body = t - return Panel(body, title="[bold]Cost[/]", border_style="green", box=SIMPLE) + tokens.append(" · ", style="dim") + tokens.append(_fmt_tokens(last.cached_input_tokens), style="yellow") + tokens.append(" cached", style="dim") + left.add_row("tokens", tokens) + timing = Text() + timing.append("TTFT ", style="dim") + timing.append(_fmt_ms(last.ttft_ms), style=_ttft_style(last.ttft_ms)) + timing.append(" · ", style="dim") + timing.append("lat ", style="dim") + timing.append(_fmt_ms(last.duration_ms), style=_latency_style(last.duration_ms)) + tps = last.tokens_per_sec + if tps: + timing.append(" · ", style="dim") + timing.append(_fmt_tps(tps), style="green") + left.add_row("timing", timing) + left.add_row("cost", Text(_fmt_cost(last.cost_usd), style="green bold")) + # Right: lifetime totals. 
+ right = Table.grid(padding=(0, 1)) + right.add_column(style="dim", no_wrap=True) + right.add_column(no_wrap=True) + right.add_row( + "requests", + Text(f"{stats.count:,}", style="bold"), + ) + if stats.error_count: + err_text = Text() + err_text.append(f"{stats.error_count:,}", style="red bold") + parts: list[str] = [] + if stats.errors_4xx: + parts.append(f"{stats.errors_4xx} 4xx") + if stats.errors_5xx: + parts.append(f"{stats.errors_5xx} 5xx") + if parts: + err_text.append(f" ({' · '.join(parts)})", style="dim") + right.add_row("errors", err_text) + cost_str = _fmt_cost(stats.total_cost_usd) if stats.has_cost else "—" + right.add_row("total cost", Text(cost_str, style="green bold")) + tokens_total = Text() + tokens_total.append(_fmt_tokens(stats.total_input_tokens)) + tokens_total.append(" in", style="dim") + tokens_total.append(" · ", style="dim") + tokens_total.append(_fmt_tokens(stats.total_output_tokens), style="green") + tokens_total.append(" out", style="dim") + right.add_row("tokens", tokens_total) + lat_text = Text() + lat_text.append("p50 ", style="dim") + lat_text.append( + _fmt_ms(stats.p50_latency_ms), style=_latency_style(stats.p50_latency_ms) + ) + lat_text.append(" · ", style="dim") + lat_text.append("p95 ", style="dim") + lat_text.append( + _fmt_ms(stats.p95_latency_ms), style=_latency_style(stats.p95_latency_ms) + ) + lat_text.append(" · ", style="dim") + lat_text.append("p99 ", style="dim") + lat_text.append( + _fmt_ms(stats.p99_latency_ms), style=_latency_style(stats.p99_latency_ms) + ) + right.add_row("latency", lat_text) + ttft_text = Text() + ttft_text.append("p50 ", style="dim") + ttft_text.append(_fmt_ms(stats.p50_ttft_ms), style=_ttft_style(stats.p50_ttft_ms)) + ttft_text.append(" · ", style="dim") + ttft_text.append("p95 ", style="dim") + ttft_text.append(_fmt_ms(stats.p95_ttft_ms), style=_ttft_style(stats.p95_ttft_ms)) + ttft_text.append(" · ", style="dim") + ttft_text.append("p99 ", style="dim") + 
ttft_text.append(_fmt_ms(stats.p99_ttft_ms), style=_ttft_style(stats.p99_ttft_ms)) + right.add_row("TTFT", ttft_text) + sess = Text() + sess.append(f"{stats.distinct_sessions}") + if stats.current_session: + sess.append(" · current ", style="dim") + sess.append(stats.current_session, style="magenta") + right.add_row("sessions", sess) -def _totals_panel(stats: AggregateStats) -> Panel: - t = Table.grid(padding=(0, 1)) - t.add_column(style="bold magenta") - t.add_column() - t.add_column(style="bold magenta") - t.add_column() - t.add_row( - "Total cost", - _fmt_cost(stats.total_cost_usd), - "Requests", - str(stats.count), + grid.add_row(left, right) + return Panel( + grid, + title="[bold]live LLM traffic[/]", + border_style="cyan", + box=SIMPLE_HEAVY, + padding=(0, 1), ) - t.add_row( - "Input", - _fmt_tokens(stats.total_input_tokens), - "Output", - _fmt_tokens(stats.total_output_tokens), - ) - t.add_row( - "Sessions", - str(stats.distinct_sessions), - "Current session", - stats.current_session or "—", - ) - return Panel(t, title="[bold]Totals[/]", border_style="magenta", box=SIMPLE) def _model_rollup_table(rollups: list[ModelRollup]) -> Table: table = Table( - title="Totals by model", + title="by model", + title_justify="left", + title_style="bold dim", + caption="cost via DigitalOcean Gradient catalog", + caption_justify="left", + caption_style="dim italic", box=SIMPLE, header_style="bold", - expand=True, + pad_edge=False, + padding=(0, 1), ) - table.add_column("Model", style="cyan") - table.add_column("Req", justify="right") - table.add_column("Input", justify="right") - table.add_column("Output", justify="right", style="green") - table.add_column("Cache write", justify="right", style="yellow") - table.add_column("Cache read", justify="right", style="yellow") - table.add_column("Cost", justify="right", style="green") + table.add_column("model", style="cyan", no_wrap=True) + table.add_column("req", justify="right") + table.add_column("input", justify="right") + 
table.add_column("output", justify="right", style="green") + table.add_column("cache wr", justify="right", style="yellow") + table.add_column("cache rd", justify="right", style="yellow") + table.add_column("tok/s", justify="right") + table.add_column("cost", justify="right", style="green") if not rollups: - table.add_row("—", "—", "—", "—", "—", "—", "—") - for r in rollups: table.add_row( - r.model, - str(r.requests), + Text("no requests yet", style="dim italic"), + *(["—"] * 7), + ) + return table + for r in rollups: + cost_cell = _fmt_cost(r.cost_usd) if r.has_cost else "—" + table.add_row( + _truncate_model(r.model), + f"{r.requests:,}", _fmt_tokens(r.input_tokens), _fmt_tokens(r.output_tokens), _fmt_int(r.cache_write), _fmt_int(r.cache_read), - _fmt_cost(r.cost_usd), + _fmt_tps(r.avg_tokens_per_sec), + cost_cell, ) return table -def _route_hit_table(hits: list[tuple[str, int, float]]) -> Table: +def _route_hit_table(hits: list[RouteHit]) -> Table: table = Table( - title="Route hit %", + title="route share", + title_justify="left", + title_style="bold dim", box=SIMPLE, header_style="bold", - expand=True, + pad_edge=False, + padding=(0, 1), ) - table.add_column("Route", style="cyan") - table.add_column("Hits", justify="right") + table.add_column("route", style="cyan") + table.add_column("hits", justify="right") table.add_column("%", justify="right") - for route, n, pct in hits: - table.add_row(route, str(n), f"{pct:.1f}") + table.add_column("p95", justify="right") + table.add_column("err", justify="right") + for h in hits: + err_cell = ( + Text(f"{h.error_count:,}", style="red bold") if h.error_count else "—" + ) + table.add_row( + h.route, + f"{h.hits:,}", + f"{h.pct:5.1f}%", + Text(_fmt_ms(h.p95_latency_ms), style=_latency_style(h.p95_latency_ms)), + err_cell, + ) return table def _recent_table(calls: list[LLMCall], limit: int = 15) -> Table: show_route = any(c.route_name for c in calls) + show_cache = any((c.cached_input_tokens or 0) > 0 for c in calls) + 
show_rsn = any((c.reasoning_tokens or 0) > 0 for c in calls) + + caption_parts = ["in·new = fresh prompt tokens"] + if show_cache: + caption_parts.append("in·cache = cached read") + if show_rsn: + caption_parts.append("rsn = reasoning") + caption_parts.append("lat = total latency") + table = Table( - title="Recent requests", + title=f"recent · last {min(limit, len(calls)) if calls else 0}", + title_justify="left", + title_style="bold dim", + caption=" · ".join(caption_parts), + caption_justify="left", + caption_style="dim italic", box=SIMPLE, header_style="bold", - expand=True, + pad_edge=False, + padding=(0, 1), ) - table.add_column("time") - table.add_column("model", style="cyan") + table.add_column("time", no_wrap=True) + table.add_column("model", style="cyan", no_wrap=True) if show_route: - table.add_column("route", style="yellow") - table.add_column("in", justify="right") - table.add_column("cache", justify="right", style="yellow") + table.add_column("route", style="yellow", no_wrap=True) + table.add_column("in·new", justify="right") + if show_cache: + table.add_column("in·cache", justify="right", style="yellow") table.add_column("out", justify="right", style="green") - table.add_column("rsn", justify="right") - table.add_column("cost", justify="right", style="green") + if show_rsn: + table.add_column("rsn", justify="right") + table.add_column("tok/s", justify="right") table.add_column("TTFT", justify="right") table.add_column("lat", justify="right") - table.add_column("st") + table.add_column("cost", justify="right", style="green") + table.add_column("status") + + if not calls: + cols = len(table.columns) + table.add_row( + Text("waiting for spans…", style="dim italic"), + *(["—"] * (cols - 1)), + ) + return table recent = list(reversed(calls))[:limit] - for c in recent: - status_cell = ( - "ok" - if c.status_code and 200 <= c.status_code < 400 - else str(c.status_code or "—") - ) - row = [ - c.timestamp.strftime("%H:%M:%S"), - c.model, + for idx, c in 
enumerate(recent): + is_newest = idx == 0 + time_style = "bold white" if is_newest else None + model_style = "bold cyan" if is_newest else "cyan" + row: list[object] = [ + ( + Text(c.timestamp.strftime("%H:%M:%S"), style=time_style) + if time_style + else c.timestamp.strftime("%H:%M:%S") + ), + Text(_truncate_model(c.model), style=model_style), ] if show_route: row.append(c.route_name or "—") + row.append(_fmt_tokens(c.prompt_tokens)) + if show_cache: + row.append(_fmt_int(c.cached_input_tokens)) + row.append(_fmt_tokens(c.completion_tokens)) + if show_rsn: + row.append(_fmt_int(c.reasoning_tokens)) row.extend( [ - _fmt_tokens(c.prompt_tokens), - _fmt_int(c.cached_input_tokens), - _fmt_tokens(c.completion_tokens), - _fmt_int(c.reasoning_tokens), + _fmt_tps(c.tokens_per_sec), + Text(_fmt_ms(c.ttft_ms), style=_ttft_style(c.ttft_ms)), + Text(_fmt_ms(c.duration_ms), style=_latency_style(c.duration_ms)), _fmt_cost(c.cost_usd), - _fmt_ms(c.ttft_ms), - _fmt_ms(c.duration_ms), - status_cell, + _status_text(c.status_code), ] ) table.add_row(*row) - if not recent: - table.add_row(*(["no requests yet"] + ["—"] * (10 if show_route else 9))) return table -def render(calls: list[LLMCall]) -> Group: +def _last_error(calls: list[LLMCall]) -> LLMCall | None: + for c in reversed(calls): + if c.status_code is not None and c.status_code >= 400: + return c + return None + + +def _http_reason(code: int) -> str: + try: + return HTTPStatus(code).phrase + except ValueError: + return "" + + +def _fmt_ago(ts: datetime) -> str: + # `ts` is produced in collector.py via datetime.now(tz=...), but fall back + # gracefully if a naive timestamp ever sneaks in. 
+ now = datetime.now(tz=ts.tzinfo) if ts.tzinfo else datetime.now() + delta = (now - ts).total_seconds() + if delta < 0: + delta = 0 + if delta < 60: + return f"{int(delta)}s ago" + if delta < 3600: + return f"{int(delta // 60)}m ago" + return f"{int(delta // 3600)}h ago" + + +def _error_banner(call: LLMCall) -> Panel: + code = call.status_code or 0 + border = "red" if code >= 500 else "yellow" + header = Text() + header.append(f"● {code}", style=f"{border} bold") + reason = _http_reason(code) + if reason: + header.append(f" {reason}", style=border) + header.append(" · ", style="dim") + header.append(_truncate_model(call.model, 48), style="cyan") + if call.route_name: + header.append(" · ", style="dim") + header.append(call.route_name, style="yellow") + header.append(" · ", style="dim") + header.append(_fmt_ago(call.timestamp), style="dim") + if call.request_id: + header.append(" · req ", style="dim") + header.append(call.request_id, style="magenta") + return Panel( + header, + title="[bold]last error[/]", + title_align="left", + border_style=border, + box=SIMPLE, + padding=(0, 1), + ) + + +def _footer(stats: AggregateStats) -> Text: + waiting = stats.count == 0 + text = Text() + text.append("Ctrl-C ", style="bold") + text.append("exit", style="dim") + text.append(" · OTLP :4317", style="dim") + text.append(" · pricing: DigitalOcean ", style="dim") + if waiting: + text.append("waiting for spans", style="yellow") + text.append( + " — set tracing.opentracing_grpc_endpoint=localhost:4317", style="dim" + ) + else: + text.append(f"receiving · {stats.count:,} call(s) buffered", style="green") + return text + + +def render(calls: list[LLMCall]) -> Align: last = calls[-1] if calls else None stats = aggregates(calls) rollups = model_rollups(calls) hits = route_hits(calls) - header = Columns( - [_request_panel(last), _cost_panel(last), _totals_panel(stats)], - expand=True, - equal=True, - ) - parts = [ - header, - _model_rollup_table(rollups), - ] + parts: list[object] = 
[_summary_panel(last, stats)] + err = _last_error(calls) + if err is not None: + parts.append(_error_banner(err)) if hits: - parts.append(_route_hit_table(hits)) + split = Table.grid(padding=(0, 2)) + split.add_column(no_wrap=False) + split.add_column(no_wrap=False) + split.add_row(_model_rollup_table(rollups), _route_hit_table(hits)) + parts.append(split) + else: + parts.append(_model_rollup_table(rollups)) parts.append(_recent_table(calls)) - parts.append( - Text( - "q quit · c clear · waiting for spans on OTLP :4317 — brightstaff needs " - "tracing.opentracing_grpc_endpoint=localhost:4317", - style="dim", - ) - ) - return Group(*parts) + parts.append(_footer(stats)) + # Cap overall width so wide terminals don't stretch the layout into a + # mostly-whitespace gap between columns. + return Align.left(Group(*parts), width=MAX_WIDTH) diff --git a/cli/test/test_obs_pricing.py b/cli/test/test_obs_pricing.py index 95f9a2da..02247d3d 100644 --- a/cli/test/test_obs_pricing.py +++ b/cli/test/test_obs_pricing.py @@ -83,6 +83,49 @@ def test_parse_do_catalog_treats_small_values_as_per_token(): assert prices["openai-gpt-oss-120b"].input_per_token_usd == 1e-7 +def test_anthropic_aliases_match_plano_emitted_names(): + """DO publishes 'anthropic-claude-opus-4.7' and 'anthropic-claude-haiku-4.5'; + Plano emits 'claude-opus-4-7' and 'claude-haiku-4-5-20251001'. 
Aliases + registered at parse time should bridge the gap.""" + from planoai.obs.pricing import _parse_do_pricing + + sample = { + "data": [ + { + "model_id": "anthropic-claude-opus-4.7", + "pricing": { + "input_price_per_million": 15.0, + "output_price_per_million": 75.0, + }, + }, + { + "model_id": "anthropic-claude-haiku-4.5", + "pricing": { + "input_price_per_million": 1.0, + "output_price_per_million": 5.0, + }, + }, + { + "model_id": "anthropic-claude-4.6-sonnet", + "pricing": { + "input_price_per_million": 3.0, + "output_price_per_million": 15.0, + }, + }, + ] + } + catalog = PricingCatalog(_parse_do_pricing(sample)) + # Family-last shapes Plano emits. + assert catalog.price_for("claude-opus-4-7") is not None + assert catalog.price_for("claude-haiku-4-5") is not None + # Date-suffixed name (Anthropic API style). + assert catalog.price_for("claude-haiku-4-5-20251001") is not None + # Word-order swap: DO has 'claude-4.6-sonnet', Plano emits 'claude-sonnet-4-6'. + assert catalog.price_for("claude-sonnet-4-6") is not None + # Original DO ids still resolve. + assert catalog.price_for("anthropic-claude-opus-4.7") is not None + + def test_parse_do_catalog_divides_large_values_as_per_million(): """A provider that genuinely reports $5-per-million in that field gets divided.""" from planoai.obs.pricing import _parse_do_pricing diff --git a/cli/test/test_obs_render.py b/cli/test/test_obs_render.py index 11f4a1fc..dd598363 100644 --- a/cli/test/test_obs_render.py +++ b/cli/test/test_obs_render.py @@ -94,10 +94,10 @@ def test_route_hits_only_for_routed_calls(): ] hits = route_hits(calls) # Only calls with route names are counted. 
- assert sum(n for _, n, _ in hits) == 3 - hits_by_name = {name: (n, pct) for name, n, pct in hits} - assert hits_by_name["code"][0] == 2 - assert hits_by_name["summarization"][0] == 1 + assert sum(h.hits for h in hits) == 3 + hits_by_name = {h.route: h for h in hits} + assert hits_by_name["code"].hits == 2 + assert hits_by_name["summarization"].hits == 1 def test_route_hits_empty_when_no_routes(): From 22f332f62dfc1c642140b0e66f12251fadd186fb Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Wed, 22 Apr 2026 11:19:10 -0700 Subject: [PATCH 15/16] Add Prometheus metrics endpoint and Grafana dashboard for brightstaff (#904) --- config/grafana/brightstaff_dashboard.json | 541 ++++++++++++++++++ config/grafana/docker-compose.yaml | 43 ++ config/grafana/prometheus_scrape.yaml | 44 ++ .../provisioning/dashboards/brightstaff.yaml | 15 + .../provisioning/datasources/prometheus.yaml | 14 + crates/Cargo.lock | 332 ++++++++++- crates/brightstaff/Cargo.toml | 3 + crates/brightstaff/src/handlers/llm/mod.rs | 25 +- .../src/handlers/llm/model_selection.rs | 31 +- .../src/handlers/routing_service.rs | 14 + crates/brightstaff/src/lib.rs | 1 + crates/brightstaff/src/main.rs | 72 +++ crates/brightstaff/src/metrics/labels.rs | 38 ++ crates/brightstaff/src/metrics/mod.rs | 377 ++++++++++++ crates/brightstaff/src/router/orchestrator.rs | 11 +- crates/brightstaff/src/streaming.rs | 76 ++- .../guides/observability/monitoring.rst | 51 ++ 17 files changed, 1682 insertions(+), 6 deletions(-) create mode 100644 config/grafana/brightstaff_dashboard.json create mode 100644 config/grafana/docker-compose.yaml create mode 100644 config/grafana/prometheus_scrape.yaml create mode 100644 config/grafana/provisioning/dashboards/brightstaff.yaml create mode 100644 config/grafana/provisioning/datasources/prometheus.yaml create mode 100644 crates/brightstaff/src/metrics/labels.rs create mode 100644 crates/brightstaff/src/metrics/mod.rs diff --git a/config/grafana/brightstaff_dashboard.json 
b/config/grafana/brightstaff_dashboard.json new file mode 100644 index 00000000..4b54721f --- /dev/null +++ b/config/grafana/brightstaff_dashboard.json @@ -0,0 +1,541 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "RED, LLM upstream, routing service, and process metrics for brightstaff. Pair with Envoy admin metrics from cluster=bright_staff.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "panels": [], + "title": "HTTP RED", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "req/s", + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 }, + "id": 1, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (handler) (rate(brightstaff_http_requests_total[1m]))", + "legendFormat": "{{handler}}", + "refId": "A" + } + ], + "title": "Rate — brightstaff RPS by handler", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "5xx fraction over 5m. 
Page-worthy when sustained above ~1%.", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.01 }, + { "color": "red", "value": 0.05 } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum(rate(brightstaff_http_requests_total{status_class=\"5xx\"}[5m])) / clamp_min(sum(rate(brightstaff_http_requests_total[5m])), 1)", + "legendFormat": "5xx rate", + "refId": "A" + } + ], + "title": "Errors — brightstaff 5xx rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "p50/p95/p99 by handler, computed from histogram buckets over 5m.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" }, + "unit": "s" + } + }, + "gridPos": { "h": 9, "w": 24, "x": 0, "y": 9 }, + "id": 3, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.50, sum by (le, handler) (rate(brightstaff_http_request_duration_seconds_bucket[5m])))", + "legendFormat": "p50 {{handler}}", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.95, sum by (le, handler) (rate(brightstaff_http_request_duration_seconds_bucket[5m])))", + "legendFormat": "p95 {{handler}}", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", 
"uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.99, sum by (le, handler) (rate(brightstaff_http_request_duration_seconds_bucket[5m])))", + "legendFormat": "p99 {{handler}}", + "refId": "C" + } + ], + "title": "Duration — p50 / p95 / p99 by handler", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "In-flight requests by handler. Climbs before latency does when brightstaff is saturated.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" }, + "unit": "short" + } + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 18 }, + "id": 4, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (handler) (brightstaff_http_in_flight_requests)", + "legendFormat": "{{handler}}", + "refId": "A" + } + ], + "title": "In-flight requests by handler", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }, + "id": 200, + "panels": [], + "title": "LLM upstream", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" }, + "unit": "s" + } + }, + "gridPos": { "h": 9, "w": 12, "x": 0, "y": 27 }, + "id": 5, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.95, sum by (le, provider, model) (rate(brightstaff_llm_upstream_duration_seconds_bucket[5m])))", + 
"legendFormat": "p95 {{provider}}/{{model}}", + "refId": "A" + } + ], + "title": "LLM upstream p95 by provider/model", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "All non-success error classes. timeout/connect = network, 5xx/429 = provider, parse = body shape mismatch, stream = mid-stream disconnect.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 30, "lineWidth": 1, "showPoints": "never", "stacking": { "mode": "normal" } }, + "unit": "reqps" + } + }, + "gridPos": { "h": 9, "w": 12, "x": 12, "y": 27 }, + "id": 6, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (provider, error_class) (rate(brightstaff_llm_upstream_requests_total{error_class!=\"none\"}[5m]))", + "legendFormat": "{{provider}} / {{error_class}}", + "refId": "A" + } + ], + "title": "LLM upstream errors by provider / class", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "Streaming only. 
Empty if the route never streams.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" }, + "unit": "s" + } + }, + "gridPos": { "h": 9, "w": 12, "x": 0, "y": 36 }, + "id": 7, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.95, sum by (le, provider, model) (rate(brightstaff_llm_time_to_first_token_seconds_bucket[5m])))", + "legendFormat": "p95 {{provider}}/{{model}}", + "refId": "A" + } + ], + "title": "Time-to-first-token p95 (streaming)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "Tokens/sec by provider/model/kind — proxy for cost. Stacked.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 30, "lineWidth": 1, "showPoints": "never", "stacking": { "mode": "normal" } }, + "unit": "tokens/s" + } + }, + "gridPos": { "h": 9, "w": 12, "x": 12, "y": 36 }, + "id": 8, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (provider, model, kind) (rate(brightstaff_llm_tokens_total[5m]))", + "legendFormat": "{{provider}}/{{model}} {{kind}}", + "refId": "A" + } + ], + "title": "Token throughput by provider / model / kind", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }, + "id": 300, + "panels": [], + "title": "Routing service", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "Which models the orchestrator 
picked over the last 15 minutes.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "unit": "short" + } + }, + "gridPos": { "h": 9, "w": 12, "x": 0, "y": 46 }, + "id": 9, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (selected_model) (increase(brightstaff_router_decisions_total[15m]))", + "legendFormat": "{{selected_model}}", + "refId": "A" + } + ], + "title": "Model selection distribution (last 15m)", + "type": "bargauge" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "Fraction of decisions that fell back (orchestrator returned `none` or errored). High = router can't classify intent or no candidates configured.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" }, + "unit": "percentunit" + } + }, + "gridPos": { "h": 9, "w": 12, "x": 12, "y": 46 }, + "id": 10, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (route) (rate(brightstaff_router_decisions_total{fallback=\"true\"}[5m])) / clamp_min(sum by (route) (rate(brightstaff_router_decisions_total[5m])), 1)", + "legendFormat": "{{route}}", + "refId": "A" + } + ], + "title": "Fallback rate by route", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" }, + "unit": "s" + } + }, + "gridPos": { "h": 8, "w": 
12, "x": 0, "y": 55 }, + "id": 11, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.95, sum by (le, route) (rate(brightstaff_router_decision_duration_seconds_bucket[5m])))", + "legendFormat": "p95 {{route}}", + "refId": "A" + } + ], + "title": "Router decision p95 latency", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "Hit / (hit + miss). Low ratio = sessions aren't being reused or TTL too short.", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 0.5 }, + { "color": "green", "value": 0.8 } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 55 }, + "id": 12, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum(rate(brightstaff_session_cache_events_total{outcome=\"hit\"}[5m])) / clamp_min(sum(rate(brightstaff_session_cache_events_total{outcome=~\"hit|miss\"}[5m])), 1)", + "legendFormat": "hit rate", + "refId": "A" + } + ], + "title": "Session cache hit rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "decision_served = a real model picked. no_candidates = sentinel `none` returned. 
policy_error = orchestrator failed.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 30, "lineWidth": 1, "showPoints": "never", "stacking": { "mode": "normal" } }, + "unit": "reqps" + } + }, + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 55 }, + "id": 13, + "options": { + "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (outcome) (rate(brightstaff_routing_service_requests_total[5m]))", + "legendFormat": "{{outcome}}", + "refId": "A" + } + ], + "title": "/routing/* outcomes", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 63 }, + "id": 400, + "panels": [], + "title": "Process & Envoy link", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "Compare to brightstaff RPS (panel 1) — sustained gap = network or Envoy queueing.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" }, + "unit": "reqps" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 64 }, + "id": 14, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum(rate(envoy_cluster_upstream_rq_total{envoy_cluster_name=\"bright_staff\"}[1m]))", + "legendFormat": "envoy → bright_staff", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum(rate(brightstaff_http_requests_total[1m]))", + "legendFormat": "brightstaff served", + "refId": "B" + } + ], + "title": "Envoy → brightstaff link health", + "type": 
"timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "RSS" }, + "properties": [{ "id": "unit", "value": "bytes" }] + }, + { + "matcher": { "id": "byName", "options": "CPU" }, + "properties": [{ "id": "unit", "value": "percentunit" }] + } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 64 }, + "id": 15, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "process_resident_memory_bytes{job=\"brightstaff\"}", + "legendFormat": "RSS", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "rate(process_cpu_seconds_total{job=\"brightstaff\"}[1m])", + "legendFormat": "CPU", + "refId": "B" + } + ], + "title": "Brightstaff process RSS / CPU", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": ["plano", "brightstaff", "llm"], + "templating": { + "list": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "type": "datasource", + "query": "prometheus", + "current": { "selected": false, "text": "Prometheus", "value": "DS_PROMETHEUS" }, + "hide": 0, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "includeAll": false, + "multi": false + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Brightstaff (Plano dataplane)", + "uid": "brightstaff", + "version": 1, + "weekStart": "" +} diff --git a/config/grafana/docker-compose.yaml b/config/grafana/docker-compose.yaml new file mode 100644 index 00000000..33238073 --- /dev/null +++ b/config/grafana/docker-compose.yaml @@ 
-0,0 +1,43 @@ +# One-command Prometheus + Grafana stack for observing a locally-running +# Plano (Envoy admin :9901 + brightstaff :9092 on the host). +# +# cd config/grafana +# docker compose up -d +# open http://localhost:3000 (admin / admin) +# +# Grafana is preloaded with: +# - Prometheus datasource (uid=DS_PROMETHEUS) → http://prometheus:9090 +# - Brightstaff dashboard (auto-imported from brightstaff_dashboard.json) +# +# Prometheus scrapes the host's :9092 and :9901 via host.docker.internal. +# On Linux this works because of the `extra_hosts: host-gateway` mapping +# below. On Mac/Win it works natively. + +services: + prometheus: + image: prom/prometheus:latest + container_name: plano-prometheus + ports: + - "9090:9090" + volumes: + - ./prometheus_scrape.yaml:/etc/prometheus/prometheus.yml:ro + extra_hosts: + - "host.docker.internal:host-gateway" + restart: unless-stopped + + grafana: + image: grafana/grafana:latest + container_name: plano-grafana + ports: + - "3000:3000" + environment: + GF_SECURITY_ADMIN_USER: admin + GF_SECURITY_ADMIN_PASSWORD: admin + GF_AUTH_ANONYMOUS_ENABLED: "true" + GF_AUTH_ANONYMOUS_ORG_ROLE: Viewer + volumes: + - ./provisioning:/etc/grafana/provisioning:ro + - ./brightstaff_dashboard.json:/var/lib/grafana/dashboards/brightstaff_dashboard.json:ro + depends_on: + - prometheus + restart: unless-stopped diff --git a/config/grafana/prometheus_scrape.yaml b/config/grafana/prometheus_scrape.yaml new file mode 100644 index 00000000..b4041287 --- /dev/null +++ b/config/grafana/prometheus_scrape.yaml @@ -0,0 +1,44 @@ +# Prometheus config that scrapes Plano (Envoy admin + brightstaff). This is +# a complete Prometheus config — mount it directly at +# /etc/prometheus/prometheus.yml. The included docker-compose.yaml does this +# for you. +# +# Targets: +# - envoy:9901 Envoy admin → envoy_cluster_*, envoy_http_*, envoy_server_*. +# - brightstaff:9092 Native dataplane → brightstaff_http_*, brightstaff_llm_*, +# brightstaff_router_*, process_*. 
+# +# Hostname `host.docker.internal` works on Docker Desktop (Mac/Win) and on +# Linux when the container is started with `--add-host=host.docker.internal: +# host-gateway` (the included compose does this). If Plano runs *inside* +# Docker on the same network as Prometheus, replace it with the container +# name (e.g. `plano:9092`). +# +# This file is unrelated to demos/llm_routing/model_routing_service/prometheus.yaml, +# which scrapes a fake metrics service to feed the routing engine. + +global: + scrape_interval: 15s + scrape_timeout: 10s + evaluation_interval: 15s + +scrape_configs: + - job_name: envoy + honor_timestamps: true + metrics_path: /stats + params: + format: ["prometheus"] + static_configs: + - targets: + - host.docker.internal:9901 + labels: + service: plano + + - job_name: brightstaff + honor_timestamps: true + metrics_path: /metrics + static_configs: + - targets: + - host.docker.internal:9092 + labels: + service: plano diff --git a/config/grafana/provisioning/dashboards/brightstaff.yaml b/config/grafana/provisioning/dashboards/brightstaff.yaml new file mode 100644 index 00000000..271e4a9b --- /dev/null +++ b/config/grafana/provisioning/dashboards/brightstaff.yaml @@ -0,0 +1,15 @@ +# Auto-load the brightstaff dashboard JSON on Grafana startup. + +apiVersion: 1 + +providers: + - name: brightstaff + orgId: 1 + folder: Plano + type: file + disableDeletion: false + updateIntervalSeconds: 30 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/config/grafana/provisioning/datasources/prometheus.yaml b/config/grafana/provisioning/datasources/prometheus.yaml new file mode 100644 index 00000000..2e3170ec --- /dev/null +++ b/config/grafana/provisioning/datasources/prometheus.yaml @@ -0,0 +1,14 @@ +# Auto-provision the Prometheus datasource so the bundled dashboard wires up +# without any clicks. The `uid: DS_PROMETHEUS` matches the templated input in +# brightstaff_dashboard.json. 
+ +apiVersion: 1 + +datasources: + - name: Prometheus + uid: DS_PROMETHEUS + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: true diff --git a/crates/Cargo.lock b/crates/Cargo.lock index e07b47ee..56fc260c 100644 --- a/crates/Cargo.lock +++ b/crates/Cargo.lock @@ -23,6 +23,18 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217" +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -257,6 +269,24 @@ dependencies = [ "vsimd", ] +[[package]] +name = "bindgen" +version = "0.72.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "itertools 0.13.0", + "proc-macro2", + "quote", + "regex", + "rustc-hash 2.1.2", + "shlex", + "syn 2.0.117", +] + [[package]] name = "bit-set" version = "0.5.3" @@ -316,6 +346,9 @@ dependencies = [ "hyper 1.9.0", "hyper-util", "lru", + "metrics 0.23.1", + "metrics-exporter-prometheus", + "metrics-process", "mockito", "opentelemetry", "opentelemetry-http", @@ -391,6 +424,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + [[package]] name = "cfg-if" version = "1.0.4" @@ -428,6 +470,17 @@ dependencies = [ "windows-link", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "cmov" version = "0.5.3" @@ -574,6 +627,21 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "crypto-common" version = "0.1.7" @@ -1070,6 +1138,12 @@ dependencies = [ "wasip3", ] +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + [[package]] name = "governor" version = "0.6.3" @@ -1128,7 +1202,7 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e91b62f79061a0bc2e046024cb7ba44b08419ed238ecbd9adbd787434b9e8c25" dependencies = [ - "ahash", + "ahash 0.3.8", "autocfg", ] @@ -1138,6 +1212,15 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash 0.8.12", +] + [[package]] name = "hashbrown" version = "0.15.5" @@ -1189,6 +1272,12 @@ dependencies = [ "uuid", ] +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + [[package]] 
name = "hex" version = "0.4.3" @@ -1665,6 +1754,27 @@ version = "0.2.185" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "52ff2c0fe9bc6cb6b14a0592c2ff4fa9ceb83eea9db979b0487cd054946a2b8f" +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + +[[package]] +name = "libproc" +version = "0.14.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a54ad7278b8bc5301d5ffd2a94251c004feb971feba96c971ea4063645990757" +dependencies = [ + "bindgen", + "errno", + "libc", +] + [[package]] name = "libredox" version = "0.1.16" @@ -1745,6 +1855,12 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" +[[package]] +name = "mach2" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dae608c151f68243f2b000364e1f7b186d9c29845f7d2d85bd31b9ad77ad552b" + [[package]] name = "matchers" version = "0.2.0" @@ -1782,6 +1898,77 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "metrics" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3045b4193fbdc5b5681f32f11070da9be3609f189a79f3390706d42587f46bb5" +dependencies = [ + "ahash 0.8.12", + "portable-atomic", +] + +[[package]] +name = "metrics" +version = "0.24.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5312e9ba3771cfa961b585728215e3d972c950a3eed9252aa093d6301277e8" +dependencies = [ + "ahash 0.8.12", + "portable-atomic", +] + +[[package]] +name = "metrics-exporter-prometheus" +version = "0.15.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4f0c8427b39666bf970460908b213ec09b3b350f20c0c2eabcbba51704a08e6" +dependencies = [ + "base64 0.22.1", + "http-body-util", + "hyper 1.9.0", + "hyper-util", + "indexmap 2.14.0", + "ipnet", + "metrics 0.23.1", + "metrics-util", + "quanta", + "thiserror 1.0.69", + "tokio", + "tracing", +] + +[[package]] +name = "metrics-process" +version = "2.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4268d87f64a752f5a651314fc683f04da10be65701ea3e721ba4d74f79163cac" +dependencies = [ + "libc", + "libproc", + "mach2", + "metrics 0.24.3", + "once_cell", + "procfs", + "rlimit", + "windows", +] + +[[package]] +name = "metrics-util" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4259040465c955f9f2f1a4a8a16dc46726169bca0f88e8fb2dbeced487c3e828" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", + "hashbrown 0.14.5", + "metrics 0.23.1", + "num_cpus", + "quanta", + "sketches-ddsketch", +] + [[package]] name = "mime" version = "0.3.17" @@ -1935,6 +2122,16 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "objc2-core-foundation" version = "0.3.2" @@ -2278,6 +2475,27 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "procfs" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25485360a54d6861439d60facef26de713b1e126bf015ec8f98239467a2b82f7" +dependencies = [ + "bitflags", + "procfs-core", + "rustix", +] + +[[package]] +name = "procfs-core" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6401bf7b6af22f78b563665d15a22e9aef27775b79b149a66ca022468a4e405" +dependencies = [ + 
"bitflags", + "hex", +] + [[package]] name = "prompt_gateway" version = "0.1.0" @@ -2333,6 +2551,21 @@ dependencies = [ "log", ] +[[package]] +name = "quanta" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi 0.11.1+wasi-snapshot-preview1", + "web-sys", + "winapi", +] + [[package]] name = "quinn" version = "0.11.9" @@ -2485,6 +2718,15 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" +[[package]] +name = "raw-cpuid" +version = "11.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186" +dependencies = [ + "bitflags", +] + [[package]] name = "redis" version = "0.27.6" @@ -2646,6 +2888,15 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rlimit" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f35ee2729c56bb610f6dba436bf78135f728b7373bdffae2ec815b2d3eb98cc3" +dependencies = [ + "libc", +] + [[package]] name = "rustc-hash" version = "1.1.0" @@ -3098,6 +3349,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" +[[package]] +name = "sketches-ddsketch" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85636c14b73d81f541e525f585c0a2109e6744e1565b5c1668e31c70c10ed65c" + [[package]] name = "slab" version = "0.4.12" @@ -4003,6 +4260,49 @@ dependencies = [ "web-sys", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "527fadee13e0c05939a6a05d5bd6eec6cd2e3dbd648b9f8e447c6518133d8580" +dependencies = [ + "windows-collections", + "windows-core", + "windows-future", + "windows-numerics", +] + +[[package]] +name = "windows-collections" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23b2d95af1a8a14a3c7367e1ed4fc9c20e0a26e79551b1454d72583c97cc6610" +dependencies = [ + "windows-core", +] + [[package]] name = "windows-core" version = "0.62.2" @@ -4016,6 +4316,17 @@ dependencies = [ "windows-strings", ] +[[package]] +name = "windows-future" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d6f90251fe18a279739e78025bd6ddc52a7e22f921070ccdc67dde84c605cb" +dependencies = [ + "windows-core", + "windows-link", + "windows-threading", +] + [[package]] name = "windows-implement" version = "0.60.2" @@ -4044,6 +4355,16 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-numerics" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e2e40844ac143cdb44aead537bbf727de9b044e107a0f1220392177d15b0f26" +dependencies 
= [ + "windows-core", + "windows-link", +] + [[package]] name = "windows-registry" version = "0.6.1" @@ -4133,6 +4454,15 @@ dependencies = [ "windows_x86_64_msvc 0.53.1", ] +[[package]] +name = "windows-threading" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3949bd5b99cafdf1c7ca86b43ca564028dfe27d66958f2470940f73d86d75b37" +dependencies = [ + "windows-link", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" diff --git a/crates/brightstaff/Cargo.toml b/crates/brightstaff/Cargo.toml index f88ed918..b9718e44 100644 --- a/crates/brightstaff/Cargo.toml +++ b/crates/brightstaff/Cargo.toml @@ -27,6 +27,9 @@ opentelemetry_sdk = { version = "0.31", features = ["rt-tokio"] } pretty_assertions = "1.4.1" rand = "0.9.2" lru = "0.12" +metrics = "0.23" +metrics-exporter-prometheus = { version = "0.15", default-features = false, features = ["http-listener"] } +metrics-process = "2.1" redis = { version = "0.27", features = ["tokio-comp"] } reqwest = { version = "0.12.15", features = ["stream"] } serde = { version = "1.0.219", features = ["derive"] } diff --git a/crates/brightstaff/src/handlers/llm/mod.rs b/crates/brightstaff/src/handlers/llm/mod.rs index 719c048d..94930caa 100644 --- a/crates/brightstaff/src/handlers/llm/mod.rs +++ b/crates/brightstaff/src/handlers/llm/mod.rs @@ -24,13 +24,14 @@ use crate::app_state::AppState; use crate::handlers::agents::pipeline::PipelineProcessor; use crate::handlers::extract_request_id; use crate::handlers::full; +use crate::metrics as bs_metrics; use crate::state::response_state_processor::ResponsesStateProcessor; use crate::state::{ extract_input_items, retrieve_and_combine_input, StateStorage, StateStorageError, }; use crate::streaming::{ create_streaming_response, create_streaming_response_with_output_filter, truncate_message, - ObservableStreamProcessor, StreamProcessor, + LlmMetricsCtx, ObservableStreamProcessor, StreamProcessor, }; use crate::tracing::{ 
collect_custom_trace_attributes, llm as tracing_llm, operation_component, @@ -686,6 +687,13 @@ async fn send_upstream( let request_start_time = std::time::Instant::now(); + // Labels for LLM upstream metrics. We prefer `resolved_model` (post-routing) + // and derive the provider from its `provider/model` prefix. This matches the + // same model id the cost/latency router keys off. + let (metric_provider_raw, metric_model_raw) = bs_metrics::split_provider_model(resolved_model); + let metric_provider = metric_provider_raw.to_string(); + let metric_model = metric_model_raw.to_string(); + let llm_response = match http_client .post(upstream_url) .headers(request_headers.clone()) @@ -695,6 +703,14 @@ async fn send_upstream( { Ok(res) => res, Err(err) => { + let err_class = bs_metrics::llm_error_class_from_reqwest(&err); + bs_metrics::record_llm_upstream( + &metric_provider, + &metric_model, + 0, + err_class, + request_start_time.elapsed(), + ); let err_msg = format!("Failed to send request: {}", err); let mut internal_error = Response::new(full(err_msg)); *internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR; @@ -750,7 +766,12 @@ async fn send_upstream( span_name, request_start_time, messages_for_signals, - ); + ) + .with_llm_metrics(LlmMetricsCtx { + provider: metric_provider.clone(), + model: metric_model.clone(), + upstream_status: upstream_status.as_u16(), + }); let output_filter_request_headers = if filter_pipeline.has_output_filters() { Some(request_headers.clone()) diff --git a/crates/brightstaff/src/handlers/llm/model_selection.rs b/crates/brightstaff/src/handlers/llm/model_selection.rs index 1b4315e7..a1378d86 100644 --- a/crates/brightstaff/src/handlers/llm/model_selection.rs +++ b/crates/brightstaff/src/handlers/llm/model_selection.rs @@ -5,10 +5,24 @@ use hyper::StatusCode; use std::sync::Arc; use tracing::{debug, info, warn}; +use crate::metrics as bs_metrics; +use crate::metrics::labels as metric_labels; use 
crate::router::orchestrator::OrchestratorService; use crate::streaming::truncate_message; use crate::tracing::routing; +/// Classify a request path (already stripped of `/agents` or `/routing` by +/// the caller) into the fixed `route` label used on routing metrics. +fn route_label_for_path(request_path: &str) -> &'static str { + if request_path.starts_with("/agents") { + metric_labels::ROUTE_AGENT + } else if request_path.starts_with("/routing") { + metric_labels::ROUTE_ROUTING + } else { + metric_labels::ROUTE_LLM + } +} + pub struct RoutingResult { /// Primary model to use (first in the ranked list). pub model_name: String, @@ -106,15 +120,23 @@ pub async fn router_chat_get_upstream_model( ) .await; - let determination_ms = routing_start_time.elapsed().as_millis() as i64; + let determination_elapsed = routing_start_time.elapsed(); + let determination_ms = determination_elapsed.as_millis() as i64; let current_span = tracing::Span::current(); current_span.record(routing::ROUTE_DETERMINATION_MS, determination_ms); + let route_label = route_label_for_path(request_path); match routing_result { Ok(route) => match route { Some((route_name, ranked_models)) => { let model_name = ranked_models.first().cloned().unwrap_or_default(); current_span.record("route.selected_model", model_name.as_str()); + bs_metrics::record_router_decision( + route_label, + &model_name, + false, + determination_elapsed, + ); Ok(RoutingResult { model_name, models: ranked_models, @@ -126,6 +148,12 @@ pub async fn router_chat_get_upstream_model( // This signals to llm.rs to use the original validated request model current_span.record("route.selected_model", "none"); info!("no route determined, using default model"); + bs_metrics::record_router_decision( + route_label, + "none", + true, + determination_elapsed, + ); Ok(RoutingResult { model_name: "none".to_string(), @@ -136,6 +164,7 @@ pub async fn router_chat_get_upstream_model( }, Err(err) => { current_span.record("route.selected_model", 
"unknown"); + bs_metrics::record_router_decision(route_label, "unknown", true, determination_elapsed); Err(RoutingError::internal_error(format!( "Failed to determine route: {}", err diff --git a/crates/brightstaff/src/handlers/routing_service.rs b/crates/brightstaff/src/handlers/routing_service.rs index 5fc0d3b9..b93b1422 100644 --- a/crates/brightstaff/src/handlers/routing_service.rs +++ b/crates/brightstaff/src/handlers/routing_service.rs @@ -12,6 +12,8 @@ use tracing::{debug, info, info_span, warn, Instrument}; use super::extract_or_generate_traceparent; use crate::handlers::llm::model_selection::router_chat_get_upstream_model; +use crate::metrics as bs_metrics; +use crate::metrics::labels as metric_labels; use crate::router::orchestrator::OrchestratorService; use crate::tracing::{collect_custom_trace_attributes, operation_component, set_service_name}; @@ -230,6 +232,17 @@ async fn routing_decision_inner( pinned: false, }; + // Distinguish "decision served" (a concrete model picked) from + // "no_candidates" (the sentinel "none" returned when nothing + // matched). The handler still responds 200 in both cases, so RED + // metrics alone can't tell them apart. 
+ let outcome = if response.models.first().map(|m| m == "none").unwrap_or(true) { + metric_labels::ROUTING_SVC_NO_CANDIDATES + } else { + metric_labels::ROUTING_SVC_DECISION_SERVED + }; + bs_metrics::record_routing_service_outcome(outcome); + info!( primary_model = %response.models.first().map(|s| s.as_str()).unwrap_or("none"), total_models = response.models.len(), @@ -249,6 +262,7 @@ async fn routing_decision_inner( .unwrap()) } Err(err) => { + bs_metrics::record_routing_service_outcome(metric_labels::ROUTING_SVC_POLICY_ERROR); warn!(error = %err.message, "routing decision failed"); Ok(BrightStaffError::InternalServerError(err.message).into_response()) } diff --git a/crates/brightstaff/src/lib.rs b/crates/brightstaff/src/lib.rs index a0ba5f43..66c6eadf 100644 --- a/crates/brightstaff/src/lib.rs +++ b/crates/brightstaff/src/lib.rs @@ -1,5 +1,6 @@ pub mod app_state; pub mod handlers; +pub mod metrics; pub mod router; pub mod session_cache; pub mod signals; diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs index 40ac429d..80e03b4b 100644 --- a/crates/brightstaff/src/main.rs +++ b/crates/brightstaff/src/main.rs @@ -5,6 +5,8 @@ use brightstaff::handlers::function_calling::function_calling_chat_handler; use brightstaff::handlers::llm::llm_chat; use brightstaff::handlers::models::list_models; use brightstaff::handlers::routing_service::routing_decision; +use brightstaff::metrics as bs_metrics; +use brightstaff::metrics::labels as metric_labels; use brightstaff::router::model_metrics::ModelMetricsService; use brightstaff::router::orchestrator::OrchestratorService; use brightstaff::session_cache::init_session_cache; @@ -384,10 +386,79 @@ async fn init_state_storage( // Request routing // --------------------------------------------------------------------------- +/// Normalized method label — limited set so we never emit a free-form string. 
+fn method_label(method: &Method) -> &'static str { + match *method { + Method::GET => "GET", + Method::POST => "POST", + Method::PUT => "PUT", + Method::DELETE => "DELETE", + Method::PATCH => "PATCH", + Method::HEAD => "HEAD", + Method::OPTIONS => "OPTIONS", + _ => "OTHER", + } +} + +/// Compute the fixed `handler` metric label from the request's path+method. +/// Returning `None` for fall-through means `route()` will hand the request to +/// the catch-all 404 branch. +fn handler_label_for(method: &Method, path: &str) -> &'static str { + if let Some(stripped) = path.strip_prefix("/agents") { + if matches!( + stripped, + CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH + ) { + return metric_labels::HANDLER_AGENT_CHAT; + } + } + if let Some(stripped) = path.strip_prefix("/routing") { + if matches!( + stripped, + CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH + ) { + return metric_labels::HANDLER_ROUTING_DECISION; + } + } + match (method, path) { + (&Method::POST, CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH) => { + metric_labels::HANDLER_LLM_CHAT + } + (&Method::POST, "/function_calling") => metric_labels::HANDLER_FUNCTION_CALLING, + (&Method::GET, "/v1/models" | "/agents/v1/models") => metric_labels::HANDLER_LIST_MODELS, + (&Method::OPTIONS, "/v1/models" | "/agents/v1/models") => { + metric_labels::HANDLER_CORS_PREFLIGHT + } + _ => metric_labels::HANDLER_NOT_FOUND, + } +} + /// Route an incoming HTTP request to the appropriate handler. async fn route( req: Request, state: Arc, +) -> Result>, hyper::Error> { + let handler = handler_label_for(req.method(), req.uri().path()); + let method = method_label(req.method()); + let started = std::time::Instant::now(); + let _in_flight = bs_metrics::InFlightGuard::new(handler); + + let result = dispatch(req, state).await; + + let status = match &result { + Ok(resp) => resp.status().as_u16(), + // hyper::Error here means the body couldn't be produced; conventionally 500. 
+ Err(_) => 500, + }; + bs_metrics::record_http(handler, method, status, started); + result +} + +/// Inner dispatcher split out so `route()` can wrap it with metrics without +/// duplicating the match tree. +async fn dispatch( + req: Request, + state: Arc, ) -> Result>, hyper::Error> { let parent_cx = global::get_text_map_propagator(|p| p.extract(&HeaderExtractor(req.headers()))); let path = req.uri().path().to_string(); @@ -503,6 +574,7 @@ async fn run_server(state: Arc) -> Result<(), Box Result<(), Box> { let config = load_config()?; let _tracer_provider = init_tracer(config.tracing.as_ref()); + bs_metrics::init(); info!("loaded plano_config.yaml"); let state = Arc::new(init_app_state(&config).await?); run_server(state).await diff --git a/crates/brightstaff/src/metrics/labels.rs b/crates/brightstaff/src/metrics/labels.rs new file mode 100644 index 00000000..4eaf3e59 --- /dev/null +++ b/crates/brightstaff/src/metrics/labels.rs @@ -0,0 +1,38 @@ +//! Fixed label-value constants so callers never emit free-form strings +//! (which would blow up cardinality). + +// Handler enum — derived from the path+method match in `route()`. +pub const HANDLER_AGENT_CHAT: &str = "agent_chat"; +pub const HANDLER_ROUTING_DECISION: &str = "routing_decision"; +pub const HANDLER_LLM_CHAT: &str = "llm_chat"; +pub const HANDLER_FUNCTION_CALLING: &str = "function_calling"; +pub const HANDLER_LIST_MODELS: &str = "list_models"; +pub const HANDLER_CORS_PREFLIGHT: &str = "cors_preflight"; +pub const HANDLER_NOT_FOUND: &str = "not_found"; + +// Router "route" class — which brightstaff endpoint prompted the decision. +pub const ROUTE_AGENT: &str = "agent"; +pub const ROUTE_ROUTING: &str = "routing"; +pub const ROUTE_LLM: &str = "llm"; + +// Token kind for brightstaff_llm_tokens_total. +pub const TOKEN_KIND_PROMPT: &str = "prompt"; +pub const TOKEN_KIND_COMPLETION: &str = "completion"; + +// LLM error_class values (match docstring in metrics/mod.rs). 
+pub const LLM_ERR_NONE: &str = "none"; +pub const LLM_ERR_TIMEOUT: &str = "timeout"; +pub const LLM_ERR_CONNECT: &str = "connect"; +pub const LLM_ERR_PARSE: &str = "parse"; +pub const LLM_ERR_OTHER: &str = "other"; +pub const LLM_ERR_STREAM: &str = "stream"; + +// Routing service outcome values. +pub const ROUTING_SVC_DECISION_SERVED: &str = "decision_served"; +pub const ROUTING_SVC_NO_CANDIDATES: &str = "no_candidates"; +pub const ROUTING_SVC_POLICY_ERROR: &str = "policy_error"; + +// Session cache outcome values. +pub const SESSION_CACHE_HIT: &str = "hit"; +pub const SESSION_CACHE_MISS: &str = "miss"; +pub const SESSION_CACHE_STORE: &str = "store"; diff --git a/crates/brightstaff/src/metrics/mod.rs b/crates/brightstaff/src/metrics/mod.rs new file mode 100644 index 00000000..34679cca --- /dev/null +++ b/crates/brightstaff/src/metrics/mod.rs @@ -0,0 +1,377 @@ +//! Prometheus metrics for brightstaff. +//! +//! Installs the `metrics` global recorder backed by +//! `metrics-exporter-prometheus` and exposes a `/metrics` HTTP endpoint on a +//! dedicated admin port (default `0.0.0.0:9092`, overridable via +//! `METRICS_BIND_ADDRESS`). +//! +//! Emitted metric families (see `describe_all` for full list): +//! - HTTP RED: `brightstaff_http_requests_total`, +//! `brightstaff_http_request_duration_seconds`, +//! `brightstaff_http_in_flight_requests`. +//! - LLM upstream: `brightstaff_llm_upstream_requests_total`, +//! `brightstaff_llm_upstream_duration_seconds`, +//! `brightstaff_llm_time_to_first_token_seconds`, +//! `brightstaff_llm_tokens_total`, +//! `brightstaff_llm_tokens_usage_missing_total`. +//! - Routing: `brightstaff_router_decisions_total`, +//! `brightstaff_router_decision_duration_seconds`, +//! `brightstaff_routing_service_requests_total`, +//! `brightstaff_session_cache_events_total`. +//! - Process: via `metrics-process`. +//! - Build: `brightstaff_build_info`. 
+ +use std::net::SocketAddr; +use std::sync::OnceLock; +use std::time::{Duration, Instant}; + +use metrics::{counter, describe_counter, describe_gauge, describe_histogram, gauge, histogram}; +use metrics_exporter_prometheus::{Matcher, PrometheusBuilder}; +use tracing::{info, warn}; + +pub mod labels; + +/// Guard flag so tests don't re-install the global recorder. +static INIT: OnceLock<()> = OnceLock::new(); + +const DEFAULT_METRICS_BIND: &str = "0.0.0.0:9092"; + +/// HTTP request duration buckets (seconds). Capped at 60s. +const HTTP_BUCKETS: &[f64] = &[ + 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, +]; + +/// LLM upstream / TTFT buckets (seconds). Capped at 120s because provider +/// completions routinely run that long. +const LLM_BUCKETS: &[f64] = &[0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0]; + +/// Router decision buckets (seconds). The orchestrator call itself is usually +/// sub-second but bucketed generously in case of upstream slowness. +const ROUTER_BUCKETS: &[f64] = &[ + 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, +]; + +/// Install the global recorder and spawn the `/metrics` HTTP listener. +/// +/// Safe to call more than once; subsequent calls are no-ops so tests that +/// construct their own recorder still work. 
+pub fn init() { + if INIT.get().is_some() { + return; + } + + let bind: SocketAddr = std::env::var("METRICS_BIND_ADDRESS") + .unwrap_or_else(|_| DEFAULT_METRICS_BIND.to_string()) + .parse() + .unwrap_or_else(|err| { + warn!(error = %err, default = DEFAULT_METRICS_BIND, "invalid METRICS_BIND_ADDRESS, falling back to default"); + DEFAULT_METRICS_BIND.parse().expect("default bind parses") + }); + + let builder = PrometheusBuilder::new() + .with_http_listener(bind) + .set_buckets_for_metric( + Matcher::Full("brightstaff_http_request_duration_seconds".to_string()), + HTTP_BUCKETS, + ) + .and_then(|b| { + b.set_buckets_for_metric(Matcher::Prefix("brightstaff_llm_".to_string()), LLM_BUCKETS) + }) + .and_then(|b| { + b.set_buckets_for_metric( + Matcher::Full("brightstaff_router_decision_duration_seconds".to_string()), + ROUTER_BUCKETS, + ) + }); + + let builder = match builder { + Ok(b) => b, + Err(err) => { + warn!(error = %err, "failed to configure metrics buckets, using defaults"); + PrometheusBuilder::new().with_http_listener(bind) + } + }; + + if let Err(err) = builder.install() { + warn!(error = %err, "failed to install Prometheus recorder; metrics disabled"); + return; + } + + let _ = INIT.set(()); + + describe_all(); + emit_build_info(); + + // Register process-level collector (RSS, CPU, FDs). + let collector = metrics_process::Collector::default(); + collector.describe(); + // Prime once at startup; subsequent scrapes refresh via the exporter's + // per-scrape render, so we additionally refresh on a short interval to + // keep gauges moving between scrapes without requiring client pull. 
+ collector.collect(); + tokio::spawn(async move { + let mut tick = tokio::time::interval(Duration::from_secs(10)); + tick.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + loop { + tick.tick().await; + collector.collect(); + } + }); + + info!(address = %bind, "metrics listener started"); +} + +fn describe_all() { + describe_counter!( + "brightstaff_http_requests_total", + "Total HTTP requests served by brightstaff, by handler and status class." + ); + describe_histogram!( + "brightstaff_http_request_duration_seconds", + "Wall-clock duration of HTTP requests served by brightstaff, by handler." + ); + describe_gauge!( + "brightstaff_http_in_flight_requests", + "Number of HTTP requests currently being served by brightstaff, by handler." + ); + + describe_counter!( + "brightstaff_llm_upstream_requests_total", + "LLM upstream request outcomes, by provider, model, status class and error class." + ); + describe_histogram!( + "brightstaff_llm_upstream_duration_seconds", + "Wall-clock duration of LLM upstream calls (stream close for streaming), by provider and model." + ); + describe_histogram!( + "brightstaff_llm_time_to_first_token_seconds", + "Time from request start to first streamed byte, by provider and model (streaming only)." + ); + describe_counter!( + "brightstaff_llm_tokens_total", + "Tokens reported in the provider `usage` field, by provider, model and kind (prompt/completion)." + ); + describe_counter!( + "brightstaff_llm_tokens_usage_missing_total", + "LLM responses that completed without a usable `usage` block (so token counts are unknown)." + ); + + describe_counter!( + "brightstaff_router_decisions_total", + "Routing decisions made by the orchestrator, by route, selected model, and whether a fallback was used." + ); + describe_histogram!( + "brightstaff_router_decision_duration_seconds", + "Time spent in the orchestrator deciding a route, by route." 
+ ); + describe_counter!( + "brightstaff_routing_service_requests_total", + "Outcomes of /routing/* decision requests: decision_served, no_candidates, policy_error." + ); + describe_counter!( + "brightstaff_session_cache_events_total", + "Session affinity cache lookups and stores, by outcome." + ); + + describe_gauge!( + "brightstaff_build_info", + "Build metadata. Always 1; labels carry version and git SHA." + ); +} + +fn emit_build_info() { + let version = env!("CARGO_PKG_VERSION"); + let git_sha = option_env!("GIT_SHA").unwrap_or("unknown"); + gauge!( + "brightstaff_build_info", + "version" => version.to_string(), + "git_sha" => git_sha.to_string(), + ) + .set(1.0); +} + +/// Split a provider-qualified model id like `"openai/gpt-4o"` into +/// `(provider, model)`. Returns `("unknown", raw)` when there is no `/`. +pub fn split_provider_model(full: &str) -> (&str, &str) { + match full.split_once('/') { + Some((p, m)) => (p, m), + None => ("unknown", full), + } +} + +/// Bucket an HTTP status code into `"2xx"` / `"4xx"` / `"5xx"` / `"1xx"` / `"3xx"`. +pub fn status_class(status: u16) -> &'static str { + match status { + 100..=199 => "1xx", + 200..=299 => "2xx", + 300..=399 => "3xx", + 400..=499 => "4xx", + 500..=599 => "5xx", + _ => "other", + } +} + +// --------------------------------------------------------------------------- +// HTTP RED helpers +// --------------------------------------------------------------------------- + +/// RAII guard that increments the in-flight gauge on construction and +/// decrements on drop. Pair with [`HttpTimer`] in the `route()` wrapper so the +/// gauge drops even on error paths. 
+pub struct InFlightGuard { + handler: &'static str, +} + +impl InFlightGuard { + pub fn new(handler: &'static str) -> Self { + gauge!( + "brightstaff_http_in_flight_requests", + "handler" => handler, + ) + .increment(1.0); + Self { handler } + } +} + +impl Drop for InFlightGuard { + fn drop(&mut self) { + gauge!( + "brightstaff_http_in_flight_requests", + "handler" => self.handler, + ) + .decrement(1.0); + } +} + +/// Record the HTTP request counter + duration histogram. +pub fn record_http(handler: &'static str, method: &'static str, status: u16, started: Instant) { + let class = status_class(status); + counter!( + "brightstaff_http_requests_total", + "handler" => handler, + "method" => method, + "status_class" => class, + ) + .increment(1); + histogram!( + "brightstaff_http_request_duration_seconds", + "handler" => handler, + ) + .record(started.elapsed().as_secs_f64()); +} + +// --------------------------------------------------------------------------- +// LLM upstream helpers +// --------------------------------------------------------------------------- + +/// Classify an outcome of an LLM upstream call for the `error_class` label. +pub fn llm_error_class_from_reqwest(err: &reqwest::Error) -> &'static str { + if err.is_timeout() { + "timeout" + } else if err.is_connect() { + "connect" + } else if err.is_decode() { + "parse" + } else { + "other" + } +} + +/// Record the outcome of an LLM upstream call. `status` is the HTTP status +/// the upstream returned (0 if the call never produced one, e.g. send failure). +/// `error_class` is `"none"` on success, or a discriminated error label. 
+pub fn record_llm_upstream( + provider: &str, + model: &str, + status: u16, + error_class: &str, + duration: Duration, +) { + let class = if status == 0 { + "error" + } else { + status_class(status) + }; + counter!( + "brightstaff_llm_upstream_requests_total", + "provider" => provider.to_string(), + "model" => model.to_string(), + "status_class" => class, + "error_class" => error_class.to_string(), + ) + .increment(1); + histogram!( + "brightstaff_llm_upstream_duration_seconds", + "provider" => provider.to_string(), + "model" => model.to_string(), + ) + .record(duration.as_secs_f64()); +} + +pub fn record_llm_ttft(provider: &str, model: &str, ttft: Duration) { + histogram!( + "brightstaff_llm_time_to_first_token_seconds", + "provider" => provider.to_string(), + "model" => model.to_string(), + ) + .record(ttft.as_secs_f64()); +} + +pub fn record_llm_tokens(provider: &str, model: &str, kind: &'static str, count: u64) { + counter!( + "brightstaff_llm_tokens_total", + "provider" => provider.to_string(), + "model" => model.to_string(), + "kind" => kind, + ) + .increment(count); +} + +pub fn record_llm_tokens_usage_missing(provider: &str, model: &str) { + counter!( + "brightstaff_llm_tokens_usage_missing_total", + "provider" => provider.to_string(), + "model" => model.to_string(), + ) + .increment(1); +} + +// --------------------------------------------------------------------------- +// Router helpers +// --------------------------------------------------------------------------- + +pub fn record_router_decision( + route: &'static str, + selected_model: &str, + fallback: bool, + duration: Duration, +) { + counter!( + "brightstaff_router_decisions_total", + "route" => route, + "selected_model" => selected_model.to_string(), + "fallback" => if fallback { "true" } else { "false" }, + ) + .increment(1); + histogram!( + "brightstaff_router_decision_duration_seconds", + "route" => route, + ) + .record(duration.as_secs_f64()); +} + +pub fn 
record_routing_service_outcome(outcome: &'static str) { + counter!( + "brightstaff_routing_service_requests_total", + "outcome" => outcome, + ) + .increment(1); +} + +pub fn record_session_cache_event(outcome: &'static str) { + counter!( + "brightstaff_session_cache_events_total", + "outcome" => outcome, + ) + .increment(1); +} diff --git a/crates/brightstaff/src/router/orchestrator.rs b/crates/brightstaff/src/router/orchestrator.rs index 7aaf70a2..2d7b25de 100644 --- a/crates/brightstaff/src/router/orchestrator.rs +++ b/crates/brightstaff/src/router/orchestrator.rs @@ -15,6 +15,8 @@ use super::http::{self, post_and_extract_content}; use super::model_metrics::ModelMetricsService; use super::orchestrator_model::OrchestratorModel; +use crate::metrics as bs_metrics; +use crate::metrics::labels as metric_labels; use crate::router::orchestrator_model_v1; use crate::session_cache::SessionCache; @@ -130,7 +132,13 @@ impl OrchestratorService { tenant_id: Option<&str>, ) -> Option { let cache = self.session_cache.as_ref()?; - cache.get(&Self::session_key(tenant_id, session_id)).await + let result = cache.get(&Self::session_key(tenant_id, session_id)).await; + bs_metrics::record_session_cache_event(if result.is_some() { + metric_labels::SESSION_CACHE_HIT + } else { + metric_labels::SESSION_CACHE_MISS + }); + result } pub async fn cache_route( @@ -151,6 +159,7 @@ impl OrchestratorService { self.session_ttl, ) .await; + bs_metrics::record_session_cache_event(metric_labels::SESSION_CACHE_STORE); } } diff --git a/crates/brightstaff/src/streaming.rs b/crates/brightstaff/src/streaming.rs index 40cbbe7c..8a0f414b 100644 --- a/crates/brightstaff/src/streaming.rs +++ b/crates/brightstaff/src/streaming.rs @@ -20,6 +20,8 @@ const STREAM_BUFFER_SIZE: usize = 16; /// Most chat responses are well under this; pathological ones are dropped without /// affecting pass-through streaming to the client. 
const USAGE_BUFFER_MAX: usize = 2 * 1024 * 1024; +use crate::metrics as bs_metrics; +use crate::metrics::labels as metric_labels; use crate::signals::{InteractionQuality, SignalAnalyzer, TextBasedSignalAnalyzer, FLAG_MARKER}; use crate::tracing::{llm, set_service_name, signals as signal_constants}; use hermesllm::apis::openai::Message; @@ -172,6 +174,18 @@ impl StreamProcessor for Box { } } +/// Optional Prometheus-metric context for an LLM upstream call. When present, +/// [`ObservableStreamProcessor`] emits `brightstaff_llm_*` metrics at +/// first-byte / complete / error callbacks. +#[derive(Debug, Clone)] +pub struct LlmMetricsCtx { + pub provider: String, + pub model: String, + /// HTTP status of the upstream response. Used to pick `status_class` and + /// `error_class` on `on_complete`. + pub upstream_status: u16, +} + /// A processor that tracks streaming metrics pub struct ObservableStreamProcessor { service_name: String, @@ -185,6 +199,8 @@ pub struct ObservableStreamProcessor { /// on `on_complete`. Capped at `USAGE_BUFFER_MAX`; excess chunks are dropped /// from the buffer (they still pass through to the client). response_buffer: Vec, + llm_metrics: Option, + metrics_recorded: bool, } impl ObservableStreamProcessor { @@ -219,8 +235,17 @@ impl ObservableStreamProcessor { time_to_first_token: None, messages, response_buffer: Vec::new(), + llm_metrics: None, + metrics_recorded: false, } } + + /// Attach LLM upstream metric context so the processor emits + /// `brightstaff_llm_*` metrics on first-byte / complete / error. 
+ pub fn with_llm_metrics(mut self, ctx: LlmMetricsCtx) -> Self { + self.llm_metrics = Some(ctx); + self + } } impl StreamProcessor for ObservableStreamProcessor { @@ -240,7 +265,11 @@ impl StreamProcessor for ObservableStreamProcessor { fn on_first_bytes(&mut self) { // Record time to first token (only for streaming) if self.time_to_first_token.is_none() { - self.time_to_first_token = Some(self.start_time.elapsed().as_millis()); + let elapsed = self.start_time.elapsed(); + self.time_to_first_token = Some(elapsed.as_millis()); + if let Some(ref ctx) = self.llm_metrics { + bs_metrics::record_llm_ttft(&ctx.provider, &ctx.model, elapsed); + } } } @@ -299,6 +328,39 @@ impl StreamProcessor for ObservableStreamProcessor { otel_span.set_attribute(KeyValue::new(llm::MODEL_NAME, resolved)); } } + + // Emit LLM upstream prometheus metrics (duration + tokens) if wired. + // The upstream responded (we have a status), so status_class alone + // carries the non-2xx signal — error_class stays "none". + if let Some(ref ctx) = self.llm_metrics { + bs_metrics::record_llm_upstream( + &ctx.provider, + &ctx.model, + ctx.upstream_status, + metric_labels::LLM_ERR_NONE, + self.start_time.elapsed(), + ); + if let Some(v) = usage.prompt_tokens { + bs_metrics::record_llm_tokens( + &ctx.provider, + &ctx.model, + metric_labels::TOKEN_KIND_PROMPT, + v.max(0) as u64, + ); + } + if let Some(v) = usage.completion_tokens { + bs_metrics::record_llm_tokens( + &ctx.provider, + &ctx.model, + metric_labels::TOKEN_KIND_COMPLETION, + v.max(0) as u64, + ); + } + if usage.prompt_tokens.is_none() && usage.completion_tokens.is_none() { + bs_metrics::record_llm_tokens_usage_missing(&ctx.provider, &ctx.model); + } + self.metrics_recorded = true; + } // Release the buffered bytes early; nothing downstream needs them. 
self.response_buffer.clear(); self.response_buffer.shrink_to_fit(); @@ -396,6 +458,18 @@ impl StreamProcessor for ObservableStreamProcessor { duration_ms = self.start_time.elapsed().as_millis(), "stream error" ); + if let Some(ref ctx) = self.llm_metrics { + if !self.metrics_recorded { + bs_metrics::record_llm_upstream( + &ctx.provider, + &ctx.model, + ctx.upstream_status, + metric_labels::LLM_ERR_STREAM, + self.start_time.elapsed(), + ); + self.metrics_recorded = true; + } + } } } diff --git a/docs/source/guides/observability/monitoring.rst b/docs/source/guides/observability/monitoring.rst index 736e0a64..d28d25ca 100644 --- a/docs/source/guides/observability/monitoring.rst +++ b/docs/source/guides/observability/monitoring.rst @@ -75,3 +75,54 @@ are some sample configuration files for both, respectively. isDefault: true access: proxy editable: true + +Brightstaff metrics +~~~~~~~~~~~~~~~~~~~ + +In addition to Envoy's stats on ``:9901``, the brightstaff dataplane +process exposes its own Prometheus endpoint on ``0.0.0.0:9092`` (override +with ``METRICS_BIND_ADDRESS``). It publishes: + +* HTTP RED — ``brightstaff_http_requests_total``, + ``brightstaff_http_request_duration_seconds``, + ``brightstaff_http_in_flight_requests`` (labels: ``handler``, ``method``, + ``status_class``). +* LLM upstream — ``brightstaff_llm_upstream_requests_total``, + ``brightstaff_llm_upstream_duration_seconds``, + ``brightstaff_llm_time_to_first_token_seconds``, + ``brightstaff_llm_tokens_total`` (labels: ``provider``, ``model``, + ``error_class``, ``kind``). +* Routing — ``brightstaff_router_decisions_total``, + ``brightstaff_router_decision_duration_seconds``, + ``brightstaff_routing_service_requests_total``, + ``brightstaff_session_cache_events_total``. +* Process & build — ``process_resident_memory_bytes``, + ``process_cpu_seconds_total``, ``brightstaff_build_info``. + +A self-contained Prometheus + Grafana stack is shipped under +``config/grafana/``. 
With Plano already running on the host, bring it up +with one command: + +.. code-block:: bash + + cd config/grafana + docker compose up -d + open http://localhost:3000 # admin / admin (anonymous viewer also enabled) + +Grafana auto-loads the Prometheus datasource and the brightstaff +dashboard (look under the *Plano* folder). Prometheus scrapes the host's +``:9092`` and ``:9901`` via ``host.docker.internal``. + +Files: + +* ``config/grafana/docker-compose.yaml`` — one-command Prom + Grafana + stack with provisioning. +* ``config/grafana/prometheus_scrape.yaml`` — complete Prometheus config + with ``envoy`` and ``brightstaff`` scrape jobs (mounted by the + compose). +* ``config/grafana/brightstaff_dashboard.json`` — 19-panel dashboard + across HTTP RED, LLM upstream, Routing service, and Process & Envoy + link rows. Auto-provisioned by the compose; can also be imported by + hand via *Dashboards → New → Import*. +* ``config/grafana/provisioning/`` — Grafana provisioning files for the + datasource and dashboard provider. 
From 800222dc23d0ce5a15842ce875069366fcf0b256 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Thu, 23 Apr 2026 00:25:19 -0700 Subject: [PATCH 16/16] slim down to jemalloc + debug endpoint + stress tests only --- config/plano_config_schema.yaml | 8 - crates/brightstaff/src/handlers/debug.rs | 43 ---- crates/brightstaff/src/main.rs | 7 +- crates/brightstaff/src/state/memory.rs | 245 +--------------------- crates/brightstaff/src/state/mod.rs | 6 - crates/brightstaff/src/tracing/init.rs | 2 - crates/common/src/configuration.rs | 6 - tests/stress/routing_stress.py | 255 ----------------------- 8 files changed, 5 insertions(+), 567 deletions(-) delete mode 100644 tests/stress/routing_stress.py diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml index d17e5e12..3439ebee 100644 --- a/config/plano_config_schema.yaml +++ b/config/plano_config_schema.yaml @@ -477,14 +477,6 @@ properties: connection_string: type: string description: Required when type is postgres. Supports environment variable substitution using $VAR or ${VAR} syntax. - ttl_seconds: - type: integer - minimum: 60 - description: TTL in seconds for in-memory state entries. Only applies when type is memory. Default 1800 (30 min). - max_entries: - type: integer - minimum: 100 - description: Maximum number of in-memory state entries. Only applies when type is memory. Default 10000. 
additionalProperties: false required: - type diff --git a/crates/brightstaff/src/handlers/debug.rs b/crates/brightstaff/src/handlers/debug.rs index 84ef3b87..58fbecd2 100644 --- a/crates/brightstaff/src/handlers/debug.rs +++ b/crates/brightstaff/src/handlers/debug.rs @@ -1,10 +1,8 @@ use bytes::Bytes; use http_body_util::combinators::BoxBody; use hyper::{Response, StatusCode}; -use std::sync::Arc; use super::full; -use crate::app_state::AppState; #[derive(serde::Serialize)] struct MemStats { @@ -30,7 +28,6 @@ pub async fn memstats() -> Result>, hyper: fn get_jemalloc_stats() -> MemStats { use tikv_jemalloc_ctl::{epoch, stats}; - // Advance the jemalloc stats epoch so numbers are fresh. if let Err(e) = epoch::advance() { return MemStats { allocated_bytes: 0, @@ -54,43 +51,3 @@ fn get_jemalloc_stats() -> MemStats { error: Some("jemalloc feature not enabled".to_string()), } } - -#[derive(serde::Serialize)] -struct StateSize { - entry_count: usize, - estimated_bytes: usize, - #[serde(skip_serializing_if = "Option::is_none")] - error: Option, -} - -/// Returns the number of entries and estimated byte size in the conversation state store. 
-pub async fn state_size( - state: Arc, -) -> Result>, hyper::Error> { - let result = match &state.state_storage { - Some(storage) => match storage.entry_stats().await { - Ok((count, bytes)) => StateSize { - entry_count: count, - estimated_bytes: bytes, - error: None, - }, - Err(e) => StateSize { - entry_count: 0, - estimated_bytes: 0, - error: Some(format!("{e}")), - }, - }, - None => StateSize { - entry_count: 0, - estimated_bytes: 0, - error: Some("no state_storage configured".to_string()), - }, - }; - - let json = serde_json::to_string(&result).unwrap(); - Ok(Response::builder() - .status(StatusCode::OK) - .header("Content-Type", "application/json") - .body(full(json)) - .unwrap()) -} diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs index 9dfc9977..e24fa650 100644 --- a/crates/brightstaff/src/main.rs +++ b/crates/brightstaff/src/main.rs @@ -358,15 +358,11 @@ async fn init_state_storage( let storage: Arc = match storage_config.storage_type { common::configuration::StateStorageType::Memory => { - let ttl = storage_config.ttl_seconds.unwrap_or(1800); - let max = storage_config.max_entries.unwrap_or(10_000); info!( storage_type = "memory", - ttl_seconds = ttl, - max_entries = max, "initialized conversation state storage" ); - Arc::new(MemoryConversationalStorage::with_limits(ttl, max)) + Arc::new(MemoryConversationalStorage::new()) } common::configuration::StateStorageType::Postgres => { let connection_string = storage_config @@ -520,7 +516,6 @@ async fn dispatch( } (&Method::OPTIONS, "/v1/models" | "/agents/v1/models") => cors_preflight(), (&Method::GET, "/debug/memstats") => debug::memstats().await, - (&Method::GET, "/debug/state_size") => debug::state_size(Arc::clone(&state)).await, _ => { debug!(method = %req.method(), path = %path, "no route found"); let mut not_found = Response::new(empty()); diff --git a/crates/brightstaff/src/state/memory.rs b/crates/brightstaff/src/state/memory.rs index c0c851e8..be4d8232 100644 --- 
a/crates/brightstaff/src/state/memory.rs +++ b/crates/brightstaff/src/state/memory.rs @@ -3,98 +3,21 @@ use async_trait::async_trait; use std::collections::HashMap; use std::sync::Arc; use tokio::sync::RwLock; -use tracing::{debug, info, warn}; +use tracing::{debug, warn}; -const DEFAULT_TTL_SECS: u64 = 1800; // 30 minutes -const DEFAULT_MAX_ENTRIES: usize = 10_000; -const EVICTION_INTERVAL_SECS: u64 = 60; - -/// In-memory storage backend for conversation state. -/// -/// Entries are evicted when they exceed `ttl_secs` or when the store grows -/// beyond `max_entries` (oldest-first by `created_at`). A background task -/// runs every 60 s to sweep expired entries. +/// In-memory storage backend for conversation state +/// Uses a HashMap wrapped in Arc> for thread-safe access #[derive(Clone)] pub struct MemoryConversationalStorage { storage: Arc>>, - max_entries: usize, } impl MemoryConversationalStorage { pub fn new() -> Self { - Self::with_limits(DEFAULT_TTL_SECS, DEFAULT_MAX_ENTRIES) - } - - pub fn with_limits(ttl_secs: u64, max_entries: usize) -> Self { - let storage = Arc::new(RwLock::new(HashMap::new())); - - let bg_storage = Arc::clone(&storage); - tokio::spawn(async move { - Self::eviction_loop(bg_storage, ttl_secs, max_entries).await; - }); - Self { - storage, - max_entries, + storage: Arc::new(RwLock::new(HashMap::new())), } } - - async fn eviction_loop( - storage: Arc>>, - ttl_secs: u64, - max_entries: usize, - ) { - let interval = std::time::Duration::from_secs(EVICTION_INTERVAL_SECS); - loop { - tokio::time::sleep(interval).await; - - let now = chrono::Utc::now().timestamp(); - let cutoff = now - ttl_secs as i64; - - let mut map = storage.write().await; - let before = map.len(); - - // Phase 1: remove expired entries - map.retain(|_, state| state.created_at > cutoff); - - // Phase 2: if still over capacity, drop oldest entries - if map.len() > max_entries { - let mut entries: Vec<(String, i64)> = - map.iter().map(|(k, v)| (k.clone(), 
v.created_at)).collect(); - entries.sort_by_key(|(_, ts)| *ts); - - let to_remove = map.len() - max_entries; - for (key, _) in entries.into_iter().take(to_remove) { - map.remove(&key); - } - } - - let evicted = before.saturating_sub(map.len()); - if evicted > 0 { - info!( - evicted, - remaining = map.len(), - "memory state store eviction sweep" - ); - } - } - } - - fn estimate_entry_bytes(state: &OpenAIConversationState) -> usize { - let base = std::mem::size_of::() - + state.response_id.len() - + state.model.len() - + state.provider.len(); - let items: usize = state - .input_items - .iter() - .map(|item| { - // Rough estimate: serialize to JSON and use its length as a proxy - serde_json::to_string(item).map(|s| s.len()).unwrap_or(64) - }) - .sum(); - base + items - } } impl Default for MemoryConversationalStorage { @@ -115,26 +38,6 @@ impl StateStorage for MemoryConversationalStorage { ); storage.insert(response_id, state); - - // Inline cap check so we don't wait for the background sweep - if storage.len() > self.max_entries { - let mut entries: Vec<(String, i64)> = storage - .iter() - .map(|(k, v)| (k.clone(), v.created_at)) - .collect(); - entries.sort_by_key(|(_, ts)| *ts); - - let to_remove = storage.len() - self.max_entries; - for (key, _) in entries.into_iter().take(to_remove) { - storage.remove(&key); - } - info!( - evicted = to_remove, - remaining = storage.len(), - "memory state store cap eviction on put" - ); - } - Ok(()) } @@ -177,13 +80,6 @@ impl StateStorage for MemoryConversationalStorage { Err(StateStorageError::NotFound(response_id.to_string())) } } - - async fn entry_stats(&self) -> Result<(usize, usize), StateStorageError> { - let storage = self.storage.read().await; - let count = storage.len(); - let bytes: usize = storage.values().map(Self::estimate_entry_bytes).sum(); - Ok((count, bytes)) - } } #[cfg(test)] @@ -739,137 +635,4 @@ mod tests { _ => panic!("Expected MessageContent::Items"), } } - - // 
----------------------------------------------------------------------- - // Stress / eviction tests - // ----------------------------------------------------------------------- - - fn create_test_state_with_ts( - response_id: &str, - num_messages: usize, - created_at: i64, - ) -> OpenAIConversationState { - let mut state = create_test_state(response_id, num_messages); - state.created_at = created_at; - state - } - - #[tokio::test] - async fn test_max_entries_cap_evicts_oldest() { - let max = 100; - let storage = MemoryConversationalStorage::with_limits(3600, max); - - let now = chrono::Utc::now().timestamp(); - for i in 0..(max + 50) { - let state = - create_test_state_with_ts(&format!("resp_{i}"), 2, now - (max as i64) + i as i64); - storage.put(state).await.unwrap(); - } - - let (count, _) = storage.entry_stats().await.unwrap(); - assert_eq!(count, max, "store should be capped at max_entries"); - - // The oldest entries should have been evicted - assert!( - storage.get("resp_0").await.is_err(), - "oldest entry should be evicted" - ); - // The newest entry should still be present - let newest = format!("resp_{}", max + 49); - assert!( - storage.get(&newest).await.is_ok(), - "newest entry should be present" - ); - } - - #[tokio::test] - async fn test_ttl_eviction_removes_expired() { - let ttl_secs = 2; - let storage = MemoryConversationalStorage::with_limits(ttl_secs, 100_000); - - let now = chrono::Utc::now().timestamp(); - - // Insert entries that are already "old" (created_at in the past beyond TTL) - for i in 0..20 { - let state = - create_test_state_with_ts(&format!("old_{i}"), 1, now - (ttl_secs as i64) - 10); - storage.put(state).await.unwrap(); - } - - // Insert fresh entries - for i in 0..10 { - let state = create_test_state_with_ts(&format!("new_{i}"), 1, now); - storage.put(state).await.unwrap(); - } - - let (count_before, _) = storage.entry_stats().await.unwrap(); - assert_eq!(count_before, 30); - - // Wait for the eviction sweep (interval is 60s in 
prod, but the old - // entries are already past TTL so a manual trigger would clean them). - // For unit testing we directly call the eviction logic instead of waiting. - { - let bg_storage = storage.storage.clone(); - let cutoff = now - ttl_secs as i64; - let mut map = bg_storage.write().await; - map.retain(|_, state| state.created_at > cutoff); - } - - let (count_after, _) = storage.entry_stats().await.unwrap(); - assert_eq!( - count_after, 10, - "expired entries should be evicted, only fresh ones remain" - ); - - // Verify fresh entries are still present - for i in 0..10 { - assert!(storage.get(&format!("new_{i}")).await.is_ok()); - } - } - - #[tokio::test] - async fn test_entry_stats_reports_reasonable_size() { - let storage = MemoryConversationalStorage::with_limits(3600, 10_000); - - let now = chrono::Utc::now().timestamp(); - for i in 0..50 { - let state = create_test_state_with_ts(&format!("resp_{i}"), 5, now); - storage.put(state).await.unwrap(); - } - - let (count, bytes) = storage.entry_stats().await.unwrap(); - assert_eq!(count, 50); - assert!(bytes > 0, "estimated bytes should be positive"); - assert!( - bytes > 50 * 100, - "50 entries with 5 messages each should be at least a few KB" - ); - } - - #[tokio::test] - async fn test_high_concurrency_with_cap() { - let max = 500; - let storage = MemoryConversationalStorage::with_limits(3600, max); - let now = chrono::Utc::now().timestamp(); - - let mut handles = vec![]; - for i in 0..1000 { - let s = storage.clone(); - let handle = tokio::spawn(async move { - let state = create_test_state_with_ts(&format!("conc_{i}"), 3, now + i as i64); - s.put(state).await.unwrap(); - }); - handles.push(handle); - } - - for handle in handles { - handle.await.unwrap(); - } - - let (count, _) = storage.entry_stats().await.unwrap(); - assert!( - count <= max, - "store should never exceed max_entries, got {count}" - ); - } } diff --git a/crates/brightstaff/src/state/mod.rs b/crates/brightstaff/src/state/mod.rs index 
794568a6..43454ee2 100644 --- a/crates/brightstaff/src/state/mod.rs +++ b/crates/brightstaff/src/state/mod.rs @@ -75,12 +75,6 @@ pub trait StateStorage: Send + Sync { /// Delete state for a response_id (optional, for cleanup) async fn delete(&self, response_id: &str) -> Result<(), StateStorageError>; - /// Return (entry_count, estimated_bytes) for observability. - /// Backends that cannot cheaply compute this may return (0, 0). - async fn entry_stats(&self) -> Result<(usize, usize), StateStorageError> { - Ok((0, 0)) - } - fn merge( &self, prev_state: &OpenAIConversationState, diff --git a/crates/brightstaff/src/tracing/init.rs b/crates/brightstaff/src/tracing/init.rs index 1457d52a..ed351148 100644 --- a/crates/brightstaff/src/tracing/init.rs +++ b/crates/brightstaff/src/tracing/init.rs @@ -109,8 +109,6 @@ pub fn init_tracer(tracing_config: Option<&Tracing>) -> &'static SdkTracerProvid let provider = SdkTracerProvider::builder() .with_batch_exporter(exporter) - .with_max_attributes_per_span(64) - .with_max_events_per_span(16) .build(); global::set_tracer_provider(provider.clone()); diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs index 2fbdfdec..028c8046 100644 --- a/crates/common/src/configuration.rs +++ b/crates/common/src/configuration.rs @@ -123,12 +123,6 @@ pub struct StateStorageConfig { #[serde(rename = "type")] pub storage_type: StateStorageType, pub connection_string: Option, - /// TTL in seconds for in-memory state entries (default: 1800 = 30 min). - /// Only applies when type is `memory`. - pub ttl_seconds: Option, - /// Maximum number of in-memory state entries (default: 10000). - /// Only applies when type is `memory`. 
- pub max_entries: Option, } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] diff --git a/tests/stress/routing_stress.py b/tests/stress/routing_stress.py deleted file mode 100644 index 152630cb..00000000 --- a/tests/stress/routing_stress.py +++ /dev/null @@ -1,255 +0,0 @@ -#!/usr/bin/env python3 -""" -Stress test for Plano routing service to detect memory leaks. - -Sends sustained traffic to the routing endpoint and monitors memory -via the /debug/memstats and /debug/state_size endpoints. - -Usage: - # Against a local Plano instance (docker or native) - python routing_stress.py --base-url http://localhost:12000 - - # Custom parameters - python routing_stress.py \ - --base-url http://localhost:12000 \ - --num-requests 5000 \ - --concurrency 20 \ - --poll-interval 5 \ - --growth-threshold 3.0 - -Requirements: - pip install httpx -""" -from __future__ import annotations - -import argparse -import asyncio -import json -import sys -import time -import uuid -from dataclasses import dataclass, field - -import httpx - - -@dataclass -class MemSnapshot: - timestamp: float - allocated_bytes: int - resident_bytes: int - state_entries: int - state_bytes: int - requests_completed: int - - -@dataclass -class StressResult: - snapshots: list[MemSnapshot] = field(default_factory=list) - total_requests: int = 0 - total_errors: int = 0 - elapsed_secs: float = 0.0 - passed: bool = True - failure_reason: str = "" - - -def make_routing_body(unique: bool = True) -> dict: - """Build a minimal chat-completions body for the routing endpoint.""" - return { - "model": "gpt-4o", - "messages": [ - {"role": "user", "content": f"test message {uuid.uuid4() if unique else 'static'}"} - ], - } - - -async def poll_debug_endpoints( - client: httpx.AsyncClient, - base_url: str, - requests_completed: int, -) -> MemSnapshot | None: - try: - mem_resp = await client.get(f"{base_url}/debug/memstats", timeout=5) - mem_data = mem_resp.json() - - state_resp = await 
client.get(f"{base_url}/debug/state_size", timeout=5) - state_data = state_resp.json() - - return MemSnapshot( - timestamp=time.time(), - allocated_bytes=mem_data.get("allocated_bytes", 0), - resident_bytes=mem_data.get("resident_bytes", 0), - state_entries=state_data.get("entry_count", 0), - state_bytes=state_data.get("estimated_bytes", 0), - requests_completed=requests_completed, - ) - except Exception as e: - print(f" [warn] failed to poll debug endpoints: {e}", file=sys.stderr) - return None - - -async def send_requests( - client: httpx.AsyncClient, - url: str, - count: int, - semaphore: asyncio.Semaphore, - counter: dict, -): - """Send `count` routing requests, respecting the concurrency semaphore.""" - for _ in range(count): - async with semaphore: - try: - body = make_routing_body(unique=True) - resp = await client.post(url, json=body, timeout=30) - if resp.status_code >= 400: - counter["errors"] += 1 - except Exception: - counter["errors"] += 1 - finally: - counter["completed"] += 1 - - -async def run_stress_test( - base_url: str, - num_requests: int, - concurrency: int, - poll_interval: float, - growth_threshold: float, -) -> StressResult: - result = StressResult() - routing_url = f"{base_url}/routing/v1/chat/completions" - - print(f"Stress test config:") - print(f" base_url: {base_url}") - print(f" routing_url: {routing_url}") - print(f" num_requests: {num_requests}") - print(f" concurrency: {concurrency}") - print(f" poll_interval: {poll_interval}s") - print(f" growth_threshold: {growth_threshold}x") - print() - - async with httpx.AsyncClient() as client: - # Take baseline snapshot - baseline = await poll_debug_endpoints(client, base_url, 0) - if baseline: - result.snapshots.append(baseline) - print(f"[baseline] allocated={baseline.allocated_bytes:,}B " - f"resident={baseline.resident_bytes:,}B " - f"state_entries={baseline.state_entries}") - else: - print("[warn] could not get baseline snapshot, continuing anyway") - - counter = {"completed": 0, 
"errors": 0} - semaphore = asyncio.Semaphore(concurrency) - - start = time.time() - - # Launch request sender and poller concurrently - sender = asyncio.create_task( - send_requests(client, routing_url, num_requests, semaphore, counter) - ) - - # Poll memory while requests are in flight - while not sender.done(): - await asyncio.sleep(poll_interval) - snapshot = await poll_debug_endpoints(client, base_url, counter["completed"]) - if snapshot: - result.snapshots.append(snapshot) - print( - f" [{counter['completed']:>6}/{num_requests}] " - f"allocated={snapshot.allocated_bytes:,}B " - f"resident={snapshot.resident_bytes:,}B " - f"state_entries={snapshot.state_entries} " - f"state_bytes={snapshot.state_bytes:,}B" - ) - - await sender - result.elapsed_secs = time.time() - start - result.total_requests = counter["completed"] - result.total_errors = counter["errors"] - - # Final snapshot - final = await poll_debug_endpoints(client, base_url, counter["completed"]) - if final: - result.snapshots.append(final) - - # Analyze results - print() - print(f"Completed {result.total_requests} requests in {result.elapsed_secs:.1f}s " - f"({result.total_errors} errors)") - - if len(result.snapshots) >= 2: - first = result.snapshots[0] - last = result.snapshots[-1] - - if first.resident_bytes > 0: - growth_ratio = last.resident_bytes / first.resident_bytes - print(f"Memory growth: {first.resident_bytes:,}B -> {last.resident_bytes:,}B " - f"({growth_ratio:.2f}x)") - - if growth_ratio > growth_threshold: - result.passed = False - result.failure_reason = ( - f"Memory grew {growth_ratio:.2f}x (threshold: {growth_threshold}x). " - f"Likely memory leak detected." 
- ) - print(f"FAIL: {result.failure_reason}") - else: - print(f"PASS: Memory growth {growth_ratio:.2f}x is within {growth_threshold}x threshold") - - print(f"State store: {last.state_entries} entries, {last.state_bytes:,}B") - else: - print("[warn] not enough snapshots to analyze memory growth") - - return result - - -def main(): - parser = argparse.ArgumentParser(description="Plano routing service stress test") - parser.add_argument("--base-url", default="http://localhost:12000", - help="Base URL of the Plano instance") - parser.add_argument("--num-requests", type=int, default=2000, - help="Total number of requests to send") - parser.add_argument("--concurrency", type=int, default=10, - help="Max concurrent requests") - parser.add_argument("--poll-interval", type=float, default=5.0, - help="Seconds between memory polls") - parser.add_argument("--growth-threshold", type=float, default=3.0, - help="Max allowed memory growth ratio (fail if exceeded)") - args = parser.parse_args() - - result = asyncio.run(run_stress_test( - base_url=args.base_url, - num_requests=args.num_requests, - concurrency=args.concurrency, - poll_interval=args.poll_interval, - growth_threshold=args.growth_threshold, - )) - - # Write results JSON for CI consumption - report = { - "passed": result.passed, - "failure_reason": result.failure_reason, - "total_requests": result.total_requests, - "total_errors": result.total_errors, - "elapsed_secs": result.elapsed_secs, - "snapshots": [ - { - "timestamp": s.timestamp, - "allocated_bytes": s.allocated_bytes, - "resident_bytes": s.resident_bytes, - "state_entries": s.state_entries, - "state_bytes": s.state_bytes, - "requests_completed": s.requests_completed, - } - for s in result.snapshots - ], - } - print() - print(json.dumps(report, indent=2)) - - sys.exit(0 if result.passed else 1) - - -if __name__ == "__main__": - main()