From 028a2cd19661d82e912a7ed02177fe1f3399d4bf Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Mon, 9 Mar 2026 16:32:16 -0700 Subject: [PATCH 1/8] add routing service (#814) fixes https://github.com/katanemo/plano/issues/810 --- crates/brightstaff/src/handlers/mod.rs | 1 + .../brightstaff/src/handlers/router_chat.rs | 9 +- .../src/handlers/routing_service.rs | 163 ++++++++++++++++++ crates/brightstaff/src/main.rs | 23 ++- .../model_routing_service/README.md | 92 ++++++++++ .../model_routing_service/config.yaml | 27 +++ .../llm_routing/model_routing_service/demo.sh | 65 +++++++ 7 files changed, 375 insertions(+), 5 deletions(-) create mode 100644 crates/brightstaff/src/handlers/routing_service.rs create mode 100644 demos/llm_routing/model_routing_service/README.md create mode 100644 demos/llm_routing/model_routing_service/config.yaml create mode 100755 demos/llm_routing/model_routing_service/demo.sh diff --git a/crates/brightstaff/src/handlers/mod.rs b/crates/brightstaff/src/handlers/mod.rs index 0bbd3454..9c602e93 100644 --- a/crates/brightstaff/src/handlers/mod.rs +++ b/crates/brightstaff/src/handlers/mod.rs @@ -7,6 +7,7 @@ pub mod models; pub mod pipeline_processor; pub mod response_handler; pub mod router_chat; +pub mod routing_service; pub mod utils; #[cfg(test)] diff --git a/crates/brightstaff/src/handlers/router_chat.rs b/crates/brightstaff/src/handlers/router_chat.rs index d71734fa..345632fc 100644 --- a/crates/brightstaff/src/handlers/router_chat.rs +++ b/crates/brightstaff/src/handlers/router_chat.rs @@ -10,6 +10,7 @@ use crate::tracing::routing; pub struct RoutingResult { pub model_name: String, + pub route_name: Option, } pub struct RoutingError { @@ -133,9 +134,12 @@ pub async fn router_chat_get_upstream_model( match routing_result { Ok(route) => match route { - Some((_, model_name)) => { + Some((route_name, model_name)) => { current_span.record("route.selected_model", model_name.as_str()); - Ok(RoutingResult { model_name }) + Ok(RoutingResult { + 
model_name, + route_name: Some(route_name), + }) } None => { // No route determined, return sentinel value "none" @@ -145,6 +149,7 @@ pub async fn router_chat_get_upstream_model( Ok(RoutingResult { model_name: "none".to_string(), + route_name: None, }) } }, diff --git a/crates/brightstaff/src/handlers/routing_service.rs b/crates/brightstaff/src/handlers/routing_service.rs new file mode 100644 index 00000000..32f37a08 --- /dev/null +++ b/crates/brightstaff/src/handlers/routing_service.rs @@ -0,0 +1,163 @@ +use bytes::Bytes; +use common::configuration::SpanAttributes; +use common::consts::{REQUEST_ID_HEADER, TRACE_PARENT_HEADER}; +use common::errors::BrightStaffError; +use hermesllm::clients::SupportedAPIsFromClient; +use hermesllm::ProviderRequestType; +use http_body_util::combinators::BoxBody; +use http_body_util::{BodyExt, Full}; +use hyper::{Request, Response, StatusCode}; +use std::sync::Arc; +use tracing::{debug, info, info_span, warn, Instrument}; + +use crate::handlers::router_chat::router_chat_get_upstream_model; +use crate::router::llm_router::RouterService; +use crate::tracing::{collect_custom_trace_attributes, operation_component, set_service_name}; + +#[derive(serde::Serialize)] +struct RoutingDecisionResponse { + model: String, + route: Option, + trace_id: String, +} + +pub async fn routing_decision( + request: Request, + router_service: Arc, + request_path: String, + span_attributes: Arc>, +) -> Result>, hyper::Error> { + let request_headers = request.headers().clone(); + let request_id: String = request_headers + .get(REQUEST_ID_HEADER) + .and_then(|h| h.to_str().ok()) + .map(|s| s.to_string()) + .unwrap_or_else(|| uuid::Uuid::new_v4().to_string()); + + let custom_attrs = + collect_custom_trace_attributes(&request_headers, span_attributes.as_ref().as_ref()); + + let request_span = info_span!( + "routing_decision", + component = "routing", + request_id = %request_id, + http.method = %request.method(), + http.path = %request_path, + ); + + 
routing_decision_inner( + request, + router_service, + request_id, + request_path, + request_headers, + custom_attrs, + ) + .instrument(request_span) + .await +} + +async fn routing_decision_inner( + request: Request, + router_service: Arc, + request_id: String, + request_path: String, + request_headers: hyper::HeaderMap, + custom_attrs: std::collections::HashMap, +) -> Result>, hyper::Error> { + set_service_name(operation_component::ROUTING); + opentelemetry::trace::get_active_span(|span| { + for (key, value) in &custom_attrs { + span.set_attribute(opentelemetry::KeyValue::new(key.clone(), value.clone())); + } + }); + + // Extract or generate traceparent + let traceparent: String = match request_headers + .get(TRACE_PARENT_HEADER) + .and_then(|h| h.to_str().ok()) + .map(|s| s.to_string()) + { + Some(tp) => tp, + None => { + let trace_id = uuid::Uuid::new_v4().to_string().replace("-", ""); + let generated_tp = format!("00-{}-0000000000000000-01", trace_id); + warn!( + generated_traceparent = %generated_tp, + "TRACE_PARENT header missing, generated new traceparent" + ); + generated_tp + } + }; + + // Extract trace_id from traceparent (format: 00-{trace_id}-{span_id}-{flags}) + let trace_id = traceparent + .split('-') + .nth(1) + .unwrap_or("unknown") + .to_string(); + + // Parse request body + let chat_request_bytes = request.collect().await?.to_bytes(); + + debug!( + body = %String::from_utf8_lossy(&chat_request_bytes), + "routing decision request body received" + ); + + let client_request = match ProviderRequestType::try_from(( + &chat_request_bytes[..], + &SupportedAPIsFromClient::from_endpoint(request_path.as_str()).unwrap(), + )) { + Ok(request) => request, + Err(err) => { + warn!(error = %err, "failed to parse request for routing decision"); + return Ok(BrightStaffError::InvalidRequest(format!( + "Failed to parse request: {}", + err + )) + .into_response()); + } + }; + + // Call the existing routing logic + let routing_result = router_chat_get_upstream_model( 
+ router_service, + client_request, + &traceparent, + &request_path, + &request_id, + ) + .await; + + match routing_result { + Ok(result) => { + let response = RoutingDecisionResponse { + model: result.model_name, + route: result.route_name, + trace_id, + }; + + info!( + model = %response.model, + route = ?response.route, + "routing decision completed" + ); + + let json = serde_json::to_string(&response).unwrap(); + let body = Full::new(Bytes::from(json)) + .map_err(|never| match never {}) + .boxed(); + + Ok(Response::builder() + .status(StatusCode::OK) + .header("Content-Type", "application/json") + .body(body) + .unwrap()) + } + Err(err) => { + warn!(error = %err.message, "routing decision failed"); + Ok(BrightStaffError::InternalServerError(err.message).into_response()) + } + } +} diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs index 97345556..51c9127f 100644 --- a/crates/brightstaff/src/main.rs +++ b/crates/brightstaff/src/main.rs @@ -2,6 +2,7 @@ use brightstaff::handlers::agent_chat_completions::agent_chat; use brightstaff::handlers::function_calling::function_calling_chat_handler; use brightstaff::handlers::llm::llm_chat; use brightstaff::handlers::models::list_models; +use brightstaff::handlers::routing_service::routing_decision; use brightstaff::router::llm_router::RouterService; use brightstaff::router::plano_orchestrator::OrchestratorService; use brightstaff::state::memory::MemoryConversationalStorage; @@ -194,7 +195,7 @@ async fn main() -> Result<(), Box> { let state_storage = state_storage.clone(); async move { - let path = req.uri().path(); + let path = req.uri().path().to_string(); // Check if path starts with /agents if path.starts_with("/agents") { // Check if it matches one of the agent API paths @@ -217,7 +218,23 @@ async fn main() -> Result<(), Box> { .await; } } - match (req.method(), path) { + if let Some(stripped_path) = path.strip_prefix("/routing") { + let stripped_path = stripped_path.to_string(); + if matches!( 
+ stripped_path.as_str(), + CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH + ) { + return routing_decision( + req, + router_service, + stripped_path, + span_attributes, + ) + .with_context(parent_cx) + .await; + } + } + match (req.method(), path.as_str()) { ( &Method::POST, CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH, @@ -270,7 +287,7 @@ async fn main() -> Result<(), Box> { Ok(response) } _ => { - debug!(method = %req.method(), path = %req.uri().path(), "no route found"); + debug!(method = %req.method(), path = %path, "no route found"); let mut not_found = Response::new(empty()); *not_found.status_mut() = StatusCode::NOT_FOUND; Ok(not_found) diff --git a/demos/llm_routing/model_routing_service/README.md b/demos/llm_routing/model_routing_service/README.md new file mode 100644 index 00000000..85d56abf --- /dev/null +++ b/demos/llm_routing/model_routing_service/README.md @@ -0,0 +1,92 @@ +# Model Routing Service Demo + +This demo shows how to use the `/routing/v1/*` endpoints to get routing decisions without proxying requests to an LLM. The endpoint accepts standard LLM request formats and returns which model Plano's router would select. + +## Setup + +Make sure you have Plano CLI installed (`pip install planoai` or `uv tool install planoai`). 
+ +```bash +export OPENAI_API_KEY= +export ANTHROPIC_API_KEY= +``` + +Start Plano: +```bash +cd demos/llm_routing/model_routing_service +planoai up config.yaml +``` + +## Run the demo + +```bash +./demo.sh +``` + +## Endpoints + +All three LLM API formats are supported: + +| Endpoint | Format | +|---|---| +| `POST /routing/v1/chat/completions` | OpenAI Chat Completions | +| `POST /routing/v1/messages` | Anthropic Messages | +| `POST /routing/v1/responses` | OpenAI Responses API | + +## Example + +```bash +curl http://localhost:12000/routing/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [{"role": "user", "content": "Write a Python function for binary search"}] + }' +``` + +Response: +```json +{ + "model": "anthropic/claude-sonnet-4-20250514", + "route": "code_generation", + "trace_id": "c16d1096c1af4a17abb48fb182918a88" +} +``` + +The response tells you which model would handle this request and which route was matched, without actually making the LLM call. + +## Demo Output + +``` +=== Model Routing Service Demo === + +--- 1. Code generation query (OpenAI format) --- +{ + "model": "anthropic/claude-sonnet-4-20250514", + "route": "code_generation", + "trace_id": "c16d1096c1af4a17abb48fb182918a88" +} + +--- 2. Complex reasoning query (OpenAI format) --- +{ + "model": "openai/gpt-4o", + "route": "complex_reasoning", + "trace_id": "30795e228aff4d7696f082ed01b75ad4" +} + +--- 3. Simple query - no routing match (OpenAI format) --- +{ + "model": "none", + "route": null, + "trace_id": "ae0b6c3b220d499fb5298ac63f4eac0e" +} + +--- 4. 
Code generation query (Anthropic format) --- +{ + "model": "anthropic/claude-sonnet-4-20250514", + "route": "code_generation", + "trace_id": "26be822bbdf14a3ba19fe198e55ea4a9" +} + +=== Demo Complete === +``` diff --git a/demos/llm_routing/model_routing_service/config.yaml b/demos/llm_routing/model_routing_service/config.yaml new file mode 100644 index 00000000..7b98b25b --- /dev/null +++ b/demos/llm_routing/model_routing_service/config.yaml @@ -0,0 +1,27 @@ +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: complex_reasoning + description: complex reasoning tasks, multi-step analysis, or detailed explanations + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + routing_preferences: + - name: code_generation + description: generating new code, writing functions, or creating boilerplate + +tracing: + random_sampling: 100 diff --git a/demos/llm_routing/model_routing_service/demo.sh b/demos/llm_routing/model_routing_service/demo.sh new file mode 100755 index 00000000..3e9b0584 --- /dev/null +++ b/demos/llm_routing/model_routing_service/demo.sh @@ -0,0 +1,65 @@ +#!/bin/bash +set -e + +PLANO_URL="${PLANO_URL:-http://localhost:12000}" + +echo "=== Model Routing Service Demo ===" +echo "" +echo "This demo shows how to use the /routing/v1/* endpoints to get" +echo "routing decisions without actually proxying the request to an LLM." +echo "" + +# --- Example 1: OpenAI Chat Completions format --- +echo "--- 1. 
Code generation query (OpenAI format) ---" +echo "" +curl -s "$PLANO_URL/routing/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [ + {"role": "user", "content": "Write a Python function that implements binary search on a sorted array"} + ] + }' | python3 -m json.tool +echo "" + +# --- Example 2: Complex reasoning query --- +echo "--- 2. Complex reasoning query (OpenAI format) ---" +echo "" +curl -s "$PLANO_URL/routing/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [ + {"role": "user", "content": "Explain the trade-offs between microservices and monolithic architectures, considering scalability, team structure, and operational complexity"} + ] + }' | python3 -m json.tool +echo "" + +# --- Example 3: Simple query (no routing match) --- +echo "--- 3. Simple query - no routing match (OpenAI format) ---" +echo "" +curl -s "$PLANO_URL/routing/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [ + {"role": "user", "content": "What is the capital of France?"} + ] + }' | python3 -m json.tool +echo "" + +# --- Example 4: Anthropic Messages format --- +echo "--- 4. 
Code generation query (Anthropic format) ---" +echo "" +curl -s "$PLANO_URL/routing/v1/messages" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "Create a REST API endpoint in Rust using actix-web that handles user registration"} + ] + }' | python3 -m json.tool +echo "" + +echo "=== Demo Complete ===" From 97b7a390efe323a0e4ef8bb21cdcb4c9a1334f6d Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Tue, 10 Mar 2026 12:23:18 -0700 Subject: [PATCH 2/8] support inline routing_policy in request body (#811) (#815) --- crates/brightstaff/src/handlers/llm.rs | 19 +- .../brightstaff/src/handlers/router_chat.rs | 26 ++- .../src/handlers/routing_service.rs | 202 +++++++++++++++++- .../llm_routing/model_routing_service/demo.sh | 55 +++++ 4 files changed, 286 insertions(+), 16 deletions(-) diff --git a/crates/brightstaff/src/handlers/llm.rs b/crates/brightstaff/src/handlers/llm.rs index ee41dd2d..b03d4d29 100644 --- a/crates/brightstaff/src/handlers/llm.rs +++ b/crates/brightstaff/src/handlers/llm.rs @@ -126,13 +126,27 @@ async fn llm_chat_inner( } }; - let chat_request_bytes = request.collect().await?.to_bytes(); + let raw_bytes = request.collect().await?.to_bytes(); debug!( - body = %String::from_utf8_lossy(&chat_request_bytes), + body = %String::from_utf8_lossy(&raw_bytes), "request body received" ); + // Extract routing_policy from request body if present + let (chat_request_bytes, inline_routing_policy) = + match crate::handlers::routing_service::extract_routing_policy(&raw_bytes, false) { + Ok(result) => result, + Err(err) => { + warn!(error = %err, "failed to parse request JSON"); + return Ok(BrightStaffError::InvalidRequest(format!( + "Failed to parse request: {}", + err + )) + .into_response()); + } + }; + let mut client_request = match ProviderRequestType::try_from(( &chat_request_bytes[..], &SupportedAPIsFromClient::from_endpoint(request_path.as_str()).unwrap(), @@ -335,6 
+349,7 @@ async fn llm_chat_inner( &traceparent, &request_path, &request_id, + inline_routing_policy, ) .await } diff --git a/crates/brightstaff/src/handlers/router_chat.rs b/crates/brightstaff/src/handlers/router_chat.rs index 345632fc..910e5408 100644 --- a/crates/brightstaff/src/handlers/router_chat.rs +++ b/crates/brightstaff/src/handlers/router_chat.rs @@ -38,6 +38,7 @@ pub async fn router_chat_get_upstream_model( traceparent: &str, request_path: &str, request_id: &str, + inline_usage_preferences: Option>, ) -> Result { // Clone metadata for routing before converting (which consumes client_request) let routing_metadata = client_request.metadata().clone(); @@ -76,16 +77,21 @@ pub async fn router_chat_get_upstream_model( "router request" ); - // Extract usage preferences from metadata - let usage_preferences_str: Option = routing_metadata.as_ref().and_then(|metadata| { - metadata - .get("plano_preference_config") - .map(|value| value.to_string()) - }); - - let usage_preferences: Option> = usage_preferences_str - .as_ref() - .and_then(|s| serde_yaml::from_str(s).ok()); + // Use inline preferences if provided, otherwise fall back to metadata extraction + let usage_preferences: Option> = if inline_usage_preferences.is_some() + { + inline_usage_preferences + } else { + let usage_preferences_str: Option = + routing_metadata.as_ref().and_then(|metadata| { + metadata + .get("plano_preference_config") + .map(|value| value.to_string()) + }); + usage_preferences_str + .as_ref() + .and_then(|s| serde_yaml::from_str(s).ok()) + }; // Prepare log message with latest message from chat request let latest_message_for_log = chat_request diff --git a/crates/brightstaff/src/handlers/routing_service.rs b/crates/brightstaff/src/handlers/routing_service.rs index 32f37a08..4eae4685 100644 --- a/crates/brightstaff/src/handlers/routing_service.rs +++ b/crates/brightstaff/src/handlers/routing_service.rs @@ -1,5 +1,5 @@ use bytes::Bytes; -use common::configuration::SpanAttributes; +use 
common::configuration::{ModelUsagePreference, SpanAttributes}; use common::consts::{REQUEST_ID_HEADER, TRACE_PARENT_HEADER}; use common::errors::BrightStaffError; use hermesllm::clients::SupportedAPIsFromClient; @@ -14,6 +14,53 @@ use crate::handlers::router_chat::router_chat_get_upstream_model; use crate::router::llm_router::RouterService; use crate::tracing::{collect_custom_trace_attributes, operation_component, set_service_name}; +const ROUTING_POLICY_SIZE_WARNING_BYTES: usize = 5120; + +/// Extracts `routing_policy` from a JSON body, returning the cleaned body bytes +/// and parsed preferences. The `routing_policy` field is removed from the JSON +/// before re-serializing so downstream parsers don't see the non-standard field. +/// +/// If `warn_on_size` is true, logs a warning when the serialized policy exceeds 5KB. +pub fn extract_routing_policy( + raw_bytes: &[u8], + warn_on_size: bool, +) -> Result<(Bytes, Option>), String> { + let mut json_body: serde_json::Value = serde_json::from_slice(raw_bytes) + .map_err(|err| format!("Failed to parse JSON: {}", err))?; + + let preferences = json_body + .as_object_mut() + .and_then(|obj| obj.remove("routing_policy")) + .and_then(|policy_value| { + if warn_on_size { + let policy_str = serde_json::to_string(&policy_value).unwrap_or_default(); + if policy_str.len() > ROUTING_POLICY_SIZE_WARNING_BYTES { + warn!( + size_bytes = policy_str.len(), + limit_bytes = ROUTING_POLICY_SIZE_WARNING_BYTES, + "routing_policy exceeds recommended size limit" + ); + } + } + match serde_json::from_value::>(policy_value) { + Ok(prefs) => { + info!( + num_models = prefs.len(), + "using inline routing_policy from request body" + ); + Some(prefs) + } + Err(err) => { + warn!(error = %err, "failed to parse routing_policy"); + None + } + } + }); + + let bytes = Bytes::from(serde_json::to_vec(&json_body).unwrap()); + Ok((bytes, preferences)) +} + #[derive(serde::Serialize)] struct RoutingDecisionResponse { model: String, @@ -98,13 +145,26 @@ 
async fn routing_decision_inner( .to_string(); // Parse request body - let chat_request_bytes = request.collect().await?.to_bytes(); + let raw_bytes = request.collect().await?.to_bytes(); debug!( - body = %String::from_utf8_lossy(&chat_request_bytes), + body = %String::from_utf8_lossy(&raw_bytes), "routing decision request body received" ); + // Extract routing_policy from request body before parsing as ProviderRequestType + let (chat_request_bytes, inline_preferences) = match extract_routing_policy(&raw_bytes, true) { + Ok(result) => result, + Err(err) => { + warn!(error = %err, "failed to parse request JSON"); + return Ok(BrightStaffError::InvalidRequest(format!( + "Failed to parse request JSON: {}", + err + )) + .into_response()); + } + }; + let client_request = match ProviderRequestType::try_from(( &chat_request_bytes[..], &SupportedAPIsFromClient::from_endpoint(request_path.as_str()).unwrap(), @@ -120,13 +180,14 @@ async fn routing_decision_inner( } }; - // Call the existing routing logic + // Call the existing routing logic with inline preferences let routing_result = router_chat_get_upstream_model( router_service, client_request, &traceparent, &request_path, &request_id, + inline_preferences, ) .await; @@ -161,3 +222,136 @@ async fn routing_decision_inner( } } } + +#[cfg(test)] +mod tests { + use super::*; + + fn make_chat_body(extra_fields: &str) -> Vec { + let extra = if extra_fields.is_empty() { + String::new() + } else { + format!(", {}", extra_fields) + }; + format!( + r#"{{"model": "gpt-4o-mini", "messages": [{{"role": "user", "content": "hello"}}]{}}}"#, + extra + ) + .into_bytes() + } + + #[test] + fn extract_routing_policy_no_policy() { + let body = make_chat_body(""); + let (cleaned, prefs) = extract_routing_policy(&body, false).unwrap(); + + assert!(prefs.is_none()); + let cleaned_json: serde_json::Value = serde_json::from_slice(&cleaned).unwrap(); + assert_eq!(cleaned_json["model"], "gpt-4o-mini"); + 
assert!(cleaned_json.get("routing_policy").is_none()); + } + + #[test] + fn extract_routing_policy_valid_policy() { + let policy = r#""routing_policy": [ + { + "model": "openai/gpt-4o", + "routing_preferences": [ + {"name": "coding", "description": "code generation tasks"} + ] + }, + { + "model": "openai/gpt-4o-mini", + "routing_preferences": [ + {"name": "general", "description": "general questions"} + ] + } + ]"#; + let body = make_chat_body(policy); + let (cleaned, prefs) = extract_routing_policy(&body, false).unwrap(); + + let prefs = prefs.expect("should have parsed preferences"); + assert_eq!(prefs.len(), 2); + assert_eq!(prefs[0].model, "openai/gpt-4o"); + assert_eq!(prefs[0].routing_preferences[0].name, "coding"); + assert_eq!(prefs[1].model, "openai/gpt-4o-mini"); + assert_eq!(prefs[1].routing_preferences[0].name, "general"); + + // routing_policy should be stripped from cleaned body + let cleaned_json: serde_json::Value = serde_json::from_slice(&cleaned).unwrap(); + assert!(cleaned_json.get("routing_policy").is_none()); + assert_eq!(cleaned_json["model"], "gpt-4o-mini"); + } + + #[test] + fn extract_routing_policy_invalid_policy_returns_none() { + // routing_policy is present but has wrong shape + let policy = r#""routing_policy": "not-an-array""#; + let body = make_chat_body(policy); + let (cleaned, prefs) = extract_routing_policy(&body, false).unwrap(); + + // Invalid policy should be ignored (returns None), not error + assert!(prefs.is_none()); + // routing_policy should still be stripped from cleaned body + let cleaned_json: serde_json::Value = serde_json::from_slice(&cleaned).unwrap(); + assert!(cleaned_json.get("routing_policy").is_none()); + } + + #[test] + fn extract_routing_policy_invalid_json_returns_error() { + let body = b"not valid json"; + let result = extract_routing_policy(body, false); + assert!(result.is_err()); + assert!(result.unwrap_err().contains("Failed to parse JSON")); + } + + #[test] + fn extract_routing_policy_empty_array() { + 
let policy = r#""routing_policy": []"#; + let body = make_chat_body(policy); + let (_, prefs) = extract_routing_policy(&body, false).unwrap(); + + let prefs = prefs.expect("empty array is valid"); + assert_eq!(prefs.len(), 0); + } + + #[test] + fn extract_routing_policy_preserves_other_fields() { + let policy = r#""routing_policy": [{"model": "gpt-4o", "routing_preferences": [{"name": "test", "description": "test"}]}], "temperature": 0.5, "max_tokens": 100"#; + let body = make_chat_body(policy); + let (cleaned, prefs) = extract_routing_policy(&body, false).unwrap(); + + assert!(prefs.is_some()); + let cleaned_json: serde_json::Value = serde_json::from_slice(&cleaned).unwrap(); + assert_eq!(cleaned_json["temperature"], 0.5); + assert_eq!(cleaned_json["max_tokens"], 100); + assert!(cleaned_json.get("routing_policy").is_none()); + } + + #[test] + fn routing_decision_response_serialization() { + let response = RoutingDecisionResponse { + model: "openai/gpt-4o".to_string(), + route: Some("code_generation".to_string()), + trace_id: "abc123".to_string(), + }; + let json = serde_json::to_string(&response).unwrap(); + let parsed: serde_json::Value = serde_json::from_str(&json).unwrap(); + assert_eq!(parsed["model"], "openai/gpt-4o"); + assert_eq!(parsed["route"], "code_generation"); + assert_eq!(parsed["trace_id"], "abc123"); + } + + #[test] + fn routing_decision_response_serialization_no_route() { + let response = RoutingDecisionResponse { + model: "none".to_string(), + route: None, + trace_id: "abc123".to_string(), + }; + let json = serde_json::to_string(&response).unwrap(); + let parsed: serde_json::Value = serde_json::from_str(&json).unwrap(); + assert_eq!(parsed["model"], "none"); + assert!(parsed["route"].is_null()); + } +} diff --git a/demos/llm_routing/model_routing_service/demo.sh b/demos/llm_routing/model_routing_service/demo.sh index 3e9b0584..0c3fdc5d 100755 --- a/demos/llm_routing/model_routing_service/demo.sh +++ 
b/demos/llm_routing/model_routing_service/demo.sh @@ -62,4 +62,59 @@ curl -s "$PLANO_URL/routing/v1/messages" \ }' | python3 -m json.tool echo "" +# --- Example 5: Inline routing policy in request body --- +echo "--- 5. Inline routing_policy (no config needed) ---" +echo "" +curl -s "$PLANO_URL/routing/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [ + {"role": "user", "content": "Write a quicksort implementation in Go"} + ], + "routing_policy": [ + { + "model": "openai/gpt-4o", + "routing_preferences": [ + {"name": "coding", "description": "code generation, writing functions, debugging"} + ] + }, + { + "model": "openai/gpt-4o-mini", + "routing_preferences": [ + {"name": "general", "description": "general questions, simple lookups, casual conversation"} + ] + } + ] + }' | python3 -m json.tool +echo "" + +# --- Example 6: Inline routing policy with Anthropic format --- +echo "--- 6. Inline routing_policy (Anthropic format) ---" +echo "" +curl -s "$PLANO_URL/routing/v1/messages" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "What is the weather like today?"} + ], + "routing_policy": [ + { + "model": "openai/gpt-4o", + "routing_preferences": [ + {"name": "coding", "description": "code generation, writing functions, debugging"} + ] + }, + { + "model": "openai/gpt-4o-mini", + "routing_preferences": [ + {"name": "general", "description": "general questions, simple lookups, casual conversation"} + ] + } + ] + }' | python3 -m json.tool +echo "" + echo "=== Demo Complete ===" From 5189f7907a94fc8fbb43ae0b658116866fc48b5f Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Tue, 10 Mar 2026 12:27:31 -0700 Subject: [PATCH 3/8] add k8s deploy guide (#816) --- docs/source/resources/deployment.rst | 188 +++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) diff --git a/docs/source/resources/deployment.rst 
b/docs/source/resources/deployment.rst index 71452ea3..7b8b0554 100644 --- a/docs/source/resources/deployment.rst +++ b/docs/source/resources/deployment.rst @@ -100,6 +100,194 @@ You can also use the CLI with Docker mode: planoai up plano_config.yaml --docker planoai down --docker +Kubernetes Deployment +--------------------- + +Plano runs as a single container in Kubernetes. The container bundles Envoy, WASM plugins, and brightstaff, managed by supervisord internally. Deploy it as a standard Kubernetes Deployment with your ``plano_config.yaml`` mounted via a ConfigMap and API keys injected via a Secret. + +.. note:: + All environment variables referenced in your ``plano_config.yaml`` (e.g. ``$OPENAI_API_KEY``) must be set in the container environment. Use Kubernetes Secrets for API keys. + +Step 1: Create the Config +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Store your ``plano_config.yaml`` in a ConfigMap: + +.. code-block:: bash + + kubectl create configmap plano-config --from-file=plano_config.yaml=./plano_config.yaml + +Step 2: Create API Key Secrets +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Store your LLM provider API keys in a Secret: + +.. code-block:: bash + + kubectl create secret generic plano-secrets \ + --from-literal=OPENAI_API_KEY=sk-... \ + --from-literal=ANTHROPIC_API_KEY=sk-ant-... + +Step 3: Deploy Plano +~~~~~~~~~~~~~~~~~~~~ + +Create a ``plano-deployment.yaml``: + +.. 
code-block:: yaml + + apiVersion: apps/v1 + kind: Deployment + metadata: + name: plano + labels: + app: plano + spec: + replicas: 1 + selector: + matchLabels: + app: plano + template: + metadata: + labels: + app: plano + spec: + containers: + - name: plano + image: katanemo/plano:0.4.11 + ports: + - containerPort: 12000 # LLM gateway (chat completions, model routing) + name: llm-gateway + envFrom: + - secretRef: + name: plano-secrets + env: + - name: LOG_LEVEL + value: "info" + volumeMounts: + - name: plano-config + mountPath: /app/plano_config.yaml + subPath: plano_config.yaml + readOnly: true + readinessProbe: + httpGet: + path: /healthz + port: 12000 + initialDelaySeconds: 5 + periodSeconds: 10 + livenessProbe: + httpGet: + path: /healthz + port: 12000 + initialDelaySeconds: 10 + periodSeconds: 30 + resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "1000m" + volumes: + - name: plano-config + configMap: + name: plano-config + --- + apiVersion: v1 + kind: Service + metadata: + name: plano + spec: + selector: + app: plano + ports: + - name: llm-gateway + port: 12000 + targetPort: 12000 + +Apply it: + +.. code-block:: bash + + kubectl apply -f plano-deployment.yaml + +Step 4: Verify +~~~~~~~~~~~~~~ + +.. code-block:: bash + + # Check pod status + kubectl get pods -l app=plano + + # Check logs + kubectl logs -l app=plano -f + + # Test routing (port-forward for local testing) + kubectl port-forward svc/plano 12000:12000 + + curl -s -H "Content-Type: application/json" \ + -d '{"messages":[{"role":"user","content":"tell me a joke"}], "model":"none"}' \ + http://localhost:12000/v1/chat/completions | jq .model + +Updating Configuration +~~~~~~~~~~~~~~~~~~~~~~ + +To update ``plano_config.yaml``, replace the ConfigMap and restart the pod: + +.. 
code-block:: bash + + kubectl create configmap plano-config \ + --from-file=plano_config.yaml=./plano_config.yaml \ + --dry-run=client -o yaml | kubectl apply -f - + + kubectl rollout restart deployment/plano + +Enabling OTEL Tracing +~~~~~~~~~~~~~~~~~~~~~ + +Plano emits OpenTelemetry traces for every request — including routing decisions, model selection, and upstream latency. To export traces to an OTEL collector in your cluster, add the ``tracing`` section to your ``plano_config.yaml``: + +.. code-block:: yaml + + tracing: + opentracing_grpc_endpoint: "http://otel-collector.monitoring:4317" + random_sampling: 100 # percentage of requests to trace (1-100) + trace_arch_internal: true # include internal Plano spans + span_attributes: + header_prefixes: # capture request headers as span attributes + - "x-" + static: # add static attributes to all spans + environment: "production" + service: "plano" + +Set the ``OTEL_TRACING_GRPC_ENDPOINT`` environment variable or configure it directly in the config. Plano propagates the ``traceparent`` header end-to-end, so traces correlate across your upstream and downstream services. + +Environment Variables Reference +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following environment variables can be set on the container: + +.. list-table:: + :header-rows: 1 + :widths: 30 50 20 + + * - Variable + - Description + - Default + * - ``LOG_LEVEL`` + - Log verbosity (``debug``, ``info``, ``warn``, ``error``) + - ``info`` + * - ``OPENAI_API_KEY`` + - OpenAI API key (if referenced in config) + - + * - ``ANTHROPIC_API_KEY`` + - Anthropic API key (if referenced in config) + - + * - ``OTEL_TRACING_GRPC_ENDPOINT`` + - OTEL collector endpoint for trace export + - ``http://localhost:4317`` + +Any environment variable referenced in ``plano_config.yaml`` with ``$VAR_NAME`` syntax will be substituted at startup. Use Kubernetes Secrets for sensitive values and ConfigMaps or ``env`` entries for non-sensitive configuration. 
+ Runtime Tests ------------- From 66100976590bff14c1a4b66f6a090c950a0aad3d Mon Sep 17 00:00:00 2001 From: Musa Date: Tue, 10 Mar 2026 20:54:14 -0700 Subject: [PATCH 4/8] Support for Codex via Plano (#808) * Add Codex CLI support; xAI response improvements * Add native Plano running check and update CLI agent error handling * adding PR suggestions for transformations and code quality * message extraction logic in ResponsesAPIRequest * xAI support for Responses API by routing to native endpoint + refactor code --- cli/planoai/core.py | 129 +++- cli/planoai/main.py | 47 +- cli/uv.lock | 2 +- crates/brightstaff/src/handlers/llm.rs | 8 +- crates/brightstaff/src/state/mod.rs | 99 +++ crates/hermesllm/src/apis/openai.rs | 2 +- crates/hermesllm/src/apis/openai_responses.rs | 199 +++++- .../responses_api_streaming_buffer.rs | 66 +- crates/hermesllm/src/clients/endpoints.rs | 19 +- crates/hermesllm/src/providers/id.rs | 21 +- crates/hermesllm/src/providers/request.rs | 76 +++ .../src/transforms/request/from_openai.rs | 645 +++++++++++++++--- .../response_streaming/to_openai_streaming.rs | 17 +- crates/llm_gateway/src/stream_context.rs | 3 +- demos/README.md | 1 + demos/llm_routing/codex_router/README.md | 92 +++ demos/llm_routing/codex_router/config.yaml | 38 ++ .../codex_router/pretty_model_resolution.sh | 33 + 18 files changed, 1297 insertions(+), 200 deletions(-) create mode 100644 demos/llm_routing/codex_router/README.md create mode 100644 demos/llm_routing/codex_router/config.yaml create mode 100644 demos/llm_routing/codex_router/pretty_model_resolution.sh diff --git a/cli/planoai/core.py b/cli/planoai/core.py index e9ddc7bd..174f37c0 100644 --- a/cli/planoai/core.py +++ b/cli/planoai/core.py @@ -10,7 +10,6 @@ from planoai.consts import ( PLANO_DOCKER_IMAGE, PLANO_DOCKER_NAME, ) -import subprocess from planoai.docker_cli import ( docker_container_status, docker_remove_container, @@ -147,26 +146,48 @@ def stop_docker_container(service=PLANO_DOCKER_NAME): 
log.info(f"Failed to shut down services: {str(e)}") -def start_cli_agent(plano_config_file=None, settings_json="{}"): - """Start a CLI client connected to Plano.""" - - with open(plano_config_file, "r") as file: - plano_config = file.read() - plano_config_yaml = yaml.safe_load(plano_config) - - # Get egress listener configuration - egress_config = plano_config_yaml.get("listeners", {}).get("egress_traffic", {}) - host = egress_config.get("host", "127.0.0.1") - port = egress_config.get("port", 12000) - - # Parse additional settings from command line +def _parse_cli_agent_settings(settings_json: str) -> dict: try: - additional_settings = json.loads(settings_json) if settings_json else {} + return json.loads(settings_json) if settings_json else {} except json.JSONDecodeError: log.error("Settings must be valid JSON") sys.exit(1) - # Set up environment variables + +def _resolve_cli_agent_endpoint(plano_config_yaml: dict) -> tuple[str, int]: + listeners = plano_config_yaml.get("listeners") + + if isinstance(listeners, dict): + egress_config = listeners.get("egress_traffic", {}) + host = egress_config.get("host") or egress_config.get("address") or "0.0.0.0" + port = egress_config.get("port", 12000) + return host, port + + if isinstance(listeners, list): + for listener in listeners: + if listener.get("type") in ["model", "model_listener"]: + host = listener.get("host") or listener.get("address") or "0.0.0.0" + port = listener.get("port", 12000) + return host, port + + return "0.0.0.0", 12000 + + +def _apply_non_interactive_env(env: dict, additional_settings: dict) -> None: + if additional_settings.get("NON_INTERACTIVE_MODE", False): + env.update( + { + "CI": "true", + "FORCE_COLOR": "0", + "NODE_NO_READLINE": "1", + "TERM": "dumb", + } + ) + + +def _start_claude_cli_agent( + host: str, port: int, plano_config_yaml: dict, additional_settings: dict +) -> None: env = os.environ.copy() env.update( { @@ -186,7 +207,6 @@ def start_cli_agent(plano_config_file=None, 
settings_json="{}"): "ANTHROPIC_SMALL_FAST_MODEL" ] else: - # Check if arch.claude.code.small.fast alias exists in model_aliases model_aliases = plano_config_yaml.get("model_aliases", {}) if "arch.claude.code.small.fast" in model_aliases: env["ANTHROPIC_SMALL_FAST_MODEL"] = "arch.claude.code.small.fast" @@ -196,23 +216,10 @@ def start_cli_agent(plano_config_file=None, settings_json="{}"): ) log.info("Or provide ANTHROPIC_SMALL_FAST_MODEL in --settings JSON") - # Non-interactive mode configuration from additional_settings only - if additional_settings.get("NON_INTERACTIVE_MODE", False): - env.update( - { - "CI": "true", - "FORCE_COLOR": "0", - "NODE_NO_READLINE": "1", - "TERM": "dumb", - } - ) + _apply_non_interactive_env(env, additional_settings) - # Build claude command arguments claude_args = [] - - # Add settings if provided, excluding those already handled as environment variables if additional_settings: - # Filter out settings that are already processed as environment variables claude_settings = { k: v for k, v in additional_settings.items() @@ -221,10 +228,8 @@ def start_cli_agent(plano_config_file=None, settings_json="{}"): if claude_settings: claude_args.append(f"--settings={json.dumps(claude_settings)}") - # Use claude from PATH claude_path = "claude" log.info(f"Connecting Claude Code Agent to Plano at {host}:{port}") - try: subprocess.run([claude_path] + claude_args, env=env, check=True) except subprocess.CalledProcessError as e: @@ -235,3 +240,61 @@ def start_cli_agent(plano_config_file=None, settings_json="{}"): f"{claude_path} not found. 
Make sure Claude Code is installed: npm install -g @anthropic-ai/claude-code" ) sys.exit(1) + + +def _start_codex_cli_agent(host: str, port: int, additional_settings: dict) -> None: + env = os.environ.copy() + env.update( + { + "OPENAI_API_KEY": "test", # Use test token for plano + "OPENAI_BASE_URL": f"http://{host}:{port}/v1", + "NO_PROXY": host, + "DISABLE_TELEMETRY": "true", + } + ) + _apply_non_interactive_env(env, additional_settings) + + codex_model = additional_settings.get("CODEX_MODEL", "gpt-5.3-codex") + codex_path = "codex" + codex_args = ["--model", codex_model] + + log.info( + f"Connecting Codex CLI Agent to Plano at {host}:{port} (default model: {codex_model})" + ) + try: + subprocess.run([codex_path] + codex_args, env=env, check=True) + except subprocess.CalledProcessError as e: + log.error(f"Error starting codex: {e}") + sys.exit(1) + except FileNotFoundError: + log.error( + f"{codex_path} not found. Make sure Codex CLI is installed: npm install -g @openai/codex" + ) + sys.exit(1) + + +def start_cli_agent( + plano_config_file=None, cli_agent_type="claude", settings_json="{}" +): + """Start a CLI client connected to Plano.""" + + with open(plano_config_file, "r") as file: + plano_config = file.read() + plano_config_yaml = yaml.safe_load(plano_config) + + host, port = _resolve_cli_agent_endpoint(plano_config_yaml) + + additional_settings = _parse_cli_agent_settings(settings_json) + + if cli_agent_type == "claude": + _start_claude_cli_agent(host, port, plano_config_yaml, additional_settings) + return + + if cli_agent_type == "codex": + _start_codex_cli_agent(host, port, additional_settings) + return + + log.error( + f"Unsupported cli agent type '{cli_agent_type}'. 
Supported values: claude, codex" + ) + sys.exit(1) diff --git a/cli/planoai/main.py b/cli/planoai/main.py index a63f294e..e9cdc0a0 100644 --- a/cli/planoai/main.py +++ b/cli/planoai/main.py @@ -1,3 +1,4 @@ +import json import os import multiprocessing import subprocess @@ -31,6 +32,7 @@ from planoai.trace_cmd import trace as trace_cmd, start_trace_listener_backgroun from planoai.consts import ( DEFAULT_OTEL_TRACING_GRPC_ENDPOINT, DEFAULT_NATIVE_OTEL_TRACING_GRPC_ENDPOINT, + NATIVE_PID_FILE, PLANO_DOCKER_IMAGE, PLANO_DOCKER_NAME, ) @@ -40,6 +42,30 @@ from planoai.versioning import check_version_status, get_latest_version, get_ver log = getLogger(__name__) +def _is_native_plano_running() -> bool: + if not os.path.exists(NATIVE_PID_FILE): + return False + try: + with open(NATIVE_PID_FILE, "r") as f: + pids = json.load(f) + except (OSError, json.JSONDecodeError): + return False + + envoy_pid = pids.get("envoy_pid") + brightstaff_pid = pids.get("brightstaff_pid") + if not isinstance(envoy_pid, int) or not isinstance(brightstaff_pid, int): + return False + + for pid in (envoy_pid, brightstaff_pid): + try: + os.kill(pid, 0) + except ProcessLookupError: + return False + except PermissionError: + continue + return True + + def _is_port_in_use(port: int) -> bool: """Check if a TCP port is already bound on localhost.""" import socket @@ -523,7 +549,7 @@ def logs(debug, follow, docker): @click.command() -@click.argument("type", type=click.Choice(["claude"]), required=True) +@click.argument("type", type=click.Choice(["claude", "codex"]), required=True) @click.argument("file", required=False) # Optional file argument @click.option( "--path", default=".", help="Path to the directory containing plano_config.yaml" @@ -536,14 +562,19 @@ def logs(debug, follow, docker): def cli_agent(type, file, path, settings): """Start a CLI agent connected to Plano. 
- CLI_AGENT: The type of CLI agent to start (currently only 'claude' is supported) + CLI_AGENT: The type of CLI agent to start ('claude' or 'codex') """ - # Check if plano docker container is running - plano_status = docker_container_status(PLANO_DOCKER_NAME) - if plano_status != "running": - log.error(f"plano docker container is not running (status: {plano_status})") - log.error("Please start plano using the 'planoai up' command.") + native_running = _is_native_plano_running() + docker_running = False + if not native_running: + docker_running = docker_container_status(PLANO_DOCKER_NAME) == "running" + + if not (native_running or docker_running): + log.error("Plano is not running.") + log.error( + "Start Plano first using 'planoai up ' (native or --docker mode)." + ) sys.exit(1) # Determine plano_config.yaml path @@ -553,7 +584,7 @@ def cli_agent(type, file, path, settings): sys.exit(1) try: - start_cli_agent(plano_config_file, settings) + start_cli_agent(plano_config_file, type, settings) except SystemExit: # Re-raise SystemExit to preserve exit codes raise diff --git a/cli/uv.lock b/cli/uv.lock index 45ccf82e..9d85bf85 100644 --- a/cli/uv.lock +++ b/cli/uv.lock @@ -337,7 +337,7 @@ wheels = [ [[package]] name = "planoai" -version = "0.4.7" +version = "0.4.9" source = { editable = "." 
} dependencies = [ { name = "click" }, diff --git a/crates/brightstaff/src/handlers/llm.rs b/crates/brightstaff/src/handlers/llm.rs index b03d4d29..67afebff 100644 --- a/crates/brightstaff/src/handlers/llm.rs +++ b/crates/brightstaff/src/handlers/llm.rs @@ -198,6 +198,7 @@ async fn llm_chat_inner( let temperature = client_request.get_temperature(); let is_streaming_request = client_request.is_streaming(); let alias_resolved_model = resolve_model_alias(&model_from_request, &model_aliases); + let (provider_id, _) = get_provider_info(&llm_providers, &alias_resolved_model).await; // Validate that the requested model exists in configuration // This matches the validation in llm_gateway routing.rs @@ -249,7 +250,11 @@ async fn llm_chat_inner( if client_request.remove_metadata_key("plano_preference_config") { debug!("removed plano_preference_config from metadata"); } - + if let Some(ref client_api_kind) = client_api { + let upstream_api = + provider_id.compatible_api_for_client(client_api_kind, is_streaming_request); + client_request.normalize_for_upstream(provider_id, &upstream_api); + } // === v1/responses state management: Determine upstream API and combine input if needed === // Do this BEFORE routing since routing consumes the request // Only process state if state_storage is configured @@ -496,7 +501,6 @@ async fn llm_chat_inner( .into_response()), } } - /// Resolves model aliases by looking up the requested model in the model_aliases map. /// Returns the target model if an alias is found, otherwise returns the original model. 
fn resolve_model_alias( diff --git a/crates/brightstaff/src/state/mod.rs b/crates/brightstaff/src/state/mod.rs index ce3ec8ae..3d59f359 100644 --- a/crates/brightstaff/src/state/mod.rs +++ b/crates/brightstaff/src/state/mod.rs @@ -130,6 +130,7 @@ pub fn extract_input_items(input: &InputParam) -> Vec { }]), })] } + InputParam::SingleItem(item) => vec![item.clone()], InputParam::Items(items) => items.clone(), } } @@ -146,3 +147,101 @@ pub async fn retrieve_and_combine_input( let combined_input = storage.merge(&prev_state, current_input); Ok(combined_input) } + +#[cfg(test)] +mod tests { + use super::extract_input_items; + use hermesllm::apis::openai_responses::{ + InputContent, InputItem, InputMessage, InputParam, MessageContent, MessageRole, + }; + + #[test] + fn test_extract_input_items_converts_text_to_user_message_item() { + let extracted = extract_input_items(&InputParam::Text("hello world".to_string())); + assert_eq!(extracted.len(), 1); + + let InputItem::Message(message) = &extracted[0] else { + panic!("expected InputItem::Message"); + }; + assert!(matches!(message.role, MessageRole::User)); + + let MessageContent::Items(items) = &message.content else { + panic!("expected MessageContent::Items"); + }; + assert_eq!(items.len(), 1); + + let InputContent::InputText { text } = &items[0] else { + panic!("expected InputContent::InputText"); + }; + assert_eq!(text, "hello world"); + } + + #[test] + fn test_extract_input_items_preserves_single_item() { + let item = InputItem::Message(InputMessage { + role: MessageRole::Assistant, + content: MessageContent::Items(vec![InputContent::InputText { + text: "assistant note".to_string(), + }]), + }); + + let extracted = extract_input_items(&InputParam::SingleItem(item.clone())); + assert_eq!(extracted.len(), 1); + let InputItem::Message(message) = &extracted[0] else { + panic!("expected InputItem::Message"); + }; + assert!(matches!(message.role, MessageRole::Assistant)); + let MessageContent::Items(items) = &message.content 
else { + panic!("expected MessageContent::Items"); + }; + let InputContent::InputText { text } = &items[0] else { + panic!("expected InputContent::InputText"); + }; + assert_eq!(text, "assistant note"); + } + + #[test] + fn test_extract_input_items_preserves_items_list() { + let items = vec![ + InputItem::Message(InputMessage { + role: MessageRole::User, + content: MessageContent::Items(vec![InputContent::InputText { + text: "first".to_string(), + }]), + }), + InputItem::Message(InputMessage { + role: MessageRole::Assistant, + content: MessageContent::Items(vec![InputContent::InputText { + text: "second".to_string(), + }]), + }), + ]; + + let extracted = extract_input_items(&InputParam::Items(items.clone())); + assert_eq!(extracted.len(), items.len()); + + let InputItem::Message(first) = &extracted[0] else { + panic!("expected first item to be message"); + }; + assert!(matches!(first.role, MessageRole::User)); + let MessageContent::Items(first_items) = &first.content else { + panic!("expected MessageContent::Items"); + }; + let InputContent::InputText { text: first_text } = &first_items[0] else { + panic!("expected InputContent::InputText"); + }; + assert_eq!(first_text, "first"); + + let InputItem::Message(second) = &extracted[1] else { + panic!("expected second item to be message"); + }; + assert!(matches!(second.role, MessageRole::Assistant)); + let MessageContent::Items(second_items) = &second.content else { + panic!("expected MessageContent::Items"); + }; + let InputContent::InputText { text: second_text } = &second_items[0] else { + panic!("expected InputContent::InputText"); + }; + assert_eq!(second_text, "second"); + } +} diff --git a/crates/hermesllm/src/apis/openai.rs b/crates/hermesllm/src/apis/openai.rs index 53eee442..33f55b29 100644 --- a/crates/hermesllm/src/apis/openai.rs +++ b/crates/hermesllm/src/apis/openai.rs @@ -108,7 +108,7 @@ pub struct ChatCompletionsRequest { pub top_p: Option, pub top_logprobs: Option, pub user: Option, - // pub 
web_search: Option, // GOOD FIRST ISSUE: Future support for web search + pub web_search_options: Option, // VLLM-specific parameters (used by Arch-Function) pub top_k: Option, diff --git a/crates/hermesllm/src/apis/openai_responses.rs b/crates/hermesllm/src/apis/openai_responses.rs index 65f4dfa0..eac8a452 100644 --- a/crates/hermesllm/src/apis/openai_responses.rs +++ b/crates/hermesllm/src/apis/openai_responses.rs @@ -116,6 +116,8 @@ pub enum InputParam { Text(String), /// Array of input items (messages, references, outputs, etc.) Items(Vec), + /// Single input item (some clients send object instead of array) + SingleItem(InputItem), } /// Input item - can be a message, item reference, function call output, etc. @@ -130,12 +132,20 @@ pub enum InputItem { item_type: String, id: String, }, + /// Function call emitted by model in prior turn + FunctionCall { + #[serde(rename = "type")] + item_type: String, + name: String, + arguments: String, + call_id: String, + }, /// Function call output FunctionCallOutput { #[serde(rename = "type")] item_type: String, call_id: String, - output: String, + output: serde_json::Value, }, } @@ -166,6 +176,7 @@ pub enum MessageRole { Assistant, System, Developer, + Tool, } /// Input content types @@ -173,6 +184,7 @@ pub enum MessageRole { #[serde(tag = "type", rename_all = "snake_case")] pub enum InputContent { /// Text input + #[serde(rename = "input_text", alias = "text", alias = "output_text")] InputText { text: String }, /// Image input via URL InputImage { @@ -180,6 +192,7 @@ pub enum InputContent { detail: Option, }, /// File input via URL + #[serde(rename = "input_file", alias = "file")] InputFile { file_url: String }, /// Audio input InputAudio { @@ -207,10 +220,11 @@ pub struct AudioConfig { } /// Text configuration +#[skip_serializing_none] #[derive(Debug, Clone, Serialize, Deserialize)] pub struct TextConfig { /// Text format configuration - pub format: TextFormat, + pub format: Option, } /// Text format @@ -285,6 +299,7 @@ 
pub enum Tool { filters: Option, }, /// Web search tool + #[serde(rename = "web_search", alias = "web_search_preview")] WebSearchPreview { domains: Option>, search_context_size: Option, @@ -298,6 +313,12 @@ pub enum Tool { display_height_px: Option, display_number: Option, }, + /// Custom tool (provider/SDK-specific tool contract) + Custom { + name: Option, + description: Option, + format: Option, + }, } /// Ranking options for file search @@ -1015,6 +1036,30 @@ pub struct ListInputItemsResponse { // ProviderRequest Implementation // ============================================================================ +fn append_input_content_text(buffer: &mut String, content: &InputContent) { + match content { + InputContent::InputText { text } => buffer.push_str(text), + InputContent::InputImage { .. } => buffer.push_str("[Image]"), + InputContent::InputFile { .. } => buffer.push_str("[File]"), + InputContent::InputAudio { .. } => buffer.push_str("[Audio]"), + } +} + +fn append_content_items_text(buffer: &mut String, content_items: &[InputContent]) { + for content in content_items { + // Preserve existing behavior: each content item is prefixed with a space. 
+ buffer.push(' '); + append_input_content_text(buffer, content); + } +} + +fn append_message_content_text(buffer: &mut String, content: &MessageContent) { + match content { + MessageContent::Text(text) => buffer.push_str(text), + MessageContent::Items(content_items) => append_content_items_text(buffer, content_items), + } +} + impl ProviderRequest for ResponsesAPIRequest { fn model(&self) -> &str { &self.model @@ -1031,36 +1076,27 @@ impl ProviderRequest for ResponsesAPIRequest { fn extract_messages_text(&self) -> String { match &self.input { InputParam::Text(text) => text.clone(), - InputParam::Items(items) => { - items.iter().fold(String::new(), |acc, item| { - match item { - InputItem::Message(msg) => { - let content_text = match &msg.content { - MessageContent::Text(text) => text.clone(), - MessageContent::Items(content_items) => { - content_items.iter().fold(String::new(), |acc, content| { - acc + " " - + &match content { - InputContent::InputText { text } => text.clone(), - InputContent::InputImage { .. } => { - "[Image]".to_string() - } - InputContent::InputFile { .. } => { - "[File]".to_string() - } - InputContent::InputAudio { .. } => { - "[Audio]".to_string() - } - } - }) - } - }; - acc + " " + &content_text - } - // Skip non-message items (references, outputs, etc.) - _ => acc, + InputParam::SingleItem(item) => { + // Normalize single-item input for extraction behavior parity. + match item { + InputItem::Message(msg) => { + let mut extracted = String::new(); + append_message_content_text(&mut extracted, &msg.content); + extracted } - }) + _ => String::new(), + } + } + InputParam::Items(items) => { + let mut extracted = String::new(); + for item in items { + if let InputItem::Message(msg) = item { + // Preserve existing behavior: each message is prefixed with a space. 
+ extracted.push(' '); + append_message_content_text(&mut extracted, &msg.content); + } + } + extracted } } } @@ -1068,6 +1104,20 @@ impl ProviderRequest for ResponsesAPIRequest { fn get_recent_user_message(&self) -> Option { match &self.input { InputParam::Text(text) => Some(text.clone()), + InputParam::SingleItem(item) => match item { + InputItem::Message(msg) if matches!(msg.role, MessageRole::User) => { + match &msg.content { + MessageContent::Text(text) => Some(text.clone()), + MessageContent::Items(content_items) => { + content_items.iter().find_map(|content| match content { + InputContent::InputText { text } => Some(text.clone()), + _ => None, + }) + } + } + } + _ => None, + }, InputParam::Items(items) => { items.iter().rev().find_map(|item| { match item { @@ -1097,6 +1147,9 @@ impl ProviderRequest for ResponsesAPIRequest { .iter() .filter_map(|tool| match tool { Tool::Function { name, .. } => Some(name.clone()), + Tool::Custom { + name: Some(name), .. + } => Some(name.clone()), // Other tool types don't have user-defined names _ => None, }) @@ -1366,6 +1419,7 @@ impl crate::providers::streaming_response::ProviderStreamResponse for ResponsesA #[cfg(test)] mod tests { use super::*; + use serde_json::json; #[test] fn test_response_output_text_delta_deserialization() { @@ -1506,4 +1560,87 @@ mod tests { _ => panic!("Expected ResponseCompleted event"), } } + + #[test] + fn test_request_deserializes_custom_tool() { + let request = json!({ + "model": "gpt-5.3-codex", + "input": "apply the patch", + "tools": [ + { + "type": "custom", + "name": "run_patch", + "description": "Apply patch text", + "format": { + "kind": "patch", + "version": "v1" + } + } + ] + }); + + let bytes = serde_json::to_vec(&request).unwrap(); + let parsed = ResponsesAPIRequest::try_from(bytes.as_slice()).unwrap(); + let tools = parsed.tools.expect("tools should be present"); + assert_eq!(tools.len(), 1); + + match &tools[0] { + Tool::Custom { + name, + description, + format, + } => { + 
assert_eq!(name.as_deref(), Some("run_patch")); + assert_eq!(description.as_deref(), Some("Apply patch text")); + assert!(format.is_some()); + } + _ => panic!("expected custom tool"), + } + } + + #[test] + fn test_request_deserializes_web_search_tool_alias() { + let request = json!({ + "model": "gpt-5.3-codex", + "input": "find repository info", + "tools": [ + { + "type": "web_search", + "domains": ["github.com"], + "search_context_size": "medium" + } + ] + }); + + let bytes = serde_json::to_vec(&request).unwrap(); + let parsed = ResponsesAPIRequest::try_from(bytes.as_slice()).unwrap(); + let tools = parsed.tools.expect("tools should be present"); + assert_eq!(tools.len(), 1); + + match &tools[0] { + Tool::WebSearchPreview { + domains, + search_context_size, + .. + } => { + assert_eq!(domains.as_ref().map(Vec::len), Some(1)); + assert_eq!(search_context_size.as_deref(), Some("medium")); + } + _ => panic!("expected web search preview tool"), + } + } + + #[test] + fn test_request_deserializes_text_config_without_format() { + let request = json!({ + "model": "gpt-5.3-codex", + "input": "hello", + "text": {} + }); + + let bytes = serde_json::to_vec(&request).unwrap(); + let parsed = ResponsesAPIRequest::try_from(bytes.as_slice()).unwrap(); + assert!(parsed.text.is_some()); + assert!(parsed.text.unwrap().format.is_none()); + } } diff --git a/crates/hermesllm/src/apis/streaming_shapes/responses_api_streaming_buffer.rs b/crates/hermesllm/src/apis/streaming_shapes/responses_api_streaming_buffer.rs index 2aeb34ac..92589ccf 100644 --- a/crates/hermesllm/src/apis/streaming_shapes/responses_api_streaming_buffer.rs +++ b/crates/hermesllm/src/apis/streaming_shapes/responses_api_streaming_buffer.rs @@ -74,6 +74,7 @@ pub struct ResponsesAPIStreamBuffer { /// Lifecycle state flags created_emitted: bool, in_progress_emitted: bool, + finalized: bool, /// Track which output items we've added output_items_added: HashMap, // output_index -> item_id @@ -109,6 +110,7 @@ impl 
ResponsesAPIStreamBuffer { upstream_response_metadata: None, created_emitted: false, in_progress_emitted: false, + finalized: false, output_items_added: HashMap::new(), text_content: HashMap::new(), function_arguments: HashMap::new(), @@ -236,7 +238,7 @@ impl ResponsesAPIStreamBuffer { }), store: Some(true), text: Some(TextConfig { - format: TextFormat::Text, + format: Some(TextFormat::Text), }), audio: None, modalities: None, @@ -255,8 +257,38 @@ impl ResponsesAPIStreamBuffer { /// Finalize the response by emitting all *.done events and response.completed. /// Call this when the stream is complete (after seeing [DONE] or end_of_stream). pub fn finalize(&mut self) { + // Idempotent finalize: avoid duplicate response.completed loops. + if self.finalized { + return; + } + self.finalized = true; + let mut events = Vec::new(); + // Ensure lifecycle prelude is emitted even if finalize is triggered + // by finish_reason before any prior delta was processed. + if !self.created_emitted { + if self.response_id.is_none() { + self.response_id = Some(format!( + "resp_{}", + uuid::Uuid::new_v4().to_string().replace("-", "") + )); + self.created_at = Some( + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs() as i64, + ); + self.model = Some("unknown".to_string()); + } + events.push(self.create_response_created_event()); + self.created_emitted = true; + } + if !self.in_progress_emitted { + events.push(self.create_response_in_progress_event()); + self.in_progress_emitted = true; + } + // Emit done events for all accumulated content // Text content done events @@ -443,6 +475,12 @@ impl SseStreamBufferTrait for ResponsesAPIStreamBuffer { } }; + // Explicit completion marker from transform layer. + if matches!(stream_event.as_ref(), ResponsesAPIStreamEvent::Done { .. 
}) { + self.finalize(); + return; + } + let mut events = Vec::new(); // Capture upstream metadata from ResponseCreated or ResponseInProgress if present @@ -789,4 +827,30 @@ mod tests { println!("✓ NO completion events (partial stream, no [DONE])"); println!("✓ Arguments accumulated: '{{\"location\":\"'\n"); } + + #[test] + fn test_finish_reason_without_done_still_finalizes_once() { + let raw_input = r#"data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1234567890,"model":"gpt-4o","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello"},"finish_reason":null}]} + +data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1234567890,"model":"gpt-4o","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}"#; + + let client_api = SupportedAPIsFromClient::OpenAIResponsesAPI(OpenAIApi::Responses); + let upstream_api = SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions); + + let stream_iter = SseStreamIter::try_from(raw_input.as_bytes()).unwrap(); + let mut buffer = ResponsesAPIStreamBuffer::new(); + + for raw_event in stream_iter { + let transformed_event = + SseEvent::try_from((raw_event, &client_api, &upstream_api)).unwrap(); + buffer.add_transformed_event(transformed_event); + } + + let output = String::from_utf8_lossy(&buffer.to_bytes()).to_string(); + let completed_count = output.matches("event: response.completed").count(); + assert_eq!( + completed_count, 1, + "response.completed should be emitted exactly once" + ); + } } diff --git a/crates/hermesllm/src/clients/endpoints.rs b/crates/hermesllm/src/clients/endpoints.rs index eff96cc5..23e14604 100644 --- a/crates/hermesllm/src/clients/endpoints.rs +++ b/crates/hermesllm/src/clients/endpoints.rs @@ -184,8 +184,8 @@ impl SupportedAPIsFromClient { SupportedAPIsFromClient::OpenAIResponsesAPI(_) => { // For Responses API, check if provider supports it, otherwise translate to chat/completions match provider_id { - // OpenAI and compatible providers that 
support /v1/responses - ProviderId::OpenAI => route_by_provider("/responses"), + // Providers that support /v1/responses natively + ProviderId::OpenAI | ProviderId::XAI => route_by_provider("/responses"), // All other providers: translate to /chat/completions _ => route_by_provider("/chat/completions"), } @@ -654,4 +654,19 @@ mod tests { "/custom/azure/path/gpt-4-deployment/chat/completions?api-version=2025-01-01-preview" ); } + + #[test] + fn test_responses_api_targets_xai_native_responses_endpoint() { + let api = SupportedAPIsFromClient::OpenAIResponsesAPI(OpenAIApi::Responses); + assert_eq!( + api.target_endpoint_for_provider( + &ProviderId::XAI, + "/v1/responses", + "grok-4-1-fast-reasoning", + false, + None + ), + "/v1/responses" + ); + } } diff --git a/crates/hermesllm/src/providers/id.rs b/crates/hermesllm/src/providers/id.rs index fff73f15..11008711 100644 --- a/crates/hermesllm/src/providers/id.rs +++ b/crates/hermesllm/src/providers/id.rs @@ -166,10 +166,11 @@ impl ProviderId { SupportedAPIsFromClient::OpenAIChatCompletions(_), ) => SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions), - // OpenAI Responses API - only OpenAI supports this - (ProviderId::OpenAI, SupportedAPIsFromClient::OpenAIResponsesAPI(_)) => { - SupportedUpstreamAPIs::OpenAIResponsesAPI(OpenAIApi::Responses) - } + // OpenAI Responses API - OpenAI and xAI support this natively + ( + ProviderId::OpenAI | ProviderId::XAI, + SupportedAPIsFromClient::OpenAIResponsesAPI(_), + ) => SupportedUpstreamAPIs::OpenAIResponsesAPI(OpenAIApi::Responses), // Amazon Bedrock natively supports Bedrock APIs (ProviderId::AmazonBedrock, SupportedAPIsFromClient::OpenAIChatCompletions(_)) => { @@ -328,4 +329,16 @@ mod tests { "AmazonBedrock should have models (mapped to amazon)" ); } + + #[test] + fn test_xai_uses_responses_api_for_responses_clients() { + use crate::clients::endpoints::{SupportedAPIsFromClient, SupportedUpstreamAPIs}; + + let client_api = 
SupportedAPIsFromClient::OpenAIResponsesAPI(OpenAIApi::Responses); + let upstream = ProviderId::XAI.compatible_api_for_client(&client_api, false); + assert!(matches!( + upstream, + SupportedUpstreamAPIs::OpenAIResponsesAPI(OpenAIApi::Responses) + )); + } } diff --git a/crates/hermesllm/src/providers/request.rs b/crates/hermesllm/src/providers/request.rs index e97e8a68..92688133 100644 --- a/crates/hermesllm/src/providers/request.rs +++ b/crates/hermesllm/src/providers/request.rs @@ -5,6 +5,7 @@ use crate::apis::amazon_bedrock::{ConverseRequest, ConverseStreamRequest}; use crate::apis::openai_responses::ResponsesAPIRequest; use crate::clients::endpoints::SupportedAPIsFromClient; use crate::clients::endpoints::SupportedUpstreamAPIs; +use crate::ProviderId; use serde_json::Value; use std::collections::HashMap; @@ -70,6 +71,25 @@ impl ProviderRequestType { Self::ResponsesAPIRequest(r) => r.set_messages(messages), } } + + /// Apply provider-specific request normalization before sending upstream. + pub fn normalize_for_upstream( + &mut self, + provider_id: ProviderId, + upstream_api: &SupportedUpstreamAPIs, + ) { + if provider_id == ProviderId::XAI + && matches!( + upstream_api, + SupportedUpstreamAPIs::OpenAIChatCompletions(_) + ) + { + if let Self::ChatCompletionsRequest(req) = self { + // xAI's legacy live-search shape is deprecated on chat/completions. 
+ req.web_search_options = None; + } + } + } } impl ProviderRequest for ProviderRequestType { @@ -787,6 +807,62 @@ mod tests { } } + #[test] + fn test_normalize_for_upstream_xai_clears_chat_web_search_options() { + use crate::apis::openai::{Message, MessageContent, OpenAIApi, Role}; + + let mut request = ProviderRequestType::ChatCompletionsRequest(ChatCompletionsRequest { + model: "grok-4".to_string(), + messages: vec![Message { + role: Role::User, + content: Some(MessageContent::Text("hello".to_string())), + name: None, + tool_calls: None, + tool_call_id: None, + }], + web_search_options: Some(serde_json::json!({"search_context_size":"medium"})), + ..Default::default() + }); + + request.normalize_for_upstream( + ProviderId::XAI, + &SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions), + ); + + let ProviderRequestType::ChatCompletionsRequest(req) = request else { + panic!("expected chat request"); + }; + assert!(req.web_search_options.is_none()); + } + + #[test] + fn test_normalize_for_upstream_non_xai_keeps_chat_web_search_options() { + use crate::apis::openai::{Message, MessageContent, OpenAIApi, Role}; + + let mut request = ProviderRequestType::ChatCompletionsRequest(ChatCompletionsRequest { + model: "gpt-4o".to_string(), + messages: vec![Message { + role: Role::User, + content: Some(MessageContent::Text("hello".to_string())), + name: None, + tool_calls: None, + tool_call_id: None, + }], + web_search_options: Some(serde_json::json!({"search_context_size":"medium"})), + ..Default::default() + }); + + request.normalize_for_upstream( + ProviderId::OpenAI, + &SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions), + ); + + let ProviderRequestType::ChatCompletionsRequest(req) = request else { + panic!("expected chat request"); + }; + assert!(req.web_search_options.is_some()); + } + #[test] fn test_responses_api_to_anthropic_messages_conversion() { use crate::apis::anthropic::AnthropicApi::Messages; diff --git 
a/crates/hermesllm/src/transforms/request/from_openai.rs b/crates/hermesllm/src/transforms/request/from_openai.rs index ddc3b1ca..f2e8ab0d 100644 --- a/crates/hermesllm/src/transforms/request/from_openai.rs +++ b/crates/hermesllm/src/transforms/request/from_openai.rs @@ -10,7 +10,8 @@ use crate::apis::anthropic::{ ToolResultContent, }; use crate::apis::openai::{ - ChatCompletionsRequest, Message, MessageContent, Role, Tool, ToolChoice, ToolChoiceType, + ChatCompletionsRequest, FunctionCall as OpenAIFunctionCall, Message, MessageContent, Role, + Tool, ToolCall as OpenAIToolCall, ToolChoice, ToolChoiceType, }; use crate::apis::openai_responses::{ @@ -65,6 +66,14 @@ impl TryFrom for Vec { Ok(messages) } + InputParam::SingleItem(item) => { + // Some clients send a single object instead of an array. + let nested = ResponsesInputConverter { + input: InputParam::Items(vec![item]), + instructions: converter.instructions, + }; + Vec::::try_from(nested) + } InputParam::Items(items) => { // Convert input items to messages let mut converted_messages = Vec::new(); @@ -82,82 +91,145 @@ impl TryFrom for Vec { // Convert each input item for item in items { - if let InputItem::Message(input_msg) = item { - let role = match input_msg.role { - MessageRole::User => Role::User, - MessageRole::Assistant => Role::Assistant, - MessageRole::System => Role::System, - MessageRole::Developer => Role::System, // Map developer to system - }; + match item { + InputItem::Message(input_msg) => { + let role = match input_msg.role { + MessageRole::User => Role::User, + MessageRole::Assistant => Role::Assistant, + MessageRole::System => Role::System, + MessageRole::Developer => Role::System, // Map developer to system + MessageRole::Tool => Role::Tool, + }; - // Convert content based on MessageContent type - let content = match &input_msg.content { - crate::apis::openai_responses::MessageContent::Text(text) => { - // Simple text content - MessageContent::Text(text.clone()) - } - 
crate::apis::openai_responses::MessageContent::Items(content_items) => { - // Check if it's a single text item (can use simple text format) - if content_items.len() == 1 { - if let InputContent::InputText { text } = &content_items[0] { - MessageContent::Text(text.clone()) + // Convert content based on MessageContent type + let content = match &input_msg.content { + crate::apis::openai_responses::MessageContent::Text(text) => { + // Simple text content + MessageContent::Text(text.clone()) + } + crate::apis::openai_responses::MessageContent::Items( + content_items, + ) => { + // Check if it's a single text item (can use simple text format) + if content_items.len() == 1 { + if let InputContent::InputText { text } = &content_items[0] + { + MessageContent::Text(text.clone()) + } else { + // Single non-text item - use parts format + MessageContent::Parts( + content_items + .iter() + .filter_map(|c| match c { + InputContent::InputText { text } => { + Some(crate::apis::openai::ContentPart::Text { + text: text.clone(), + }) + } + InputContent::InputImage { image_url, .. } => { + Some(crate::apis::openai::ContentPart::ImageUrl { + image_url: crate::apis::openai::ImageUrl { + url: image_url.clone(), + detail: None, + }, + }) + } + InputContent::InputFile { .. } => None, // Skip files for now + InputContent::InputAudio { .. } => None, // Skip audio for now + }) + .collect(), + ) + } } else { - // Single non-text item - use parts format + // Multiple content items - convert to parts MessageContent::Parts( - content_items.iter() + content_items + .iter() .filter_map(|c| match c { InputContent::InputText { text } => { - Some(crate::apis::openai::ContentPart::Text { text: text.clone() }) + Some(crate::apis::openai::ContentPart::Text { + text: text.clone(), + }) } InputContent::InputImage { image_url, .. } => { Some(crate::apis::openai::ContentPart::ImageUrl { image_url: crate::apis::openai::ImageUrl { url: image_url.clone(), detail: None, - } + }, }) } InputContent::InputFile { .. 
} => None, // Skip files for now InputContent::InputAudio { .. } => None, // Skip audio for now }) - .collect() + .collect(), ) } - } else { - // Multiple content items - convert to parts - MessageContent::Parts( - content_items - .iter() - .filter_map(|c| match c { - InputContent::InputText { text } => { - Some(crate::apis::openai::ContentPart::Text { - text: text.clone(), - }) - } - InputContent::InputImage { image_url, .. } => Some( - crate::apis::openai::ContentPart::ImageUrl { - image_url: crate::apis::openai::ImageUrl { - url: image_url.clone(), - detail: None, - }, - }, - ), - InputContent::InputFile { .. } => None, // Skip files for now - InputContent::InputAudio { .. } => None, // Skip audio for now - }) - .collect(), - ) + } + }; + + converted_messages.push(Message { + role, + content: Some(content), + name: None, + tool_call_id: None, + tool_calls: None, + }); + } + InputItem::FunctionCallOutput { + item_type: _, + call_id, + output, + } => { + // Preserve tool result so upstream models do not re-issue the same tool call. + let output_text = match output { + serde_json::Value::String(s) => s.clone(), + other => serde_json::to_string(&other).unwrap_or_default(), + }; + converted_messages.push(Message { + role: Role::Tool, + content: Some(MessageContent::Text(output_text)), + name: None, + tool_call_id: Some(call_id), + tool_calls: None, + }); + } + InputItem::FunctionCall { + item_type: _, + name, + arguments, + call_id, + } => { + let tool_call = OpenAIToolCall { + id: call_id, + call_type: "function".to_string(), + function: OpenAIFunctionCall { name, arguments }, + }; + + // Prefer attaching tool_calls to the preceding assistant message when present. 
+ if let Some(last) = converted_messages.last_mut() { + if matches!(last.role, Role::Assistant) { + if let Some(existing) = &mut last.tool_calls { + existing.push(tool_call); + } else { + last.tool_calls = Some(vec![tool_call]); + } + continue; } } - }; - converted_messages.push(Message { - role, - content: Some(content), - name: None, - tool_call_id: None, - tool_calls: None, - }); + converted_messages.push(Message { + role: Role::Assistant, + content: None, + name: None, + tool_call_id: None, + tool_calls: Some(vec![tool_call]), + }); + } + InputItem::ItemReference { .. } => { + // Item references/unknown entries are metadata-like and can be skipped + // for chat-completions conversion. + } } } @@ -397,6 +469,170 @@ impl TryFrom for ChatCompletionsRequest { type Error = TransformError; fn try_from(req: ResponsesAPIRequest) -> Result { + fn normalize_function_parameters( + parameters: Option, + fallback_extra: Option, + ) -> serde_json::Value { + // ChatCompletions function tools require JSON Schema with top-level type=object. + let mut base = serde_json::json!({ + "type": "object", + "properties": {}, + }); + + if let Some(serde_json::Value::Object(mut obj)) = parameters { + // Enforce a valid object schema shape regardless of upstream tool format. 
+ obj.insert( + "type".to_string(), + serde_json::Value::String("object".to_string()), + ); + if !obj.contains_key("properties") { + obj.insert( + "properties".to_string(), + serde_json::Value::Object(serde_json::Map::new()), + ); + } + base = serde_json::Value::Object(obj); + } + + if let Some(extra) = fallback_extra { + if let serde_json::Value::Object(ref mut map) = base { + map.insert("x-custom-format".to_string(), extra); + } + } + + base + } + + let mut converted_chat_tools: Vec = Vec::new(); + let mut web_search_options: Option = None; + + if let Some(tools) = req.tools.clone() { + for (idx, tool) in tools.into_iter().enumerate() { + match tool { + ResponsesTool::Function { + name, + description, + parameters, + strict, + } => converted_chat_tools.push(Tool { + tool_type: "function".to_string(), + function: crate::apis::openai::Function { + name, + description, + parameters: normalize_function_parameters(parameters, None), + strict, + }, + }), + ResponsesTool::WebSearchPreview { + search_context_size, + user_location, + .. 
+ } => { + if web_search_options.is_none() { + let user_location_value = user_location.map(|loc| { + let mut approx = serde_json::Map::new(); + if let Some(city) = loc.city { + approx.insert( + "city".to_string(), + serde_json::Value::String(city), + ); + } + if let Some(country) = loc.country { + approx.insert( + "country".to_string(), + serde_json::Value::String(country), + ); + } + if let Some(region) = loc.region { + approx.insert( + "region".to_string(), + serde_json::Value::String(region), + ); + } + if let Some(timezone) = loc.timezone { + approx.insert( + "timezone".to_string(), + serde_json::Value::String(timezone), + ); + } + + serde_json::json!({ + "type": loc.location_type, + "approximate": serde_json::Value::Object(approx), + }) + }); + + let mut web_search = serde_json::Map::new(); + if let Some(size) = search_context_size { + web_search.insert( + "search_context_size".to_string(), + serde_json::Value::String(size), + ); + } + if let Some(location) = user_location_value { + web_search.insert("user_location".to_string(), location); + } + web_search_options = Some(serde_json::Value::Object(web_search)); + } + } + ResponsesTool::Custom { + name, + description, + format, + } => { + // Custom tools do not have a strict ChatCompletions equivalent for all + // providers. Map them to a permissive function tool for compatibility. + let tool_name = name.unwrap_or_else(|| format!("custom_tool_{}", idx + 1)); + let parameters = normalize_function_parameters( + Some(serde_json::json!({ + "type": "object", + "properties": { + "input": { "type": "string" } + }, + "required": ["input"], + "additionalProperties": true, + })), + format, + ); + + converted_chat_tools.push(Tool { + tool_type: "function".to_string(), + function: crate::apis::openai::Function { + name: tool_name, + description, + parameters, + strict: Some(false), + }, + }); + } + ResponsesTool::FileSearch { .. 
} => { + return Err(TransformError::UnsupportedConversion( + "FileSearch tool is not supported in ChatCompletions API. Only function/custom/web search tools are supported in this conversion." + .to_string(), + )); + } + ResponsesTool::CodeInterpreter => { + return Err(TransformError::UnsupportedConversion( + "CodeInterpreter tool is not supported in ChatCompletions API conversion." + .to_string(), + )); + } + ResponsesTool::Computer { .. } => { + return Err(TransformError::UnsupportedConversion( + "Computer tool is not supported in ChatCompletions API conversion." + .to_string(), + )); + } + } + } + } + + let tools = if converted_chat_tools.is_empty() { + None + } else { + Some(converted_chat_tools) + }; + // Convert input to messages using the shared converter let converter = ResponsesInputConverter { input: req.input, @@ -418,57 +654,24 @@ impl TryFrom for ChatCompletionsRequest { service_tier: req.service_tier, top_logprobs: req.top_logprobs.map(|t| t as u32), modalities: req.modalities.map(|mods| { - mods.into_iter().map(|m| { - match m { + mods.into_iter() + .map(|m| match m { Modality::Text => "text".to_string(), Modality::Audio => "audio".to_string(), - } - }).collect() + }) + .collect() }), - stream_options: req.stream_options.map(|opts| { - crate::apis::openai::StreamOptions { + stream_options: req + .stream_options + .map(|opts| crate::apis::openai::StreamOptions { include_usage: opts.include_usage, - } + }), + reasoning_effort: req.reasoning_effort.map(|effort| match effort { + ReasoningEffort::Low => "low".to_string(), + ReasoningEffort::Medium => "medium".to_string(), + ReasoningEffort::High => "high".to_string(), }), - reasoning_effort: req.reasoning_effort.map(|effort| { - match effort { - ReasoningEffort::Low => "low".to_string(), - ReasoningEffort::Medium => "medium".to_string(), - ReasoningEffort::High => "high".to_string(), - } - }), - tools: req.tools.map(|tools| { - tools.into_iter().map(|tool| { - - // Only convert Function tools - other types 
are not supported in ChatCompletions - match tool { - ResponsesTool::Function { name, description, parameters, strict } => Ok(Tool { - tool_type: "function".to_string(), - function: crate::apis::openai::Function { - name, - description, - parameters: parameters.unwrap_or_else(|| serde_json::json!({ - "type": "object", - "properties": {} - })), - strict, - } - }), - ResponsesTool::FileSearch { .. } => Err(TransformError::UnsupportedConversion( - "FileSearch tool is not supported in ChatCompletions API. Only function tools are supported.".to_string() - )), - ResponsesTool::WebSearchPreview { .. } => Err(TransformError::UnsupportedConversion( - "WebSearchPreview tool is not supported in ChatCompletions API. Only function tools are supported.".to_string() - )), - ResponsesTool::CodeInterpreter => Err(TransformError::UnsupportedConversion( - "CodeInterpreter tool is not supported in ChatCompletions API. Only function tools are supported.".to_string() - )), - ResponsesTool::Computer { .. } => Err(TransformError::UnsupportedConversion( - "Computer tool is not supported in ChatCompletions API. Only function tools are supported.".to_string() - )), - } - }).collect::, _>>() - }).transpose()?, + tools, tool_choice: req.tool_choice.map(|choice| { match choice { ResponsesToolChoice::String(s) => { @@ -481,11 +684,14 @@ impl TryFrom for ChatCompletionsRequest { } ResponsesToolChoice::Named { function, .. 
} => ToolChoice::Function { choice_type: "function".to_string(), - function: crate::apis::openai::FunctionChoice { name: function.name } - } + function: crate::apis::openai::FunctionChoice { + name: function.name, + }, + }, } }), parallel_tool_calls: req.parallel_tool_calls, + web_search_options, ..Default::default() }) } @@ -1027,4 +1233,235 @@ mod tests { panic!("Expected text content block"); } } + + #[test] + fn test_responses_custom_tool_maps_to_function_tool_for_chat_completions() { + use crate::apis::openai_responses::{ + InputParam, ResponsesAPIRequest, Tool as ResponsesTool, + }; + + let req = ResponsesAPIRequest { + model: "gpt-5.3-codex".to_string(), + input: InputParam::Text("use custom tool".to_string()), + tools: Some(vec![ResponsesTool::Custom { + name: Some("run_patch".to_string()), + description: Some("Apply structured patch".to_string()), + format: Some(serde_json::json!({ + "kind": "patch", + "version": "v1" + })), + }]), + include: None, + parallel_tool_calls: None, + store: None, + instructions: None, + stream: None, + stream_options: None, + conversation: None, + tool_choice: None, + max_output_tokens: None, + temperature: None, + top_p: None, + metadata: None, + previous_response_id: None, + modalities: None, + audio: None, + text: None, + reasoning_effort: None, + truncation: None, + user: None, + max_tool_calls: None, + service_tier: None, + background: None, + top_logprobs: None, + }; + + let converted = ChatCompletionsRequest::try_from(req).expect("conversion should succeed"); + let tools = converted.tools.expect("tools should be present"); + assert_eq!(tools.len(), 1); + assert_eq!(tools[0].tool_type, "function"); + assert_eq!(tools[0].function.name, "run_patch"); + assert_eq!( + tools[0].function.description.as_deref(), + Some("Apply structured patch") + ); + } + + #[test] + fn test_responses_web_search_maps_to_chat_web_search_options() { + use crate::apis::openai_responses::{ + InputParam, ResponsesAPIRequest, Tool as ResponsesTool, 
UserLocation, + }; + + let req = ResponsesAPIRequest { + model: "gpt-5.3-codex".to_string(), + input: InputParam::Text("find project docs".to_string()), + tools: Some(vec![ResponsesTool::WebSearchPreview { + domains: Some(vec!["docs.planoai.dev".to_string()]), + search_context_size: Some("medium".to_string()), + user_location: Some(UserLocation { + location_type: "approximate".to_string(), + city: Some("San Francisco".to_string()), + country: Some("US".to_string()), + region: Some("CA".to_string()), + timezone: Some("America/Los_Angeles".to_string()), + }), + }]), + include: None, + parallel_tool_calls: None, + store: None, + instructions: None, + stream: None, + stream_options: None, + conversation: None, + tool_choice: None, + max_output_tokens: None, + temperature: None, + top_p: None, + metadata: None, + previous_response_id: None, + modalities: None, + audio: None, + text: None, + reasoning_effort: None, + truncation: None, + user: None, + max_tool_calls: None, + service_tier: None, + background: None, + top_logprobs: None, + }; + + let converted = ChatCompletionsRequest::try_from(req).expect("conversion should succeed"); + assert!(converted.web_search_options.is_some()); + } + + #[test] + fn test_responses_function_call_output_maps_to_tool_message() { + use crate::apis::openai_responses::{ + InputItem, InputParam, ResponsesAPIRequest, Tool as ResponsesTool, + }; + + let req = ResponsesAPIRequest { + model: "gpt-5.3-codex".to_string(), + input: InputParam::Items(vec![InputItem::FunctionCallOutput { + item_type: "function_call_output".to_string(), + call_id: "call_123".to_string(), + output: serde_json::json!({"status":"ok","stdout":"hello"}), + }]), + tools: Some(vec![ResponsesTool::Function { + name: "exec_command".to_string(), + description: Some("Execute a shell command".to_string()), + parameters: Some(serde_json::json!({ + "type": "object", + "properties": { + "cmd": { "type": "string" } + }, + "required": ["cmd"] + })), + strict: Some(false), + }]), + 
include: None, + parallel_tool_calls: None, + store: None, + instructions: None, + stream: None, + stream_options: None, + conversation: None, + tool_choice: None, + max_output_tokens: None, + temperature: None, + top_p: None, + metadata: None, + previous_response_id: None, + modalities: None, + audio: None, + text: None, + reasoning_effort: None, + truncation: None, + user: None, + max_tool_calls: None, + service_tier: None, + background: None, + top_logprobs: None, + }; + + let converted = ChatCompletionsRequest::try_from(req).expect("conversion should succeed"); + assert_eq!(converted.messages.len(), 1); + assert!(matches!(converted.messages[0].role, Role::Tool)); + assert_eq!( + converted.messages[0].tool_call_id.as_deref(), + Some("call_123") + ); + } + + #[test] + fn test_responses_function_call_and_output_preserve_call_id_link() { + use crate::apis::openai_responses::{ + InputItem, InputMessage, MessageContent as ResponsesMessageContent, MessageRole, + ResponsesAPIRequest, + }; + + let req = ResponsesAPIRequest { + model: "gpt-5.3-codex".to_string(), + input: InputParam::Items(vec![ + InputItem::Message(InputMessage { + role: MessageRole::Assistant, + content: ResponsesMessageContent::Items(vec![]), + }), + InputItem::FunctionCall { + item_type: "function_call".to_string(), + name: "exec_command".to_string(), + arguments: "{\"cmd\":\"pwd\"}".to_string(), + call_id: "toolu_abc123".to_string(), + }, + InputItem::FunctionCallOutput { + item_type: "function_call_output".to_string(), + call_id: "toolu_abc123".to_string(), + output: serde_json::Value::String("ok".to_string()), + }, + ]), + tools: None, + include: None, + parallel_tool_calls: None, + store: None, + instructions: None, + stream: None, + stream_options: None, + conversation: None, + tool_choice: None, + max_output_tokens: None, + temperature: None, + top_p: None, + metadata: None, + previous_response_id: None, + modalities: None, + audio: None, + text: None, + reasoning_effort: None, + truncation: 
None, + user: None, + max_tool_calls: None, + service_tier: None, + background: None, + top_logprobs: None, + }; + + let converted = ChatCompletionsRequest::try_from(req).expect("conversion should succeed"); + assert_eq!(converted.messages.len(), 2); + + assert!(matches!(converted.messages[0].role, Role::Assistant)); + let tool_calls = converted.messages[0] + .tool_calls + .as_ref() + .expect("assistant tool_calls should be present"); + assert_eq!(tool_calls.len(), 1); + assert_eq!(tool_calls[0].id, "toolu_abc123"); + + assert!(matches!(converted.messages[1].role, Role::Tool)); + assert_eq!( + converted.messages[1].tool_call_id.as_deref(), + Some("toolu_abc123") + ); + } } diff --git a/crates/hermesllm/src/transforms/response_streaming/to_openai_streaming.rs b/crates/hermesllm/src/transforms/response_streaming/to_openai_streaming.rs index 328317bc..4aa719af 100644 --- a/crates/hermesllm/src/transforms/response_streaming/to_openai_streaming.rs +++ b/crates/hermesllm/src/transforms/response_streaming/to_openai_streaming.rs @@ -512,19 +512,12 @@ impl TryFrom for ResponsesAPIStreamEvent { } } - // Handle finish_reason - this is a completion signal - // Return an empty delta that the buffer can use to detect completion + // Handle finish_reason - this is a completion signal. + // Emit an explicit Done marker so the buffering layer can finalize + // even if an upstream [DONE] marker is missing/delayed. 
if choice.finish_reason.is_some() { - // Return a minimal text delta to signal completion - // The buffer will handle the finish_reason and generate response.completed - return Ok(ResponsesAPIStreamEvent::ResponseOutputTextDelta { - item_id: "".to_string(), // Buffer will fill this - output_index: choice.index as i32, - content_index: 0, - delta: "".to_string(), // Empty delta signals completion - logprobs: vec![], - obfuscation: None, - sequence_number: 0, // Buffer will fill this + return Ok(ResponsesAPIStreamEvent::Done { + sequence_number: 0, // Buffer will assign final sequence }); } diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index 547ba166..7a353bcb 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -1046,7 +1046,8 @@ impl HttpContext for StreamContext { ); match ProviderRequestType::try_from((deserialized_client_request, upstream)) { - Ok(request) => { + Ok(mut request) => { + request.normalize_for_upstream(self.get_provider_id(), upstream); debug!( "request_id={}: upstream request payload: {}", self.request_identifier(), diff --git a/demos/README.md b/demos/README.md index a2613454..6e467a33 100644 --- a/demos/README.md +++ b/demos/README.md @@ -16,6 +16,7 @@ This directory contains demos showcasing Plano's capabilities as an AI-native pr | [Preference-Based Routing](llm_routing/preference_based_routing/) | Routes prompts to LLMs based on user-defined preferences and task type (e.g. code generation vs. 
understanding) | | [Model Alias Routing](llm_routing/model_alias_routing/) | Maps semantic aliases (`arch.summarize.v1`) to provider-specific models for centralized governance | | [Claude Code Router](llm_routing/claude_code_router/) | Extends Claude Code with multi-provider access and preference-aligned routing for coding tasks | +| [Codex Router](llm_routing/codex_router/) | Extends Codex CLI with multi-provider access and preference-aligned routing for coding tasks | ## Agent Orchestration diff --git a/demos/llm_routing/codex_router/README.md b/demos/llm_routing/codex_router/README.md new file mode 100644 index 00000000..d3662581 --- /dev/null +++ b/demos/llm_routing/codex_router/README.md @@ -0,0 +1,92 @@ +# Codex Router - Multi-Model Access with Intelligent Routing + +Plano extends Codex CLI to access multiple LLM providers through a single interface. This gives you: + +1. **Access to Models**: Connect to OpenAI, Anthropic, xAI, Gemini, and local models via Ollama +2. **Intelligent Routing via Preferences for Coding Tasks**: Configure which models handle specific development tasks: + - Code generation and implementation + - Code understanding and analysis + - Debugging and optimization + - Architecture and system design + +Uses a [1.5B preference-aligned router LLM](https://arxiv.org/abs/2506.16655) to automatically select the best model based on your request type. 
+
+## Benefits
+
+- **Single Interface**: Access multiple LLM providers through the same Codex CLI
+- **Task-Aware Routing**: Requests are analyzed and routed to models based on task type (code generation vs code understanding)
+- **Provider Flexibility**: Add or remove providers without changing your workflow
+- **Routing Transparency**: See which model handles each request and why
+
+## Quick Start
+
+### Prerequisites
+
+```bash
+# Install Codex CLI
+npm install -g @openai/codex
+
+# Install Plano CLI
+pip install planoai
+```
+
+### Step 1: Open the Demo
+
+```bash
+git clone https://github.com/katanemo/arch.git
+cd arch/demos/llm_routing/codex_router
+```
+
+### Step 2: Set API Keys
+
+```bash
+export OPENAI_API_KEY="your-openai-key-here"
+export ANTHROPIC_API_KEY="your-anthropic-key-here"
+export GROK_API_KEY="your-xai-key-here"
+export GEMINI_API_KEY="your-gemini-key-here"
+```
+
+### Step 3: Start Plano
+
+```bash
+planoai up
+# or: uvx planoai up
+```
+
+### Step 4: Launch Codex Through Plano
+
+```bash
+planoai cli-agent codex
+# or: uvx planoai cli-agent codex
+```
+
+By default, `planoai cli-agent codex` starts Codex with `gpt-5.3-codex`. With this demo config:
+
+- `project understanding` prompts are routed to `grok-4-1-fast-non-reasoning`
+- `code generation` prompts are routed to `gpt-5.3-codex`
+
+## Monitor Routing Decisions
+
+In a second terminal:
+
+```bash
+sh pretty_model_resolution.sh
+```
+
+This shows each request model and the final model selected by Plano's router.
+
+## Configuration Highlights
+
+`config.yaml` demonstrates:
+
+- OpenAI default model for Codex sessions (`gpt-5.3-codex`)
+- Routing preference for project understanding (`grok-4-1-fast-non-reasoning`)
+- Additional providers (Anthropic, xAI, Ollama local) to show cross-provider routing support
+
+## Optional Overrides
+
+Set a different Codex session model:
+
+```bash
+planoai cli-agent codex --settings='{"CODEX_MODEL":"gpt-5-2025-08-07"}'
+```
diff --git a/demos/llm_routing/codex_router/config.yaml b/demos/llm_routing/codex_router/config.yaml
new file mode 100644
index 00000000..7cafe641
--- /dev/null
+++ b/demos/llm_routing/codex_router/config.yaml
@@ -0,0 +1,38 @@
+version: v0.3.0
+
+listeners:
+  - type: model
+    name: model_listener
+    port: 12000
+
+model_providers:
+  # OpenAI models used by Codex defaults and preference routing
+  - model: openai/gpt-5.3-codex
+    default: true
+    access_key: $OPENAI_API_KEY
+    routing_preferences:
+      - name: code generation
+        description: generating new code snippets, functions, or boilerplate based on user prompts or requirements
+
+  - model: xai/grok-4-1-fast-non-reasoning
+    access_key: $GROK_API_KEY
+    routing_preferences:
+      - name: project understanding
+        description: understand repository structure, codebase, and code files, readmes, and other documentation
+
+  # Additional providers (optional): Codex can route to any configured model
+  # - model: anthropic/claude-sonnet-4-5
+  #   access_key: $ANTHROPIC_API_KEY
+
+  # - model: xai/grok-4-1-fast-non-reasoning
+  #   access_key: $GROK_API_KEY
+
+  - model: ollama/llama3.1
+    base_url: http://localhost:11434
+
+model_aliases:
+  arch.codex.default:
+    target: gpt-5.3-codex
+
+tracing:
+  random_sampling: 100
diff --git a/demos/llm_routing/codex_router/pretty_model_resolution.sh b/demos/llm_routing/codex_router/pretty_model_resolution.sh
new file mode 100644
index 00000000..b6187e65
--- /dev/null
+++ b/demos/llm_routing/codex_router/pretty_model_resolution.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash +# Pretty-print Plano MODEL_RESOLUTION lines from docker logs +# - hides Arch-Router +# - prints timestamp +# - colors MODEL_RESOLUTION red +# - colors req_model cyan +# - colors resolved_model magenta +# - removes provider and streaming + +docker logs -f plano 2>&1 \ +| awk ' +/MODEL_RESOLUTION:/ && $0 !~ /Arch-Router/ { + # extract timestamp between first [ and ] + ts="" + if (match($0, /\[[0-9-]+ [0-9:.]+\]/)) { + ts=substr($0, RSTART+1, RLENGTH-2) + } + + # split out after MODEL_RESOLUTION: + n = split($0, parts, /MODEL_RESOLUTION: */) + line = parts[2] + + # remove provider and streaming fields + sub(/ *provider='\''[^'\'']+'\''/, "", line) + sub(/ *streaming=(true|false)/, "", line) + + # highlight fields + gsub(/req_model='\''[^'\'']+'\''/, "\033[36m&\033[0m", line) + gsub(/resolved_model='\''[^'\'']+'\''/, "\033[35m&\033[0m", line) + + # print timestamp + MODEL_RESOLUTION + printf "\033[90m[%s]\033[0m \033[31mMODEL_RESOLUTION\033[0m: %s\n", ts, line +}' From b4313d93a480f2470a5630451305697999fed8b1 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Wed, 11 Mar 2026 12:49:36 -0700 Subject: [PATCH 5/8] Run demos without Docker (#809) --- demos/advanced/currency_exchange/run_demo.sh | 23 ++--- .../multi_turn_rag/docker-compose.yaml | 11 --- demos/advanced/multi_turn_rag/pyproject.toml | 12 +++ demos/advanced/multi_turn_rag/run_demo.sh | 29 ++++--- demos/advanced/multi_turn_rag/start_agents.sh | 24 ++++++ demos/advanced/stock_quote/run_demo.sh | 23 ++--- .../multi_agent_crewai_langchain/README.md | 27 ++++-- .../multi_agent_crewai_langchain/config.yaml | 4 +- .../docker-compose.yaml | 25 +----- .../multi_agent_crewai_langchain/run_demo.sh | 33 ++++---- .../start_agents.sh | 30 +++++++ .../travel_agents/README.md | 29 +++++-- .../travel_agents/docker-compose.yaml | 32 +------ .../travel_agents/run_demo.sh | 28 +++++-- .../travel_agents/start_agents.sh | 30 +++++++ demos/filter_chains/http_filter/README.md | 26 +++--- 
demos/filter_chains/http_filter/config.yaml | 8 +- .../http_filter/docker-compose.yaml | 14 +--- demos/filter_chains/http_filter/run_demo.sh | 28 +++++-- .../filter_chains/http_filter/start_agents.sh | 84 +++++-------------- demos/filter_chains/mcp_filter/README.md | 20 +++-- .../mcp_filter/docker-compose.yaml | 14 +--- demos/filter_chains/mcp_filter/run_demo.sh | 28 +++++-- demos/getting_started/llm_gateway/README.md | 11 ++- demos/getting_started/llm_gateway/run_demo.sh | 23 ++--- .../weather_forecast/README.md | 19 ++++- .../weather_forecast/docker-compose.yaml | 10 --- .../weather_forecast/run_demo.sh | 29 ++++--- .../weather_forecast/start_agents.sh | 24 ++++++ demos/integrations/ollama/run_demo.sh | 52 ++++++++---- .../spotify_bearer_auth/run_demo.sh | 23 ++--- .../preference_based_routing/README.md | 18 ++-- .../preference_based_routing/run_demo.sh | 22 ++--- tests/e2e/run_e2e_tests.sh | 11 ++- tests/e2e/run_prompt_gateway_tests.sh | 11 ++- 35 files changed, 488 insertions(+), 347 deletions(-) create mode 100644 demos/advanced/multi_turn_rag/pyproject.toml create mode 100755 demos/advanced/multi_turn_rag/start_agents.sh create mode 100755 demos/agent_orchestration/multi_agent_crewai_langchain/start_agents.sh create mode 100755 demos/agent_orchestration/travel_agents/start_agents.sh mode change 100644 => 100755 demos/filter_chains/http_filter/start_agents.sh create mode 100755 demos/getting_started/weather_forecast/start_agents.sh diff --git a/demos/advanced/currency_exchange/run_demo.sh b/demos/advanced/currency_exchange/run_demo.sh index 6623dee5..e430a1cd 100644 --- a/demos/advanced/currency_exchange/run_demo.sh +++ b/demos/advanced/currency_exchange/run_demo.sh @@ -18,22 +18,24 @@ start_demo() { echo ".env file created with OPENAI_API_KEY." 
fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (AnythingLLM, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM, Jaeger)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - - # Step 4: Start developer services - echo "Starting Network Agent using Docker Compose..." - docker compose up -d # Run in detached mode } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping Network Agent using Docker Compose..." - docker compose down + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true - # Step 2: Stop Plano + # Stop Plano echo "Stopping Plano..." planoai down } @@ -42,6 +44,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - # Default action is to bring the demo up - start_demo + start_demo "$1" fi diff --git a/demos/advanced/multi_turn_rag/docker-compose.yaml b/demos/advanced/multi_turn_rag/docker-compose.yaml index 1c3ed73c..f36987e4 100644 --- a/demos/advanced/multi_turn_rag/docker-compose.yaml +++ b/demos/advanced/multi_turn_rag/docker-compose.yaml @@ -1,15 +1,4 @@ services: - rag_energy_source_agent: - build: - context: . 
- dockerfile: Dockerfile - ports: - - "18083:80" - healthcheck: - test: ["CMD", "curl" ,"http://localhost:80/healthz"] - interval: 5s - retries: 20 - anythingllm: image: mintplexlabs/anythingllm restart: always diff --git a/demos/advanced/multi_turn_rag/pyproject.toml b/demos/advanced/multi_turn_rag/pyproject.toml new file mode 100644 index 00000000..05824bd6 --- /dev/null +++ b/demos/advanced/multi_turn_rag/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "multi-turn-rag" +version = "0.1.0" +requires-python = ">=3.12" +dependencies = [ + "fastapi", + "uvicorn", + "pydantic>=2.8", + "httpx>=0.27", + "openai>=1.51", + "python-dotenv>=1.0", +] diff --git a/demos/advanced/multi_turn_rag/run_demo.sh b/demos/advanced/multi_turn_rag/run_demo.sh index f9434aa2..5bec6368 100644 --- a/demos/advanced/multi_turn_rag/run_demo.sh +++ b/demos/advanced/multi_turn_rag/run_demo.sh @@ -18,22 +18,32 @@ start_demo() { echo ".env file created with OPENAI_API_KEY." fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (AnythingLLM) + # UI services must start before Plano to avoid OTEL port conflicts + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - # Step 4: Start Network Agent - echo "Starting HR Agent using Docker Compose..." - docker compose up -d # Run in detached mode + # Step 5: Start agents natively + echo "Starting agents..." + bash start_agents.sh & } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping HR Agent using Docker Compose..." - docker compose down -v + # Stop agents + echo "Stopping agents..." + pkill -f start_agents.sh 2>/dev/null || true - # Step 2: Stop Plano + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true + + # Stop Plano echo "Stopping Plano..." 
planoai down } @@ -42,6 +52,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - # Default action is to bring the demo up - start_demo + start_demo "$1" fi diff --git a/demos/advanced/multi_turn_rag/start_agents.sh b/demos/advanced/multi_turn_rag/start_agents.sh new file mode 100755 index 00000000..00b7f1b1 --- /dev/null +++ b/demos/advanced/multi_turn_rag/start_agents.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -e + +PIDS=() + +log() { echo "$(date '+%F %T') - $*"; } + +cleanup() { + log "Stopping agents..." + for PID in "${PIDS[@]}"; do + kill $PID 2>/dev/null && log "Stopped process $PID" + done + exit 0 +} + +trap cleanup EXIT INT TERM + +log "Starting rag_energy_source_agent on port 18083..." +uv run uvicorn main:app --host 0.0.0.0 --port 18083 & +PIDS+=($!) + +for PID in "${PIDS[@]}"; do + wait "$PID" +done diff --git a/demos/advanced/stock_quote/run_demo.sh b/demos/advanced/stock_quote/run_demo.sh index 6623dee5..e430a1cd 100644 --- a/demos/advanced/stock_quote/run_demo.sh +++ b/demos/advanced/stock_quote/run_demo.sh @@ -18,22 +18,24 @@ start_demo() { echo ".env file created with OPENAI_API_KEY." fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (AnythingLLM, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM, Jaeger)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - - # Step 4: Start developer services - echo "Starting Network Agent using Docker Compose..." - docker compose up -d # Run in detached mode } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping Network Agent using Docker Compose..." - docker compose down + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true - # Step 2: Stop Plano + # Stop Plano echo "Stopping Plano..." 
planoai down } @@ -42,6 +44,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - # Default action is to bring the demo up - start_demo + start_demo "$1" fi diff --git a/demos/agent_orchestration/multi_agent_crewai_langchain/README.md b/demos/agent_orchestration/multi_agent_crewai_langchain/README.md index e2fe23fb..97d71e7f 100644 --- a/demos/agent_orchestration/multi_agent_crewai_langchain/README.md +++ b/demos/agent_orchestration/multi_agent_crewai_langchain/README.md @@ -41,21 +41,36 @@ cd demos/agent_orchestration/multi_agent_crewai_langchain ./run_demo.sh ``` -This starts Plano natively and brings up via Docker Compose: +This starts Plano natively and runs agents as local processes: - **CrewAI Flight Agent** (port 10520) - flight search - **LangChain Weather Agent** (port 10510) - weather forecasts -- **AnythingLLM** (port 3001) - chat interface -- **Jaeger** (port 16686) - distributed tracing Plano runs natively on the host (ports 12000, 8001). +To also start AnythingLLM (chat UI), Jaeger (tracing), and other optional services: + +```bash +./run_demo.sh --with-ui +``` + +This additionally starts: +- **AnythingLLM** (port 3001) - chat interface +- **Jaeger** (port 16686) - distributed tracing + ### Try It Out -1. **Open the Chat Interface** +1. **Using curl** + ```bash + curl -X POST http://localhost:8001/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "gpt-4o", "messages": [{"role": "user", "content": "What is the weather in San Francisco?"}]}' + ``` + +2. **Using AnythingLLM (requires `--with-ui`)** - Navigate to [http://localhost:3001](http://localhost:3001) - Create an account (stored locally) -2. **Ask Multi-Agent Questions** +3. **Ask Multi-Agent Questions** ``` "What's the weather in San Francisco and can you find flights from Seattle to San Francisco?" ``` @@ -65,7 +80,7 @@ Plano runs natively on the host (ports 12000, 8001). - Routes the flight part to the CrewAI agent - Combines responses seamlessly -3. 
**View Distributed Traces** +4. **View Distributed Traces (requires `--with-ui`)** - Open [http://localhost:16686](http://localhost:16686) (Jaeger UI) - See how requests flow through both agents diff --git a/demos/agent_orchestration/multi_agent_crewai_langchain/config.yaml b/demos/agent_orchestration/multi_agent_crewai_langchain/config.yaml index b3a204f3..ef522337 100644 --- a/demos/agent_orchestration/multi_agent_crewai_langchain/config.yaml +++ b/demos/agent_orchestration/multi_agent_crewai_langchain/config.yaml @@ -2,9 +2,9 @@ version: v0.3.0 agents: - id: weather_agent - url: http://langchain-weather-agent:10510 + url: http://localhost:10510 - id: flight_agent - url: http://crewai-flight-agent:10520 + url: http://localhost:10520 model_providers: - model: openai/gpt-4o diff --git a/demos/agent_orchestration/multi_agent_crewai_langchain/docker-compose.yaml b/demos/agent_orchestration/multi_agent_crewai_langchain/docker-compose.yaml index 2d9c180b..74954562 100644 --- a/demos/agent_orchestration/multi_agent_crewai_langchain/docker-compose.yaml +++ b/demos/agent_orchestration/multi_agent_crewai_langchain/docker-compose.yaml @@ -1,27 +1,5 @@ services: - crewai-flight-agent: - build: - dockerfile: Dockerfile - restart: always - ports: - - "10520:10520" - environment: - - LLM_GATEWAY_ENDPOINT=http://host.docker.internal:12000/v1 - - AEROAPI_KEY=${AEROAPI_KEY:?AEROAPI_KEY environment variable is required but not set} - - PYTHONUNBUFFERED=1 - command: ["python", "-u", "crewai/flight_agent.py"] - - langchain-weather-agent: - build: - dockerfile: Dockerfile - restart: always - ports: - - "10510:10510" - environment: - - LLM_GATEWAY_ENDPOINT=http://host.docker.internal:12000/v1 - command: ["python", "-u", "langchain/weather_agent.py"] - anythingllm: image: mintplexlabs/anythingllm restart: always @@ -36,6 +14,8 @@ services: - GENERIC_OPEN_AI_MODEL_PREF=gpt-4o-mini - GENERIC_OPEN_AI_MODEL_TOKEN_LIMIT=128000 - GENERIC_OPEN_AI_API_KEY=sk-placeholder + extra_hosts: + - 
"host.docker.internal:host-gateway" jaeger: build: @@ -44,3 +24,4 @@ services: ports: - "16686:16686" # Jaeger UI - "4317:4317" # OTLP gRPC receiver + - "4318:4318" # OTLP HTTP receiver diff --git a/demos/agent_orchestration/multi_agent_crewai_langchain/run_demo.sh b/demos/agent_orchestration/multi_agent_crewai_langchain/run_demo.sh index b7dc0fad..35bbbbdd 100755 --- a/demos/agent_orchestration/multi_agent_crewai_langchain/run_demo.sh +++ b/demos/agent_orchestration/multi_agent_crewai_langchain/run_demo.sh @@ -12,33 +12,38 @@ start_demo() { echo "Error: OPENAI_API_KEY environment variable is not set for the demo." exit 1 fi - if [ -z "$AEROAPI_KEY" ]; then - echo "Error: AEROAPI_KEY environment variable is not set for the demo." - exit 1 - fi echo "Creating .env file..." echo "OPENAI_API_KEY=$OPENAI_API_KEY" > .env - echo "AEROAPI_KEY=$AEROAPI_KEY" >> .env echo ".env file created with API keys." fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (AnythingLLM, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM, Jaeger)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - # Step 4: Start agents and services - echo "Starting agents using Docker Compose..." - docker compose up -d + # Step 5: Start agents natively + echo "Starting agents..." + bash start_agents.sh & } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping Docker Compose services..." - docker compose down + # Stop agents + echo "Stopping agents..." + pkill -f start_agents.sh 2>/dev/null || true - # Step 2: Stop Plano + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true + + # Stop Plano echo "Stopping Plano..." 
planoai down } @@ -47,5 +52,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - start_demo + start_demo "$1" fi diff --git a/demos/agent_orchestration/multi_agent_crewai_langchain/start_agents.sh b/demos/agent_orchestration/multi_agent_crewai_langchain/start_agents.sh new file mode 100755 index 00000000..78d2fecb --- /dev/null +++ b/demos/agent_orchestration/multi_agent_crewai_langchain/start_agents.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +PIDS=() + +log() { echo "$(date '+%F %T') - $*"; } + +cleanup() { + log "Stopping agents..." + for PID in "${PIDS[@]}"; do + kill $PID 2>/dev/null && log "Stopped process $PID" + done + exit 0 +} + +trap cleanup EXIT INT TERM + +export LLM_GATEWAY_ENDPOINT=http://localhost:12000/v1 + +log "Starting langchain weather_agent on port 10510..." +uv run python langchain/weather_agent.py & +PIDS+=($!) + +log "Starting crewai flight_agent on port 10520..." +uv run python crewai/flight_agent.py & +PIDS+=($!) + +for PID in "${PIDS[@]}"; do + wait "$PID" +done diff --git a/demos/agent_orchestration/travel_agents/README.md b/demos/agent_orchestration/travel_agents/README.md index d6468612..7886539d 100644 --- a/demos/agent_orchestration/travel_agents/README.md +++ b/demos/agent_orchestration/travel_agents/README.md @@ -23,9 +23,10 @@ All agents use Plano's agent orchestration LLM to intelligently route user reque ## Prerequisites - [Plano CLI](https://docs.planoai.dev/get_started/quickstart.html#prerequisites) installed (`pip install planoai`) -- Docker and Docker Compose (for agent services) +- [uv](https://docs.astral.sh/uv/) installed (for running agents natively) - [OpenAI API key](https://platform.openai.com/api-keys) - [FlightAware AeroAPI key](https://www.flightaware.com/aeroapi/portal) +- Docker and Docker Compose (optional, only needed for `--with-ui`) > **Note:** You'll need to obtain a FlightAware AeroAPI key for live flight data. 
Visit [https://www.flightaware.com/aeroapi/portal](https://www.flightaware.com/aeroapi/portal) to get your API key. @@ -46,16 +47,34 @@ export OPENAI_API_KEY="your OpenAI api key" ./run_demo.sh ``` -This starts Plano natively and brings up via Docker Compose: +This starts Plano natively and runs agents as local processes: - Weather Agent on port 10510 - Flight Agent on port 10520 -- Open WebUI on port 8080 Plano runs natively on the host (port 8001). +To also start Open WebUI, Jaeger tracing, and other optional services, pass `--with-ui`: + +```bash +./run_demo.sh --with-ui +``` + +This additionally starts: +- Open WebUI on port 8080 +- Jaeger tracing UI on port 16686 + ### 4. Test the System -Use Open WebUI at http://localhost:8080 +**Option A: Using curl** +```bash +curl -X POST http://localhost:8001/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "gpt-5.2", "messages": [{"role": "user", "content": "What is the weather in Istanbul?"}]}' +``` + +**Option B: Using Open WebUI (requires `--with-ui`)** + +Navigate to http://localhost:8080 > **Note:** The Open WebUI may take a few minutes to start up and be fully ready. Please wait for the container to finish initializing before accessing the interface. Once ready, make sure to select the **gpt-5.2** model from the model dropdown menu in the UI. @@ -102,7 +121,7 @@ Each agent: 3. Generates response using GPT-5.2 4. Streams response back to user -Both agents run as Docker containers and communicate with Plano running natively on the host. +Both agents run as native local processes and communicate with Plano running natively on the host. 
## Observability diff --git a/demos/agent_orchestration/travel_agents/docker-compose.yaml b/demos/agent_orchestration/travel_agents/docker-compose.yaml index f0fb78e5..66edbdc3 100644 --- a/demos/agent_orchestration/travel_agents/docker-compose.yaml +++ b/demos/agent_orchestration/travel_agents/docker-compose.yaml @@ -1,32 +1,5 @@ services: - weather-agent: - build: - context: . - dockerfile: Dockerfile - container_name: weather-agent - restart: always - ports: - - "10510:10510" - environment: - - LLM_GATEWAY_ENDPOINT=http://host.docker.internal:12000/v1 - command: ["uv", "run", "python", "src/travel_agents/weather_agent.py"] - extra_hosts: - - "host.docker.internal:host-gateway" - flight-agent: - build: - context: . - dockerfile: Dockerfile - container_name: flight-agent - restart: always - ports: - - "10520:10520" - environment: - - LLM_GATEWAY_ENDPOINT=http://host.docker.internal:12000/v1 - - AEROAPI_KEY=${AEROAPI_KEY:? AEROAPI_KEY environment variable is required but not set} - command: ["uv", "run", "python", "src/travel_agents/flight_agent.py"] - extra_hosts: - - "host.docker.internal:host-gateway" open-web-ui: image: dyrnq/open-webui:main restart: always @@ -40,9 +13,8 @@ services: - ENABLE_TITLE_GENERATION=false - ENABLE_TAGS_GENERATION=false - ENABLE_AUTOCOMPLETE_GENERATION=false - depends_on: - - weather-agent - - flight-agent + extra_hosts: + - "host.docker.internal:host-gateway" jaeger: build: context: ../../shared/jaeger diff --git a/demos/agent_orchestration/travel_agents/run_demo.sh b/demos/agent_orchestration/travel_agents/run_demo.sh index b7dc0fad..643a0aa2 100755 --- a/demos/agent_orchestration/travel_agents/run_demo.sh +++ b/demos/agent_orchestration/travel_agents/run_demo.sh @@ -23,22 +23,32 @@ start_demo() { echo ".env file created with API keys." 
fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (Open WebUI, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (Open WebUI, Jaeger)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - # Step 4: Start agents and services - echo "Starting agents using Docker Compose..." - docker compose up -d + # Step 5: Start agents natively + echo "Starting agents..." + bash start_agents.sh & } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping Docker Compose services..." - docker compose down + # Stop agents + echo "Stopping agents..." + pkill -f start_agents.sh 2>/dev/null || true - # Step 2: Stop Plano + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true + + # Stop Plano echo "Stopping Plano..." planoai down } @@ -47,5 +57,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - start_demo + start_demo "$1" fi diff --git a/demos/agent_orchestration/travel_agents/start_agents.sh b/demos/agent_orchestration/travel_agents/start_agents.sh new file mode 100755 index 00000000..4f2e32a7 --- /dev/null +++ b/demos/agent_orchestration/travel_agents/start_agents.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +PIDS=() + +log() { echo "$(date '+%F %T') - $*"; } + +cleanup() { + log "Stopping agents..." + for PID in "${PIDS[@]}"; do + kill $PID 2>/dev/null && log "Stopped process $PID" + done + exit 0 +} + +trap cleanup EXIT INT TERM + +export LLM_GATEWAY_ENDPOINT=http://localhost:12000/v1 + +log "Starting weather_agent on port 10510..." +uv run python src/travel_agents/weather_agent.py & +PIDS+=($!) + +log "Starting flight_agent on port 10520..." +uv run python src/travel_agents/flight_agent.py & +PIDS+=($!) 
+ +for PID in "${PIDS[@]}"; do + wait "$PID" +done diff --git a/demos/filter_chains/http_filter/README.md b/demos/filter_chains/http_filter/README.md index 5e675113..86748217 100644 --- a/demos/filter_chains/http_filter/README.md +++ b/demos/filter_chains/http_filter/README.md @@ -41,23 +41,27 @@ export OPENAI_API_KEY="your-key" ./run_demo.sh ``` -This starts Plano natively and brings up via Docker Compose: -- Input Guards MCP server on port 10500 -- Query Rewriter MCP server on port 10501 -- Context Builder MCP server on port 10502 +This starts Plano natively and runs agents as local processes: +- Input Guards HTTP server on port 10500 +- Query Rewriter HTTP server on port 10501 +- Context Builder HTTP server on port 10502 - RAG Agent REST server on port 10505 -- Jaeger UI for viewing traces at http://localhost:16686 -- AnythingLLM at http://localhost:3001 for interactive queries Plano runs natively on the host (port 8001 and 12000). +To also start AnythingLLM (chat UI) and Jaeger (tracing): + +```bash +./run_demo.sh --with-ui +``` + +This additionally starts: +- Jaeger UI for viewing traces at http://localhost:16686 +- AnythingLLM at http://localhost:3001 for interactive queries + ### 2. Test the system -**Option A: Using AnythingLLM (recommended)** - -Navigate to http://localhost:3001 and send queries through the chat interface. 
- -**Option B: Using curl** +**Option A: Using curl (recommended)** ```bash curl -X POST http://localhost:8001/v1/chat/completions \ -H "Content-Type: application/json" \ diff --git a/demos/filter_chains/http_filter/config.yaml b/demos/filter_chains/http_filter/config.yaml index 117931e2..014a141a 100644 --- a/demos/filter_chains/http_filter/config.yaml +++ b/demos/filter_chains/http_filter/config.yaml @@ -2,23 +2,23 @@ version: v0.3.0 agents: - id: rag_agent - url: http://rag-agents:10505 + url: http://localhost:10505 filters: - id: input_guards - url: http://rag-agents:10500 + url: http://localhost:10500 type: http # type: mcp (default) # transport: streamable-http (default) # tool: input_guards (default - same as filter id) - id: query_rewriter - url: http://rag-agents:10501 + url: http://localhost:10501 type: http # type: mcp (default) # transport: streamable-http (default) # tool: query_rewriter (default - same as filter id) - id: context_builder - url: http://rag-agents:10502 + url: http://localhost:10502 type: http model_providers: diff --git a/demos/filter_chains/http_filter/docker-compose.yaml b/demos/filter_chains/http_filter/docker-compose.yaml index 64962bce..0361926c 100644 --- a/demos/filter_chains/http_filter/docker-compose.yaml +++ b/demos/filter_chains/http_filter/docker-compose.yaml @@ -1,16 +1,4 @@ services: - rag-agents: - build: - context: . 
- dockerfile: Dockerfile - ports: - - "10500:10500" - - "10501:10501" - - "10502:10502" - - "10505:10505" - environment: - - LLM_GATEWAY_ENDPOINT=${LLM_GATEWAY_ENDPOINT:-http://host.docker.internal:12000/v1} - - OPENAI_API_KEY=${OPENAI_API_KEY:?OPENAI_API_KEY environment variable is required but not set} jaeger: build: context: ../../shared/jaeger @@ -32,3 +20,5 @@ services: - GENERIC_OPEN_AI_MODEL_PREF=gpt-4o-mini - GENERIC_OPEN_AI_MODEL_TOKEN_LIMIT=128000 - GENERIC_OPEN_AI_API_KEY=sk-placeholder + extra_hosts: + - "host.docker.internal:host-gateway" diff --git a/demos/filter_chains/http_filter/run_demo.sh b/demos/filter_chains/http_filter/run_demo.sh index bed84f16..f203f5b1 100755 --- a/demos/filter_chains/http_filter/run_demo.sh +++ b/demos/filter_chains/http_filter/run_demo.sh @@ -18,22 +18,32 @@ start_demo() { echo ".env file created with OPENAI_API_KEY." fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (AnythingLLM, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM, Jaeger)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - # Step 4: Start services - echo "Starting services using Docker Compose..." - docker compose up -d + # Step 5: Start agents natively + echo "Starting agents..." + bash start_agents.sh & } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping Docker Compose services..." - docker compose down + # Stop agents + echo "Stopping agents..." + pkill -f start_agents.sh 2>/dev/null || true - # Step 2: Stop Plano + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true + + # Stop Plano echo "Stopping Plano..." 
planoai down } @@ -42,5 +52,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - start_demo + start_demo "$1" fi diff --git a/demos/filter_chains/http_filter/start_agents.sh b/demos/filter_chains/http_filter/start_agents.sh old mode 100644 new mode 100755 index 06cabeec..8dfdc0f4 --- a/demos/filter_chains/http_filter/start_agents.sh +++ b/demos/filter_chains/http_filter/start_agents.sh @@ -1,78 +1,38 @@ -# #!/bin/bash -# set -e - -# WAIT_FOR_PIDS=() - -# log() { -# timestamp=$(python3 -c 'from datetime import datetime; print(datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:23])') -# message="$*" -# echo "$timestamp - $message" -# } - -# cleanup() { -# log "Caught signal, terminating all user processes ..." -# for PID in "${WAIT_FOR_PIDS[@]}"; do -# if kill $PID 2> /dev/null; then -# log "killed process: $PID" -# fi -# done -# exit 1 -# } - -# trap cleanup EXIT - -# log "Starting input_guards agent on port 10500/mcp..." -# uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10500 --agent input_guards & -# WAIT_FOR_PIDS+=($!) - -# log "Starting query_rewriter agent on port 10501/mcp..." -# uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10501 --agent query_rewriter & -# WAIT_FOR_PIDS+=($!) - -# log "Starting context_builder agent on port 10502/mcp..." -# uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10502 --agent context_builder & -# WAIT_FOR_PIDS+=($!) - -# # log "Starting response_generator agent on port 10400..." -# # uv run python -m rag_agent --host 0.0.0.0 --port 10400 --agent response_generator & -# # WAIT_FOR_PIDS+=($!) - -# log "Starting response_generator agent on port 10505..." -# uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10505 --agent response_generator & -# WAIT_FOR_PIDS+=($!) 
- -# for PID in "${WAIT_FOR_PIDS[@]}"; do -# wait "$PID" -# done - - - - #!/bin/bash set -e -export PYTHONPATH=/app/src - -pids=() +PIDS=() log() { echo "$(date '+%F %T') - $*"; } -log "Starting input_guards HTTP server on :10500" +cleanup() { + log "Stopping agents..." + for PID in "${PIDS[@]}"; do + kill $PID 2>/dev/null && log "Stopped process $PID" + done + exit 0 +} + +trap cleanup EXIT INT TERM + +export PYTHONPATH=./src + +log "Starting input_guards HTTP server on port 10500..." uv run uvicorn rag_agent.input_guards:app --host 0.0.0.0 --port 10500 & -pids+=($!) +PIDS+=($!) -log "Starting query_rewriter HTTP server on :10501" +log "Starting query_rewriter HTTP server on port 10501..." uv run uvicorn rag_agent.query_rewriter:app --host 0.0.0.0 --port 10501 & -pids+=($!) +PIDS+=($!) -log "Starting context_builder HTTP server on :10502" +log "Starting context_builder HTTP server on port 10502..." uv run uvicorn rag_agent.context_builder:app --host 0.0.0.0 --port 10502 & -pids+=($!) +PIDS+=($!) -log "Starting response_generator (OpenAI-compatible) on :10505" +log "Starting response_generator (OpenAI-compatible) on port 10505..." uv run uvicorn rag_agent.rag_agent:app --host 0.0.0.0 --port 10505 & -pids+=($!) +PIDS+=($!) 
-for PID in "${pids[@]}"; do +for PID in "${PIDS[@]}"; do wait "$PID" done diff --git a/demos/filter_chains/mcp_filter/README.md b/demos/filter_chains/mcp_filter/README.md index 5e675113..798015e2 100644 --- a/demos/filter_chains/mcp_filter/README.md +++ b/demos/filter_chains/mcp_filter/README.md @@ -41,23 +41,27 @@ export OPENAI_API_KEY="your-key" ./run_demo.sh ``` -This starts Plano natively and brings up via Docker Compose: +This starts Plano natively and runs agents as local processes: - Input Guards MCP server on port 10500 - Query Rewriter MCP server on port 10501 - Context Builder MCP server on port 10502 - RAG Agent REST server on port 10505 -- Jaeger UI for viewing traces at http://localhost:16686 -- AnythingLLM at http://localhost:3001 for interactive queries Plano runs natively on the host (port 8001 and 12000). +To also start AnythingLLM (chat UI) and Jaeger (tracing): + +```bash +./run_demo.sh --with-ui +``` + +This additionally starts: +- Jaeger UI for viewing traces at http://localhost:16686 +- AnythingLLM at http://localhost:3001 for interactive queries + ### 2. Test the system -**Option A: Using AnythingLLM (recommended)** - -Navigate to http://localhost:3001 and send queries through the chat interface. - -**Option B: Using curl** +**Option A: Using curl (recommended)** ```bash curl -X POST http://localhost:8001/v1/chat/completions \ -H "Content-Type: application/json" \ diff --git a/demos/filter_chains/mcp_filter/docker-compose.yaml b/demos/filter_chains/mcp_filter/docker-compose.yaml index 64962bce..0361926c 100644 --- a/demos/filter_chains/mcp_filter/docker-compose.yaml +++ b/demos/filter_chains/mcp_filter/docker-compose.yaml @@ -1,16 +1,4 @@ services: - rag-agents: - build: - context: . 
- dockerfile: Dockerfile - ports: - - "10500:10500" - - "10501:10501" - - "10502:10502" - - "10505:10505" - environment: - - LLM_GATEWAY_ENDPOINT=${LLM_GATEWAY_ENDPOINT:-http://host.docker.internal:12000/v1} - - OPENAI_API_KEY=${OPENAI_API_KEY:?OPENAI_API_KEY environment variable is required but not set} jaeger: build: context: ../../shared/jaeger @@ -32,3 +20,5 @@ services: - GENERIC_OPEN_AI_MODEL_PREF=gpt-4o-mini - GENERIC_OPEN_AI_MODEL_TOKEN_LIMIT=128000 - GENERIC_OPEN_AI_API_KEY=sk-placeholder + extra_hosts: + - "host.docker.internal:host-gateway" diff --git a/demos/filter_chains/mcp_filter/run_demo.sh b/demos/filter_chains/mcp_filter/run_demo.sh index bed84f16..f203f5b1 100755 --- a/demos/filter_chains/mcp_filter/run_demo.sh +++ b/demos/filter_chains/mcp_filter/run_demo.sh @@ -18,22 +18,32 @@ start_demo() { echo ".env file created with OPENAI_API_KEY." fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (AnythingLLM, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM, Jaeger)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - # Step 4: Start services - echo "Starting services using Docker Compose..." - docker compose up -d + # Step 5: Start agents natively + echo "Starting agents..." + bash start_agents.sh & } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping Docker Compose services..." - docker compose down + # Stop agents + echo "Stopping agents..." + pkill -f start_agents.sh 2>/dev/null || true - # Step 2: Stop Plano + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true + + # Stop Plano echo "Stopping Plano..." 
planoai down } @@ -42,5 +52,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - start_demo + start_demo "$1" fi diff --git a/demos/getting_started/llm_gateway/README.md b/demos/getting_started/llm_gateway/README.md index e87467fc..b29397b6 100644 --- a/demos/getting_started/llm_gateway/README.md +++ b/demos/getting_started/llm_gateway/README.md @@ -7,7 +7,14 @@ This demo shows how you can use Plano gateway to manage keys and route to upstre ```sh sh run_demo.sh ``` -1. Navigate to http://localhost:3001/ +1. Test with curl (see example below) + +To also start the AnythingLLM chat UI and Jaeger tracing, pass `--with-ui`: +```sh +sh run_demo.sh --with-ui +``` + +Then navigate to http://localhost:3001/ for AnythingLLM. Following screen shows an example of interaction with Plano gateway showing dynamic routing. You can select between different LLMs using "override model" option in the chat UI. @@ -47,7 +54,7 @@ $ curl --header 'Content-Type: application/json' \ ``` # Observability -For tracing you can head over to http://localhost:16686/ to view recent traces. +For tracing, start with `--with-ui` and head over to http://localhost:16686/ to view recent traces. Following is a screenshot of tracing UI showing call received by Plano gateway and making upstream call to LLM, diff --git a/demos/getting_started/llm_gateway/run_demo.sh b/demos/getting_started/llm_gateway/run_demo.sh index b049bf31..e430a1cd 100644 --- a/demos/getting_started/llm_gateway/run_demo.sh +++ b/demos/getting_started/llm_gateway/run_demo.sh @@ -18,22 +18,24 @@ start_demo() { echo ".env file created with OPENAI_API_KEY." fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (AnythingLLM, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM, Jaeger)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." 
planoai up config.yaml - - # Step 4: Start LLM Routing - echo "Starting LLM Routing using Docker Compose..." - docker compose up -d # Run in detached mode } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping LLM Routing using Docker Compose..." - docker compose down + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true - # Step 2: Stop Plano + # Stop Plano echo "Stopping Plano..." planoai down } @@ -42,6 +44,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - # Default action is to bring the demo up - start_demo + start_demo "$1" fi diff --git a/demos/getting_started/weather_forecast/README.md b/demos/getting_started/weather_forecast/README.md index 8a9eb6c0..91fa810f 100644 --- a/demos/getting_started/weather_forecast/README.md +++ b/demos/getting_started/weather_forecast/README.md @@ -10,15 +10,26 @@ This demo shows how you can use Plano's core function calling capabilities. 3. ```sh sh run_demo.sh ``` -4. Navigate to http://localhost:3001/ -5. You can type in queries like "how is the weather?" +4. Test with curl: + ```sh + curl http://localhost:10000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "gpt-4o", "messages": [{"role": "user", "content": "how is the weather in San Francisco?"}]}' + ``` Here is a sample interaction, image -## Tracing +## Using the Chat UI and Tracing (optional) -To see a tracing dashboard, navigate to http://localhost:16686/ to open Jaeger UI. 
+To start AnythingLLM (chat UI) and other optional services, pass `--with-ui`: + +```sh +sh run_demo.sh --with-ui +``` + +- Navigate to http://localhost:3001/ for AnythingLLM +- Navigate to http://localhost:16686/ for Jaeger tracing UI ### Stopping Demo diff --git a/demos/getting_started/weather_forecast/docker-compose.yaml b/demos/getting_started/weather_forecast/docker-compose.yaml index 84074ab9..f36987e4 100644 --- a/demos/getting_started/weather_forecast/docker-compose.yaml +++ b/demos/getting_started/weather_forecast/docker-compose.yaml @@ -1,14 +1,4 @@ services: - weather_forecast_service: - build: - context: ./ - environment: - - OLTP_HOST=http://jaeger:4317 - extra_hosts: - - "host.docker.internal:host-gateway" - ports: - - "18083:80" - anythingllm: image: mintplexlabs/anythingllm restart: always diff --git a/demos/getting_started/weather_forecast/run_demo.sh b/demos/getting_started/weather_forecast/run_demo.sh index c8eb96e5..c77f2d83 100644 --- a/demos/getting_started/weather_forecast/run_demo.sh +++ b/demos/getting_started/weather_forecast/run_demo.sh @@ -72,23 +72,32 @@ start_demo() { exit 1 fi - # Step 4: Start Plano + # Step 4: Optionally start UI services (AnythingLLM, Jaeger, etc.) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ] || [ "$2" == "--with-ui" ]; then + echo "Starting UI services with $COMPOSE_FILE..." + docker compose -f "$COMPOSE_FILE" up -d + fi + + # Step 5: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - # Step 5: Start Network Agent with the chosen Docker Compose file - echo "Starting Network Agent with $COMPOSE_FILE..." - docker compose -f "$COMPOSE_FILE" up -d # Run in detached mode + # Step 6: Start agents natively + echo "Starting agents..." + bash start_agents.sh & } # Function to stop the demo stop_demo() { - echo "Stopping all Docker Compose services..." + # Stop agents + echo "Stopping agents..." 
+ pkill -f start_agents.sh 2>/dev/null || true - # Stop all services by iterating through all configurations + # Stop all Docker Compose services if running + echo "Stopping Docker Compose services..." for compose_file in ./docker-compose*.yaml; do - echo "Stopping services in $compose_file..." - docker compose -f "$compose_file" down + docker compose -f "$compose_file" down 2>/dev/null || true done # Stop Plano @@ -101,6 +110,6 @@ if [ "$1" == "down" ]; then # Call stop_demo with the second argument as the demo to stop stop_demo else - # Use the argument (jaeger, logfire, signoz) to determine the compose file - start_demo "$1" + # Use the argument (jaeger, logfire, signoz, --with-ui) to determine the compose file + start_demo "$1" "$2" fi diff --git a/demos/getting_started/weather_forecast/start_agents.sh b/demos/getting_started/weather_forecast/start_agents.sh new file mode 100755 index 00000000..548f2bf7 --- /dev/null +++ b/demos/getting_started/weather_forecast/start_agents.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -e + +PIDS=() + +log() { echo "$(date '+%F %T') - $*"; } + +cleanup() { + log "Stopping agents..." + for PID in "${PIDS[@]}"; do + kill $PID 2>/dev/null && log "Stopped process $PID" + done + exit 0 +} + +trap cleanup EXIT INT TERM + +log "Starting weather_forecast_service on port 18083..." +uv run uvicorn main:app --host 0.0.0.0 --port 18083 & +PIDS+=($!) + +for PID in "${PIDS[@]}"; do + wait "$PID" +done diff --git a/demos/integrations/ollama/run_demo.sh b/demos/integrations/ollama/run_demo.sh index 6623dee5..5bbf183b 100644 --- a/demos/integrations/ollama/run_demo.sh +++ b/demos/integrations/ollama/run_demo.sh @@ -7,33 +7,58 @@ start_demo() { if [ -f ".env" ]; then echo ".env file already exists. Skipping creation." else - # Step 2: Create `.env` file and set OpenAI key + # Step 2: Create `.env` file and set API keys if [ -z "$OPENAI_API_KEY" ]; then echo "Error: OPENAI_API_KEY environment variable is not set for the demo." 
exit 1 fi + if [ -z "$ANTHROPIC_API_KEY" ]; then + echo "Warning: ANTHROPIC_API_KEY environment variable is not set. Anthropic features may not work." + fi echo "Creating .env file..." echo "OPENAI_API_KEY=$OPENAI_API_KEY" > .env - echo ".env file created with OPENAI_API_KEY." + if [ -n "$ANTHROPIC_API_KEY" ]; then + echo "ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY" >> .env + fi + echo ".env file created with API keys." fi - # Step 3: Start Plano - echo "Starting Plano with config.yaml..." - planoai up config.yaml + # Step 3: Optionally start UI services (AnythingLLM, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM, Jaeger)..." + docker compose up -d + fi - # Step 4: Start developer services - echo "Starting Network Agent using Docker Compose..." - docker compose up -d # Run in detached mode + # Step 4: Start Plano + echo "Starting Plano with arch_config_with_aliases.yaml..." + planoai up arch_config_with_aliases.yaml + + echo "" + echo "Plano started successfully." + echo "Please run the following CURL command to test model alias routing. Additional instructions are in the README.md file." + echo "" + echo "curl -sS -X POST \"http://localhost:12000/v1/chat/completions\" \ + -H \"Authorization: Bearer test-key\" \ + -H \"Content-Type: application/json\" \ + -d '{ + \"model\": \"arch.summarize.v1\", + \"max_tokens\": 50, + \"messages\": [ + { \"role\": \"user\", + \"content\": \"Hello, please respond with exactly: Hello from alias arch.summarize.v1!\" + } + ] + }' | jq ." } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping Network Agent using Docker Compose..." - docker compose down + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true - # Step 2: Stop Plano + # Stop Plano echo "Stopping Plano..." 
planoai down } @@ -42,6 +67,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - # Default action is to bring the demo up - start_demo + start_demo "$1" fi diff --git a/demos/integrations/spotify_bearer_auth/run_demo.sh b/demos/integrations/spotify_bearer_auth/run_demo.sh index 6623dee5..e430a1cd 100644 --- a/demos/integrations/spotify_bearer_auth/run_demo.sh +++ b/demos/integrations/spotify_bearer_auth/run_demo.sh @@ -18,22 +18,24 @@ start_demo() { echo ".env file created with OPENAI_API_KEY." fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (AnythingLLM, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM, Jaeger)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - - # Step 4: Start developer services - echo "Starting Network Agent using Docker Compose..." - docker compose up -d # Run in detached mode } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping Network Agent using Docker Compose..." - docker compose down + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true - # Step 2: Stop Plano + # Stop Plano echo "Stopping Plano..." planoai down } @@ -42,6 +44,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - # Default action is to bring the demo up - start_demo + start_demo "$1" fi diff --git a/demos/llm_routing/preference_based_routing/README.md b/demos/llm_routing/preference_based_routing/README.md index 03d28cee..9d71971c 100644 --- a/demos/llm_routing/preference_based_routing/README.md +++ b/demos/llm_routing/preference_based_routing/README.md @@ -10,19 +10,27 @@ cd demos/llm_routing/preference_based_routing ./run_demo.sh ``` -Or manually: +To also start AnythingLLM (chat UI) and Jaeger (tracing): -1. 
Start Plano ```bash -planoai up config.yaml +./run_demo.sh --with-ui ``` -2. Start AnythingLLM +Then open AnythingLLM at http://localhost:3001/ + +Or start manually: + +1. (Optional) Start AnythingLLM and Jaeger ```bash docker compose up -d ``` -3. open AnythingLLM http://localhost:3001/ +2. Start Plano +```bash +planoai up config.yaml +``` + +3. Test with curl or open AnythingLLM http://localhost:3001/ # Testing out preference based routing diff --git a/demos/llm_routing/preference_based_routing/run_demo.sh b/demos/llm_routing/preference_based_routing/run_demo.sh index c9525c26..30e0c67b 100755 --- a/demos/llm_routing/preference_based_routing/run_demo.sh +++ b/demos/llm_routing/preference_based_routing/run_demo.sh @@ -24,22 +24,24 @@ start_demo() { echo ".env file created with API keys." fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (AnythingLLM, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM, Jaeger)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - - # Step 4: Start services - echo "Starting services using Docker Compose..." - docker compose up -d } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping Docker Compose services..." - docker compose down + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true - # Step 2: Stop Plano + # Stop Plano echo "Stopping Plano..." 
planoai down } @@ -48,5 +50,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - start_demo + start_demo "$1" fi diff --git a/tests/e2e/run_e2e_tests.sh b/tests/e2e/run_e2e_tests.sh index c24931f4..a164b7f9 100644 --- a/tests/e2e/run_e2e_tests.sh +++ b/tests/e2e/run_e2e_tests.sh @@ -21,10 +21,11 @@ trap 'print_debug' INT TERM ERR log starting > ../build.log -log building and running function_calling demo +log starting weather_forecast agent natively log =========================================== cd ../../demos/getting_started/weather_forecast/ -docker compose up weather_forecast_service --build -d +bash start_agents.sh & +AGENTS_PID=$! cd - log building and installing plano cli @@ -78,8 +79,6 @@ log running e2e tests for openai responses api client log ======================================== uv run pytest test_openai_responses_api_client_with_state.py -log shutting down the weather_forecast demo +log shutting down the weather_forecast agent log ======================================= -cd ../../demos/getting_started/weather_forecast -docker compose down -cd - +kill $AGENTS_PID 2>/dev/null || true diff --git a/tests/e2e/run_prompt_gateway_tests.sh b/tests/e2e/run_prompt_gateway_tests.sh index 58d850d8..1e947813 100755 --- a/tests/e2e/run_prompt_gateway_tests.sh +++ b/tests/e2e/run_prompt_gateway_tests.sh @@ -32,10 +32,11 @@ cd - # Re-sync e2e deps uv sync -# Start weather_forecast service (needed for prompt_gateway tests) -log "building and running weather_forecast service" +# Start weather_forecast service natively (needed for prompt_gateway tests) +log "starting weather_forecast agent natively" cd ../../demos/getting_started/weather_forecast/ -docker compose up weather_forecast_service --build -d +bash start_agents.sh & +AGENTS_PID=$! 
cd - # Start gateway with prompt_gateway config @@ -52,6 +53,4 @@ uv run pytest test_prompt_gateway.py # Cleanup log "shutting down" planoai down --docker || true -cd ../../demos/getting_started/weather_forecast -docker compose down -cd - +kill $AGENTS_PID 2>/dev/null || true From 5400b0a2fa476dfc629bf0dfab95e826a640a67b Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Wed, 11 Mar 2026 15:28:50 -0700 Subject: [PATCH 6/8] add instructions on hosting arch-router locally (#819) --- .../preference_based_routing/README.md | 31 +++++ docs/source/guides/llm_router.rst | 123 ++++++++++++++++++ 2 files changed, 154 insertions(+) diff --git a/demos/llm_routing/preference_based_routing/README.md b/demos/llm_routing/preference_based_routing/README.md index 9d71971c..009002fd 100644 --- a/demos/llm_routing/preference_based_routing/README.md +++ b/demos/llm_routing/preference_based_routing/README.md @@ -32,6 +32,37 @@ planoai up config.yaml 3. Test with curl or open AnythingLLM http://localhost:3001/ +## Running with local Arch-Router (via Ollama) + +By default, Plano uses a hosted Arch-Router endpoint. To self-host Arch-Router locally using Ollama: + +1. Install [Ollama](https://ollama.ai) and pull the model: +```bash +ollama pull hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M +``` + +2. Make sure Ollama is running (`ollama serve` or the macOS app). + +3. Start Plano with the local config: +```bash +planoai up plano_config_local.yaml +``` + +4. Test routing: +```bash +curl -s "http://localhost:12000/routing/v1/messages" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "Create a REST API endpoint in Rust using actix-web"} + ] + }' +``` + +You should see the router select the appropriate model based on the routing preferences defined in `plano_config_local.yaml`. + # Testing out preference based routing We have defined two routes 1. code generation and 2. 
code understanding diff --git a/docs/source/guides/llm_router.rst b/docs/source/guides/llm_router.rst index 188b1e30..41c51b4a 100644 --- a/docs/source/guides/llm_router.rst +++ b/docs/source/guides/llm_router.rst @@ -228,6 +228,129 @@ In summary, Arch-Router demonstrates: - **Production-Ready Performance**: Optimized for low-latency, high-throughput applications in multi-model environments. +Self-hosting Arch-Router +------------------------ + +By default, Plano uses a hosted Arch-Router endpoint. To run Arch-Router locally, you can serve the model yourself using either **Ollama** or **vLLM**. + +Using Ollama (recommended for local development) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +1. **Install Ollama** + + Download and install from `ollama.ai `_. + +2. **Pull and serve Arch-Router** + + .. code-block:: bash + + ollama pull hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M + ollama serve + + This downloads the quantized GGUF model from HuggingFace and starts serving on ``http://localhost:11434``. + +3. **Configure Plano to use local Arch-Router** + + .. code-block:: yaml + + routing: + model: Arch-Router + llm_provider: arch-router + + model_providers: + - name: arch-router + model: arch/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M + base_url: http://localhost:11434 + + - model: openai/gpt-5.2 + access_key: $OPENAI_API_KEY + default: true + + - model: anthropic/claude-sonnet-4-5 + access_key: $ANTHROPIC_API_KEY + routing_preferences: + - name: creative writing + description: creative content generation, storytelling, and writing assistance + +4. **Verify the model is running** + + .. code-block:: bash + + curl http://localhost:11434/v1/models + + You should see ``Arch-Router-1.5B`` listed in the response. + +Using vLLM (recommended for production / EC2) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +vLLM provides higher throughput and GPU optimizations suitable for production deployments. + +1. **Install vLLM** + + .. 
code-block:: bash + + pip install vllm + +2. **Download the model weights** + + The GGUF weights are downloaded automatically from HuggingFace on first use. To pre-download: + + .. code-block:: bash + + pip install huggingface_hub + huggingface-cli download katanemo/Arch-Router-1.5B.gguf + +3. **Start the vLLM server** + + After downloading, find the GGUF file and Jinja template in the HuggingFace cache: + + .. code-block:: bash + + # Find the downloaded files + SNAPSHOT_DIR=$(ls -d ~/.cache/huggingface/hub/models--katanemo--Arch-Router-1.5B.gguf/snapshots/*/ | head -1) + + vllm serve ${SNAPSHOT_DIR}Arch-Router-1.5B-Q4_K_M.gguf \ + --host 0.0.0.0 \ + --port 10000 \ + --load-format gguf \ + --chat-template ${SNAPSHOT_DIR}template.jinja \ + --tokenizer katanemo/Arch-Router-1.5B \ + --served-model-name Arch-Router \ + --gpu-memory-utilization 0.3 \ + --tensor-parallel-size 1 \ + --enable-prefix-caching + +4. **Configure Plano to use the vLLM endpoint** + + .. code-block:: yaml + + routing: + model: Arch-Router + llm_provider: arch-router + + model_providers: + - name: arch-router + model: Arch-Router + base_url: http://:10000 + + - model: openai/gpt-5.2 + access_key: $OPENAI_API_KEY + default: true + + - model: anthropic/claude-sonnet-4-5 + access_key: $ANTHROPIC_API_KEY + routing_preferences: + - name: creative writing + description: creative content generation, storytelling, and writing assistance + +5. **Verify the server is running** + + .. 
code-block:: bash + + curl http://localhost:10000/health + curl http://localhost:10000/v1/models + + Combining Routing Methods ------------------------- From 2f52774c0ecf6de6c9df657418020de468aef1a1 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Fri, 13 Mar 2026 00:18:41 -0700 Subject: [PATCH 7/8] Add Claude Code skills and streamline CLAUDE.md (#823) * add claude code skills and streamline CLAUDE.md * remove claude code attribution from PR skill * update pr skill --- .claude/skills/build-wasm/SKILL.md | 12 +++ .claude/skills/check/SKILL.md | 12 +++ .claude/skills/new-provider/SKILL.md | 17 +++ .claude/skills/pr/SKILL.md | 16 +++ .claude/skills/release/SKILL.md | 28 +++++ .claude/skills/test-python/SKILL.md | 9 ++ CLAUDE.md | 152 ++++++++++----------------- 7 files changed, 147 insertions(+), 99 deletions(-) create mode 100644 .claude/skills/build-wasm/SKILL.md create mode 100644 .claude/skills/check/SKILL.md create mode 100644 .claude/skills/new-provider/SKILL.md create mode 100644 .claude/skills/pr/SKILL.md create mode 100644 .claude/skills/release/SKILL.md create mode 100644 .claude/skills/test-python/SKILL.md diff --git a/.claude/skills/build-wasm/SKILL.md b/.claude/skills/build-wasm/SKILL.md new file mode 100644 index 00000000..dffff783 --- /dev/null +++ b/.claude/skills/build-wasm/SKILL.md @@ -0,0 +1,12 @@ +--- +name: build-wasm +description: Build the WASM plugins for Envoy. Use when WASM plugin code changes. +--- + +Build the WASM plugins: + +``` +cd crates && cargo build --release --target=wasm32-wasip1 -p llm_gateway -p prompt_gateway +``` + +If the build fails, diagnose and fix the errors. diff --git a/.claude/skills/check/SKILL.md b/.claude/skills/check/SKILL.md new file mode 100644 index 00000000..4d2427e2 --- /dev/null +++ b/.claude/skills/check/SKILL.md @@ -0,0 +1,12 @@ +--- +name: check +description: Run Rust fmt, clippy, and unit tests. Use after making Rust code changes. +--- + +Run all local checks in order: + +1. 
`cd crates && cargo fmt --all -- --check` — if formatting fails, run `cargo fmt --all` to fix it +2. `cd crates && cargo clippy --locked --all-targets --all-features -- -D warnings` — fix any warnings +3. `cd crates && cargo test --lib` — ensure all unit tests pass + +Report a summary of what passed/failed. diff --git a/.claude/skills/new-provider/SKILL.md b/.claude/skills/new-provider/SKILL.md new file mode 100644 index 00000000..74ba0f6d --- /dev/null +++ b/.claude/skills/new-provider/SKILL.md @@ -0,0 +1,17 @@ +--- +name: new-provider +description: Add a new LLM provider to hermesllm. Use when integrating a new AI provider. +disable-model-invocation: true +user-invocable: true +--- + +Add a new LLM provider to hermesllm. The user will provide the provider name as $ARGUMENTS. + +1. Add a new variant to `ProviderId` enum in `crates/hermesllm/src/providers/id.rs` +2. Implement string parsing in the `TryFrom<&str>` impl for the new provider +3. If the provider uses a non-OpenAI API format, create request/response types in `crates/hermesllm/src/apis/` +4. Add variant to `ProviderRequestType` and `ProviderResponseType` enums and update all match arms +5. Add model list to `crates/hermesllm/src/providers/provider_models.yaml` +6. Update `SupportedUpstreamAPIs` mapping if needed + +After making changes, run `cd crates && cargo test --lib` to verify everything compiles and tests pass. diff --git a/.claude/skills/pr/SKILL.md b/.claude/skills/pr/SKILL.md new file mode 100644 index 00000000..43e4b46f --- /dev/null +++ b/.claude/skills/pr/SKILL.md @@ -0,0 +1,16 @@ +--- +name: pr +description: Create a feature branch and open a pull request for the current changes. +disable-model-invocation: true +user-invocable: true +--- + +Create a pull request for the current changes: + +1. Determine the GitHub username via `gh api user --jq .login`. If the login is `adilhafeez`, use `adil` instead. +2. Create a feature branch using format `/` — infer the feature name from the changes +3. 
Run `cd crates && cargo fmt --all -- --check` and `cd crates && cargo clippy --locked --all-targets --all-features -- -D warnings` to verify Rust code is clean +4. Commit all changes with a short, concise commit message (one line, no Co-Authored-By) +5. Push the branch and create a PR targeting `main` + +Keep the PR title short (under 70 chars). Include a brief summary in the body. Never include a "Test plan" section or any "Generated with Claude Code" attribution. diff --git a/.claude/skills/release/SKILL.md b/.claude/skills/release/SKILL.md new file mode 100644 index 00000000..80510004 --- /dev/null +++ b/.claude/skills/release/SKILL.md @@ -0,0 +1,28 @@ +--- +name: release +description: Bump the Plano version across all required files. Use when preparing a release. +disable-model-invocation: true +user-invocable: true +--- + +Prepare a release version bump. The user may provide the new version number as $ARGUMENTS (e.g., `/release 0.4.12`), or a bump type (`major`, `minor`, `patch`). + +If no argument is provided, read the current version from `cli/planoai/__init__.py`, auto-increment the patch version (e.g., `0.4.11` → `0.4.12`), and confirm with the user before proceeding. + +Update the version string in ALL of these files: + +- `.github/workflows/ci.yml` +- `cli/planoai/__init__.py` +- `cli/planoai/consts.py` +- `cli/pyproject.toml` +- `build_filter_image.sh` +- `config/validate_plano_config.sh` +- `docs/source/conf.py` +- `docs/source/get_started/quickstart.rst` +- `docs/source/resources/deployment.rst` +- `apps/www/src/components/Hero.tsx` +- `demos/llm_routing/preference_based_routing/README.md` + +Do NOT change version strings in `*.lock` files or `Cargo.lock`. + +After making changes, show a summary of all files modified and the old → new version. 
diff --git a/.claude/skills/test-python/SKILL.md b/.claude/skills/test-python/SKILL.md new file mode 100644 index 00000000..2aa40ded --- /dev/null +++ b/.claude/skills/test-python/SKILL.md @@ -0,0 +1,9 @@ +--- +name: test-python +description: Run Python CLI tests. Use after making changes to cli/ code. +--- + +1. `cd cli && uv sync` — ensure dependencies are installed +2. `cd cli && uv run pytest -v` — run all tests + +If tests fail, diagnose and fix the issues. diff --git a/CLAUDE.md b/CLAUDE.md index 71c94303..58b2191f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,152 +1,106 @@ # CLAUDE.md -This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. - -## Project Overview - Plano is an AI-native proxy server and data plane for agentic applications, built on Envoy proxy. It centralizes agent orchestration, LLM routing, observability, and safety guardrails as an out-of-process dataplane. ## Build & Test Commands -### Rust (crates/) - ```bash -# Build WASM plugins (must target wasm32-wasip1) +# Rust — WASM plugins (must target wasm32-wasip1) cd crates && cargo build --release --target=wasm32-wasip1 -p llm_gateway -p prompt_gateway -# Build brightstaff binary (native target) +# Rust — brightstaff binary (native target) cd crates && cargo build --release -p brightstaff -# Run unit tests +# Rust — tests, format, lint cd crates && cargo test --lib - -# Format check cd crates && cargo fmt --all -- --check - -# Lint cd crates && cargo clippy --locked --all-targets --all-features -- -D warnings -``` -### Python CLI (cli/) +# Python CLI +cd cli && uv sync && uv run pytest -v -```bash -cd cli && uv sync # Install dependencies -cd cli && uv run pytest -v # Run tests -cd cli && uv run planoai --help # Run CLI -``` +# JS/TS (Turbo monorepo) +npm run build && npm run lint && npm run typecheck -### JavaScript/TypeScript (apps/, packages/) - -```bash -npm run build # Build all (via Turbo) -npm run lint # Lint all -npm run dev # Dev servers 
-npm run typecheck # Type check -``` - -### Pre-commit (runs fmt, clippy, cargo test, black, yaml checks) - -```bash +# Pre-commit (fmt, clippy, cargo test, black, yaml) pre-commit run --all-files -``` -### Docker - -```bash +# Docker docker build -t katanemo/plano:latest . ``` -### E2E Tests (tests/e2e/) - -E2E tests require a built Docker image and API keys. They run via `tests/e2e/run_e2e_tests.sh` which executes four test suites: `test_prompt_gateway.py`, `test_model_alias_routing.py`, `test_openai_responses_api_client.py`, and `test_openai_responses_api_client_with_state.py`. +E2E tests require a Docker image and API keys: `tests/e2e/run_e2e_tests.sh` ## Architecture -### Core Data Flow - -Requests flow through Envoy proxy with two WASM filter plugins, backed by a native Rust binary: - ``` Client → Envoy (prompt_gateway.wasm → llm_gateway.wasm) → Agents/LLM Providers ↕ brightstaff (native binary: state, routing, signals, tracing) ``` -### Rust Crates (crates/) +### Crates (crates/) -All crates share a Cargo workspace. Two compile to `wasm32-wasip1` for Envoy, the rest are native: - -- **prompt_gateway** (WASM) — Proxy-WASM filter for prompt/message processing, guardrails, and filter chains +- **prompt_gateway** (WASM) — Proxy-WASM filter for prompt processing, guardrails, filter chains - **llm_gateway** (WASM) — Proxy-WASM filter for LLM request/response handling and routing -- **brightstaff** (native binary) — Core application server: handlers, router, signals, state management, tracing -- **common** (library) — Shared across all crates: configuration, LLM provider abstractions, HTTP utilities, routing logic, rate limiting, tokenizer, PII detection, tracing -- **hermesllm** (library) — Translates LLM API formats between providers (OpenAI, Anthropic, Gemini, Mistral, Grok, AWS Bedrock, Azure, together.ai). 
Key types: `ProviderId`, `ProviderRequest`, `ProviderResponse`, `ProviderStreamResponse` +- **brightstaff** (native) — Core server: handlers, router, signals, state, tracing +- **common** (lib) — Shared: config, HTTP, routing, rate limiting, tokenizer, PII, tracing +- **hermesllm** (lib) — LLM API translation between providers. Key types: `ProviderId`, `ProviderRequest`, `ProviderResponse`, `ProviderStreamResponse` ### Python CLI (cli/planoai/) -The `planoai` CLI manages the Plano lifecycle. Key commands: -- `planoai up ` — Validate config, check API keys, start Docker container -- `planoai down` — Stop container -- `planoai build` — Build Docker image from repo root -- `planoai logs` — Stream access/debug logs -- `planoai trace` — OTEL trace collection and analysis -- `planoai init` — Initialize new project -- `planoai cli_agent` — Start a CLI agent connected to Plano -- `planoai generate_prompt_targets` — Generate prompt_targets from python methods +Entry point: `main.py`. Built with `rich-click`. Commands: `up`, `down`, `build`, `logs`, `trace`, `init`, `cli_agent`, `generate_prompt_targets`. -Entry point: `cli/planoai/main.py`. Container lifecycle in `core.py`. Docker operations in `docker_cli.py`. +### Config (config/) -### Configuration System (config/) +- `plano_config_schema.yaml` — JSON Schema for validating user configs +- `envoy.template.yaml` — Jinja2 template → Envoy config +- `supervisord.conf` — Process supervisor for Envoy + brightstaff -- `plano_config_schema.yaml` — JSON Schema (draft-07) for validating user config files -- `envoy.template.yaml` — Jinja2 template rendered into Envoy proxy config -- `supervisord.conf` — Process supervisor for Envoy + brightstaff in the container +### JS Apps (apps/, packages/) -User configs define: `agents` (id + url), `model_providers` (model + access_key), `listeners` (type: agent/model/prompt, with router strategy), `filters` (filter chains), and `tracing` settings. +Turbo monorepo with Next.js 16 / React 19. 
Not part of the core proxy. -### JavaScript Apps (apps/, packages/) +## WASM Plugin Rules -Turbo monorepo with Next.js 16 / React 19 applications and shared packages (UI components, Tailwind config, TypeScript config). Not part of the core proxy — these are web applications. +Code in `prompt_gateway` and `llm_gateway` runs in Envoy's WASM sandbox: + +- **No std networking/filesystem** — use proxy-wasm host calls only +- **No tokio/async** — synchronous, callback-driven. `Action::Pause` / `Action::Continue` for flow control +- **Lifecycle**: `RootContext` → `on_configure`, `create_http_context`; `HttpContext` → `on_http_request/response_headers/body` +- **HTTP callouts**: `dispatch_http_call()` → store context in `callouts: RefCell>` → match in `on_http_call_response()` +- **Config**: `Rc`-wrapped, loaded once in `on_configure()` via `serde_yaml::from_slice()` +- **Dependencies must be no_std compatible** (e.g., `governor` with `features = ["no_std"]`) +- **Crate type**: `cdylib` → produces `.wasm` + +## Adding a New LLM Provider + +1. Add variant to `ProviderId` in `crates/hermesllm/src/providers/id.rs` + `TryFrom<&str>` +2. Create request/response types in `crates/hermesllm/src/apis/` if non-OpenAI format +3. Add variant to `ProviderRequestType`/`ProviderResponseType` enums, update all match arms +4. Add models to `crates/hermesllm/src/providers/provider_models.yaml` +5. 
Update `SupportedUpstreamAPIs` mapping if needed ## Release Process -To prepare a release (e.g., bumping from `0.4.6` to `0.4.7`), update the version string in all of the following files: +Update version (e.g., `0.4.11` → `0.4.12`) in all of these files: -**CI Workflow:** -- `.github/workflows/ci.yml` — docker build/save tags +- `.github/workflows/ci.yml`, `build_filter_image.sh`, `config/validate_plano_config.sh` +- `cli/planoai/__init__.py`, `cli/planoai/consts.py`, `cli/pyproject.toml` +- `docs/source/conf.py`, `docs/source/get_started/quickstart.rst`, `docs/source/resources/deployment.rst` +- `apps/www/src/components/Hero.tsx`, `demos/llm_routing/preference_based_routing/README.md` -**CLI:** -- `cli/planoai/__init__.py` — `__version__` -- `cli/planoai/consts.py` — `PLANO_DOCKER_IMAGE` default -- `cli/pyproject.toml` — `version` - -**Build & Config:** -- `build_filter_image.sh` — docker build tag -- `config/validate_plano_config.sh` — docker image tag - -**Docs:** -- `docs/source/conf.py` — `release` -- `docs/source/get_started/quickstart.rst` — install commands and example output -- `docs/source/resources/deployment.rst` — docker image tag - -**Website & Demos:** -- `apps/www/src/components/Hero.tsx` — version badge -- `demos/llm_routing/preference_based_routing/README.md` — example output - -**Important:** Do NOT change `0.4.6` references in `*.lock` files or `Cargo.lock` — those refer to the `colorama` and `http-body` dependency versions, not Plano. - -Commit message format: `release X.Y.Z` +Do NOT change version strings in `*.lock` files or `Cargo.lock`. Commit message: `release X.Y.Z` ## Workflow Preferences -- **Git commits:** Do NOT add `Co-Authored-By` lines. Keep commit messages short and concise (one line, no verbose descriptions). NEVER commit and push directly to `main`—always use a feature branch and PR. -- **Git branches:** Use the format `/` when creating branches for PRs. Determine the username from `gh api user --jq .login`. 
-- **GitHub issues:** When a GitHub issue URL is pasted, fetch all requirements and context from the issue first. The end goal is always a PR with all tests passing. +- **Commits:** No `Co-Authored-By`. Short one-line messages. Never push directly to `main` — always feature branch + PR. +- **Branches:** Use `adil/` format. +- **Issues:** When a GitHub issue URL is pasted, fetch all context first. Goal is always a PR with passing tests. ## Key Conventions -- Rust edition 2021, formatted with `cargo fmt`, linted with `cargo clippy -D warnings` -- Python formatted with Black -- WASM plugins must target `wasm32-wasip1` — they run inside Envoy, not as native binaries -- The Docker image bundles Envoy + WASM plugins + brightstaff + Python CLI into a single container managed by supervisord -- API keys come from environment variables or `.env` files, never hardcoded +- Rust edition 2021, `cargo fmt`, `cargo clippy -D warnings` +- Python: Black. Rust errors: `thiserror` with `#[from]` +- API keys from env vars or `.env`, never hardcoded +- Provider dispatch: `ProviderRequestType`/`ProviderResponseType` enums implementing `ProviderRequest`/`ProviderResponse` traits From 785bf7e021297665b95d51933412b874e1e6a2f5 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Fri, 13 Mar 2026 00:28:35 -0700 Subject: [PATCH 8/8] add build-cli and build-brightstaff skills (#824) --- .claude/skills/build-brightstaff/SKILL.md | 12 ++++++++++++ .claude/skills/build-cli/SKILL.md | 10 ++++++++++ 2 files changed, 22 insertions(+) create mode 100644 .claude/skills/build-brightstaff/SKILL.md create mode 100644 .claude/skills/build-cli/SKILL.md diff --git a/.claude/skills/build-brightstaff/SKILL.md b/.claude/skills/build-brightstaff/SKILL.md new file mode 100644 index 00000000..6fc97b19 --- /dev/null +++ b/.claude/skills/build-brightstaff/SKILL.md @@ -0,0 +1,12 @@ +--- +name: build-brightstaff +description: Build the brightstaff native binary. Use when brightstaff code changes. 
+--- + +Build brightstaff: + +``` +cd crates && cargo build --release -p brightstaff +``` + +If the build fails, diagnose and fix the errors. diff --git a/.claude/skills/build-cli/SKILL.md b/.claude/skills/build-cli/SKILL.md new file mode 100644 index 00000000..0e2aec7f --- /dev/null +++ b/.claude/skills/build-cli/SKILL.md @@ -0,0 +1,10 @@ +--- +name: build-cli +description: Build and install the Python CLI (planoai). Use after making changes to cli/ code to install locally. +--- + +1. `cd cli && uv sync` — ensure dependencies are installed +2. `cd cli && uv tool install --editable .` — install the CLI locally +3. Verify the installation: `cd cli && uv run planoai --help` + +If the build or install fails, diagnose and fix the issues.