From 028a2cd19661d82e912a7ed02177fe1f3399d4bf Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Mon, 9 Mar 2026 16:32:16 -0700 Subject: [PATCH 1/8] add routing service (#814) fixes https://github.com/katanemo/plano/issues/810 --- crates/brightstaff/src/handlers/mod.rs | 1 + .../brightstaff/src/handlers/router_chat.rs | 9 +- .../src/handlers/routing_service.rs | 163 ++++++++++++++++++ crates/brightstaff/src/main.rs | 23 ++- .../model_routing_service/README.md | 92 ++++++++++ .../model_routing_service/config.yaml | 27 +++ .../llm_routing/model_routing_service/demo.sh | 65 +++++++ 7 files changed, 375 insertions(+), 5 deletions(-) create mode 100644 crates/brightstaff/src/handlers/routing_service.rs create mode 100644 demos/llm_routing/model_routing_service/README.md create mode 100644 demos/llm_routing/model_routing_service/config.yaml create mode 100755 demos/llm_routing/model_routing_service/demo.sh diff --git a/crates/brightstaff/src/handlers/mod.rs b/crates/brightstaff/src/handlers/mod.rs index 0bbd3454..9c602e93 100644 --- a/crates/brightstaff/src/handlers/mod.rs +++ b/crates/brightstaff/src/handlers/mod.rs @@ -7,6 +7,7 @@ pub mod models; pub mod pipeline_processor; pub mod response_handler; pub mod router_chat; +pub mod routing_service; pub mod utils; #[cfg(test)] diff --git a/crates/brightstaff/src/handlers/router_chat.rs b/crates/brightstaff/src/handlers/router_chat.rs index d71734fa..345632fc 100644 --- a/crates/brightstaff/src/handlers/router_chat.rs +++ b/crates/brightstaff/src/handlers/router_chat.rs @@ -10,6 +10,7 @@ use crate::tracing::routing; pub struct RoutingResult { pub model_name: String, + pub route_name: Option, } pub struct RoutingError { @@ -133,9 +134,12 @@ pub async fn router_chat_get_upstream_model( match routing_result { Ok(route) => match route { - Some((_, model_name)) => { + Some((route_name, model_name)) => { current_span.record("route.selected_model", model_name.as_str()); - Ok(RoutingResult { model_name }) + Ok(RoutingResult { + 
model_name, + route_name: Some(route_name), + }) } None => { // No route determined, return sentinel value "none" @@ -145,6 +149,7 @@ pub async fn router_chat_get_upstream_model( Ok(RoutingResult { model_name: "none".to_string(), + route_name: None, }) } }, diff --git a/crates/brightstaff/src/handlers/routing_service.rs b/crates/brightstaff/src/handlers/routing_service.rs new file mode 100644 index 00000000..32f37a08 --- /dev/null +++ b/crates/brightstaff/src/handlers/routing_service.rs @@ -0,0 +1,163 @@ +use bytes::Bytes; +use common::configuration::SpanAttributes; +use common::consts::{REQUEST_ID_HEADER, TRACE_PARENT_HEADER}; +use common::errors::BrightStaffError; +use hermesllm::clients::SupportedAPIsFromClient; +use hermesllm::ProviderRequestType; +use http_body_util::combinators::BoxBody; +use http_body_util::{BodyExt, Full}; +use hyper::{Request, Response, StatusCode}; +use std::sync::Arc; +use tracing::{debug, info, info_span, warn, Instrument}; + +use crate::handlers::router_chat::router_chat_get_upstream_model; +use crate::router::llm_router::RouterService; +use crate::tracing::{collect_custom_trace_attributes, operation_component, set_service_name}; + +#[derive(serde::Serialize)] +struct RoutingDecisionResponse { + model: String, + route: Option, + trace_id: String, +} + +pub async fn routing_decision( + request: Request, + router_service: Arc, + request_path: String, + span_attributes: Arc>, +) -> Result>, hyper::Error> { + let request_headers = request.headers().clone(); + let request_id: String = request_headers + .get(REQUEST_ID_HEADER) + .and_then(|h| h.to_str().ok()) + .map(|s| s.to_string()) + .unwrap_or_else(|| uuid::Uuid::new_v4().to_string()); + + let custom_attrs = + collect_custom_trace_attributes(&request_headers, span_attributes.as_ref().as_ref()); + + let request_span = info_span!( + "routing_decision", + component = "routing", + request_id = %request_id, + http.method = %request.method(), + http.path = %request_path, + ); + + 
routing_decision_inner( + request, + router_service, + request_id, + request_path, + request_headers, + custom_attrs, + ) + .instrument(request_span) + .await +} + +async fn routing_decision_inner( + request: Request, + router_service: Arc, + request_id: String, + request_path: String, + request_headers: hyper::HeaderMap, + custom_attrs: std::collections::HashMap, +) -> Result>, hyper::Error> { + set_service_name(operation_component::ROUTING); + opentelemetry::trace::get_active_span(|span| { + for (key, value) in &custom_attrs { + span.set_attribute(opentelemetry::KeyValue::new(key.clone(), value.clone())); + } + }); + + // Extract or generate traceparent + let traceparent: String = match request_headers + .get(TRACE_PARENT_HEADER) + .and_then(|h| h.to_str().ok()) + .map(|s| s.to_string()) + { + Some(tp) => tp, + None => { + let trace_id = uuid::Uuid::new_v4().to_string().replace("-", ""); + let generated_tp = format!("00-{}-0000000000000000-01", trace_id); + warn!( + generated_traceparent = %generated_tp, + "TRACE_PARENT header missing, generated new traceparent" + ); + generated_tp + } + }; + + // Extract trace_id from traceparent (format: 00-{trace_id}-{span_id}-{flags}) + let trace_id = traceparent + .split('-') + .nth(1) + .unwrap_or("unknown") + .to_string(); + + // Parse request body + let chat_request_bytes = request.collect().await?.to_bytes(); + + debug!( + body = %String::from_utf8_lossy(&chat_request_bytes), + "routing decision request body received" + ); + + let client_request = match ProviderRequestType::try_from(( + &chat_request_bytes[..], + &SupportedAPIsFromClient::from_endpoint(request_path.as_str()).unwrap(), + )) { + Ok(request) => request, + Err(err) => { + warn!(error = %err, "failed to parse request for routing decision"); + return Ok(BrightStaffError::InvalidRequest(format!( + "Failed to parse request: {}", + err + )) + .into_response()); + } + }; + + // Call the existing routing logic + let routing_result = router_chat_get_upstream_model( 
+ router_service, + client_request, + &traceparent, + &request_path, + &request_id, + ) + .await; + + match routing_result { + Ok(result) => { + let response = RoutingDecisionResponse { + model: result.model_name, + route: result.route_name, + trace_id, + }; + + info!( + model = %response.model, + route = ?response.route, + "routing decision completed" + ); + + let json = serde_json::to_string(&response).unwrap(); + let body = Full::new(Bytes::from(json)) + .map_err(|never| match never {}) + .boxed(); + + Ok(Response::builder() + .status(StatusCode::OK) + .header("Content-Type", "application/json") + .body(body) + .unwrap()) + } + Err(err) => { + warn!(error = %err.message, "routing decision failed"); + Ok(BrightStaffError::InternalServerError(err.message).into_response()) + } + } +} diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs index 97345556..51c9127f 100644 --- a/crates/brightstaff/src/main.rs +++ b/crates/brightstaff/src/main.rs @@ -2,6 +2,7 @@ use brightstaff::handlers::agent_chat_completions::agent_chat; use brightstaff::handlers::function_calling::function_calling_chat_handler; use brightstaff::handlers::llm::llm_chat; use brightstaff::handlers::models::list_models; +use brightstaff::handlers::routing_service::routing_decision; use brightstaff::router::llm_router::RouterService; use brightstaff::router::plano_orchestrator::OrchestratorService; use brightstaff::state::memory::MemoryConversationalStorage; @@ -194,7 +195,7 @@ async fn main() -> Result<(), Box> { let state_storage = state_storage.clone(); async move { - let path = req.uri().path(); + let path = req.uri().path().to_string(); // Check if path starts with /agents if path.starts_with("/agents") { // Check if it matches one of the agent API paths @@ -217,7 +218,23 @@ async fn main() -> Result<(), Box> { .await; } } - match (req.method(), path) { + if let Some(stripped_path) = path.strip_prefix("/routing") { + let stripped_path = stripped_path.to_string(); + if matches!( 
+ stripped_path.as_str(), + CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH + ) { + return routing_decision( + req, + router_service, + stripped_path, + span_attributes, + ) + .with_context(parent_cx) + .await; + } + } + match (req.method(), path.as_str()) { ( &Method::POST, CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH, @@ -270,7 +287,7 @@ async fn main() -> Result<(), Box> { Ok(response) } _ => { - debug!(method = %req.method(), path = %req.uri().path(), "no route found"); + debug!(method = %req.method(), path = %path, "no route found"); let mut not_found = Response::new(empty()); *not_found.status_mut() = StatusCode::NOT_FOUND; Ok(not_found) diff --git a/demos/llm_routing/model_routing_service/README.md b/demos/llm_routing/model_routing_service/README.md new file mode 100644 index 00000000..85d56abf --- /dev/null +++ b/demos/llm_routing/model_routing_service/README.md @@ -0,0 +1,92 @@ +# Model Routing Service Demo + +This demo shows how to use the `/routing/v1/*` endpoints to get routing decisions without proxying requests to an LLM. The endpoint accepts standard LLM request formats and returns which model Plano's router would select. + +## Setup + +Make sure you have Plano CLI installed (`pip install planoai` or `uv tool install planoai`). 
+ +```bash +export OPENAI_API_KEY= +export ANTHROPIC_API_KEY= +``` + +Start Plano: +```bash +cd demos/llm_routing/model_routing_service +planoai up config.yaml +``` + +## Run the demo + +```bash +./demo.sh +``` + +## Endpoints + +All three LLM API formats are supported: + +| Endpoint | Format | +|---|---| +| `POST /routing/v1/chat/completions` | OpenAI Chat Completions | +| `POST /routing/v1/messages` | Anthropic Messages | +| `POST /routing/v1/responses` | OpenAI Responses API | + +## Example + +```bash +curl http://localhost:12000/routing/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [{"role": "user", "content": "Write a Python function for binary search"}] + }' +``` + +Response: +```json +{ + "model": "anthropic/claude-sonnet-4-20250514", + "route": "code_generation", + "trace_id": "c16d1096c1af4a17abb48fb182918a88" +} +``` + +The response tells you which model would handle this request and which route was matched, without actually making the LLM call. + +## Demo Output + +``` +=== Model Routing Service Demo === + +--- 1. Code generation query (OpenAI format) --- +{ + "model": "anthropic/claude-sonnet-4-20250514", + "route": "code_generation", + "trace_id": "c16d1096c1af4a17abb48fb182918a88" +} + +--- 2. Complex reasoning query (OpenAI format) --- +{ + "model": "openai/gpt-4o", + "route": "complex_reasoning", + "trace_id": "30795e228aff4d7696f082ed01b75ad4" +} + +--- 3. Simple query - no routing match (OpenAI format) --- +{ + "model": "none", + "route": null, + "trace_id": "ae0b6c3b220d499fb5298ac63f4eac0e" +} + +--- 4. 
Code generation query (Anthropic format) --- +{ + "model": "anthropic/claude-sonnet-4-20250514", + "route": "code_generation", + "trace_id": "26be822bbdf14a3ba19fe198e55ea4a9" +} + +=== Demo Complete === +``` diff --git a/demos/llm_routing/model_routing_service/config.yaml b/demos/llm_routing/model_routing_service/config.yaml new file mode 100644 index 00000000..7b98b25b --- /dev/null +++ b/demos/llm_routing/model_routing_service/config.yaml @@ -0,0 +1,27 @@ +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: complex_reasoning + description: complex reasoning tasks, multi-step analysis, or detailed explanations + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + routing_preferences: + - name: code_generation + description: generating new code, writing functions, or creating boilerplate + +tracing: + random_sampling: 100 diff --git a/demos/llm_routing/model_routing_service/demo.sh b/demos/llm_routing/model_routing_service/demo.sh new file mode 100755 index 00000000..3e9b0584 --- /dev/null +++ b/demos/llm_routing/model_routing_service/demo.sh @@ -0,0 +1,65 @@ +#!/bin/bash +set -e + +PLANO_URL="${PLANO_URL:-http://localhost:12000}" + +echo "=== Model Routing Service Demo ===" +echo "" +echo "This demo shows how to use the /routing/v1/* endpoints to get" +echo "routing decisions without actually proxying the request to an LLM." +echo "" + +# --- Example 1: OpenAI Chat Completions format --- +echo "--- 1. 
Code generation query (OpenAI format) ---" +echo "" +curl -s "$PLANO_URL/routing/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [ + {"role": "user", "content": "Write a Python function that implements binary search on a sorted array"} + ] + }' | python3 -m json.tool +echo "" + +# --- Example 2: Complex reasoning query --- +echo "--- 2. Complex reasoning query (OpenAI format) ---" +echo "" +curl -s "$PLANO_URL/routing/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [ + {"role": "user", "content": "Explain the trade-offs between microservices and monolithic architectures, considering scalability, team structure, and operational complexity"} + ] + }' | python3 -m json.tool +echo "" + +# --- Example 3: Simple query (no routing match) --- +echo "--- 3. Simple query - no routing match (OpenAI format) ---" +echo "" +curl -s "$PLANO_URL/routing/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [ + {"role": "user", "content": "What is the capital of France?"} + ] + }' | python3 -m json.tool +echo "" + +# --- Example 4: Anthropic Messages format --- +echo "--- 4. 
Code generation query (Anthropic format) ---" +echo "" +curl -s "$PLANO_URL/routing/v1/messages" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "Create a REST API endpoint in Rust using actix-web that handles user registration"} + ] + }' | python3 -m json.tool +echo "" + +echo "=== Demo Complete ===" From 97b7a390efe323a0e4ef8bb21cdcb4c9a1334f6d Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Tue, 10 Mar 2026 12:23:18 -0700 Subject: [PATCH 2/8] support inline routing_policy in request body (#811) (#815) --- crates/brightstaff/src/handlers/llm.rs | 19 +- .../brightstaff/src/handlers/router_chat.rs | 26 ++- .../src/handlers/routing_service.rs | 202 +++++++++++++++++- .../llm_routing/model_routing_service/demo.sh | 55 +++++ 4 files changed, 286 insertions(+), 16 deletions(-) diff --git a/crates/brightstaff/src/handlers/llm.rs b/crates/brightstaff/src/handlers/llm.rs index ee41dd2d..b03d4d29 100644 --- a/crates/brightstaff/src/handlers/llm.rs +++ b/crates/brightstaff/src/handlers/llm.rs @@ -126,13 +126,27 @@ async fn llm_chat_inner( } }; - let chat_request_bytes = request.collect().await?.to_bytes(); + let raw_bytes = request.collect().await?.to_bytes(); debug!( - body = %String::from_utf8_lossy(&chat_request_bytes), + body = %String::from_utf8_lossy(&raw_bytes), "request body received" ); + // Extract routing_policy from request body if present + let (chat_request_bytes, inline_routing_policy) = + match crate::handlers::routing_service::extract_routing_policy(&raw_bytes, false) { + Ok(result) => result, + Err(err) => { + warn!(error = %err, "failed to parse request JSON"); + return Ok(BrightStaffError::InvalidRequest(format!( + "Failed to parse request: {}", + err + )) + .into_response()); + } + }; + let mut client_request = match ProviderRequestType::try_from(( &chat_request_bytes[..], &SupportedAPIsFromClient::from_endpoint(request_path.as_str()).unwrap(), @@ -335,6 
+349,7 @@ async fn llm_chat_inner( &traceparent, &request_path, &request_id, + inline_routing_policy, ) .await } diff --git a/crates/brightstaff/src/handlers/router_chat.rs b/crates/brightstaff/src/handlers/router_chat.rs index 345632fc..910e5408 100644 --- a/crates/brightstaff/src/handlers/router_chat.rs +++ b/crates/brightstaff/src/handlers/router_chat.rs @@ -38,6 +38,7 @@ pub async fn router_chat_get_upstream_model( traceparent: &str, request_path: &str, request_id: &str, + inline_usage_preferences: Option>, ) -> Result { // Clone metadata for routing before converting (which consumes client_request) let routing_metadata = client_request.metadata().clone(); @@ -76,16 +77,21 @@ pub async fn router_chat_get_upstream_model( "router request" ); - // Extract usage preferences from metadata - let usage_preferences_str: Option = routing_metadata.as_ref().and_then(|metadata| { - metadata - .get("plano_preference_config") - .map(|value| value.to_string()) - }); - - let usage_preferences: Option> = usage_preferences_str - .as_ref() - .and_then(|s| serde_yaml::from_str(s).ok()); + // Use inline preferences if provided, otherwise fall back to metadata extraction + let usage_preferences: Option> = if inline_usage_preferences.is_some() + { + inline_usage_preferences + } else { + let usage_preferences_str: Option = + routing_metadata.as_ref().and_then(|metadata| { + metadata + .get("plano_preference_config") + .map(|value| value.to_string()) + }); + usage_preferences_str + .as_ref() + .and_then(|s| serde_yaml::from_str(s).ok()) + }; // Prepare log message with latest message from chat request let latest_message_for_log = chat_request diff --git a/crates/brightstaff/src/handlers/routing_service.rs b/crates/brightstaff/src/handlers/routing_service.rs index 32f37a08..4eae4685 100644 --- a/crates/brightstaff/src/handlers/routing_service.rs +++ b/crates/brightstaff/src/handlers/routing_service.rs @@ -1,5 +1,5 @@ use bytes::Bytes; -use common::configuration::SpanAttributes; +use 
common::configuration::{ModelUsagePreference, SpanAttributes}; use common::consts::{REQUEST_ID_HEADER, TRACE_PARENT_HEADER}; use common::errors::BrightStaffError; use hermesllm::clients::SupportedAPIsFromClient; @@ -14,6 +14,53 @@ use crate::handlers::router_chat::router_chat_get_upstream_model; use crate::router::llm_router::RouterService; use crate::tracing::{collect_custom_trace_attributes, operation_component, set_service_name}; +const ROUTING_POLICY_SIZE_WARNING_BYTES: usize = 5120; + +/// Extracts `routing_policy` from a JSON body, returning the cleaned body bytes +/// and parsed preferences. The `routing_policy` field is removed from the JSON +/// before re-serializing so downstream parsers don't see the non-standard field. +/// +/// If `warn_on_size` is true, logs a warning when the serialized policy exceeds 5KB. +pub fn extract_routing_policy( + raw_bytes: &[u8], + warn_on_size: bool, +) -> Result<(Bytes, Option>), String> { + let mut json_body: serde_json::Value = serde_json::from_slice(raw_bytes) + .map_err(|err| format!("Failed to parse JSON: {}", err))?; + + let preferences = json_body + .as_object_mut() + .and_then(|obj| obj.remove("routing_policy")) + .and_then(|policy_value| { + if warn_on_size { + let policy_str = serde_json::to_string(&policy_value).unwrap_or_default(); + if policy_str.len() > ROUTING_POLICY_SIZE_WARNING_BYTES { + warn!( + size_bytes = policy_str.len(), + limit_bytes = ROUTING_POLICY_SIZE_WARNING_BYTES, + "routing_policy exceeds recommended size limit" + ); + } + } + match serde_json::from_value::>(policy_value) { + Ok(prefs) => { + info!( + num_models = prefs.len(), + "using inline routing_policy from request body" + ); + Some(prefs) + } + Err(err) => { + warn!(error = %err, "failed to parse routing_policy"); + None + } + } + }); + + let bytes = Bytes::from(serde_json::to_vec(&json_body).unwrap()); + Ok((bytes, preferences)) +} + #[derive(serde::Serialize)] struct RoutingDecisionResponse { model: String, @@ -98,13 +145,26 @@ 
async fn routing_decision_inner( .to_string(); // Parse request body - let chat_request_bytes = request.collect().await?.to_bytes(); + let raw_bytes = request.collect().await?.to_bytes(); debug!( - body = %String::from_utf8_lossy(&chat_request_bytes), + body = %String::from_utf8_lossy(&raw_bytes), "routing decision request body received" ); + // Extract routing_policy from request body before parsing as ProviderRequestType + let (chat_request_bytes, inline_preferences) = match extract_routing_policy(&raw_bytes, true) { + Ok(result) => result, + Err(err) => { + warn!(error = %err, "failed to parse request JSON"); + return Ok(BrightStaffError::InvalidRequest(format!( + "Failed to parse request JSON: {}", + err + )) + .into_response()); + } + }; + let client_request = match ProviderRequestType::try_from(( &chat_request_bytes[..], &SupportedAPIsFromClient::from_endpoint(request_path.as_str()).unwrap(), @@ -120,13 +180,14 @@ async fn routing_decision_inner( } }; - // Call the existing routing logic + // Call the existing routing logic with inline preferences let routing_result = router_chat_get_upstream_model( router_service, client_request, &traceparent, &request_path, &request_id, + inline_preferences, ) .await; @@ -161,3 +222,136 @@ async fn routing_decision_inner( } } } + +#[cfg(test)] +mod tests { + use super::*; + + fn make_chat_body(extra_fields: &str) -> Vec { + let extra = if extra_fields.is_empty() { + String::new() + } else { + format!(", {}", extra_fields) + }; + format!( + r#"{{"model": "gpt-4o-mini", "messages": [{{"role": "user", "content": "hello"}}]{}}}"#, + extra + ) + .into_bytes() + } + + #[test] + fn extract_routing_policy_no_policy() { + let body = make_chat_body(""); + let (cleaned, prefs) = extract_routing_policy(&body, false).unwrap(); + + assert!(prefs.is_none()); + let cleaned_json: serde_json::Value = serde_json::from_slice(&cleaned).unwrap(); + assert_eq!(cleaned_json["model"], "gpt-4o-mini"); + 
assert!(cleaned_json.get("routing_policy").is_none()); + } + + #[test] + fn extract_routing_policy_valid_policy() { + let policy = r#""routing_policy": [ + { + "model": "openai/gpt-4o", + "routing_preferences": [ + {"name": "coding", "description": "code generation tasks"} + ] + }, + { + "model": "openai/gpt-4o-mini", + "routing_preferences": [ + {"name": "general", "description": "general questions"} + ] + } + ]"#; + let body = make_chat_body(policy); + let (cleaned, prefs) = extract_routing_policy(&body, false).unwrap(); + + let prefs = prefs.expect("should have parsed preferences"); + assert_eq!(prefs.len(), 2); + assert_eq!(prefs[0].model, "openai/gpt-4o"); + assert_eq!(prefs[0].routing_preferences[0].name, "coding"); + assert_eq!(prefs[1].model, "openai/gpt-4o-mini"); + assert_eq!(prefs[1].routing_preferences[0].name, "general"); + + // routing_policy should be stripped from cleaned body + let cleaned_json: serde_json::Value = serde_json::from_slice(&cleaned).unwrap(); + assert!(cleaned_json.get("routing_policy").is_none()); + assert_eq!(cleaned_json["model"], "gpt-4o-mini"); + } + + #[test] + fn extract_routing_policy_invalid_policy_returns_none() { + // routing_policy is present but has wrong shape + let policy = r#""routing_policy": "not-an-array""#; + let body = make_chat_body(policy); + let (cleaned, prefs) = extract_routing_policy(&body, false).unwrap(); + + // Invalid policy should be ignored (returns None), not error + assert!(prefs.is_none()); + // routing_policy should still be stripped from cleaned body + let cleaned_json: serde_json::Value = serde_json::from_slice(&cleaned).unwrap(); + assert!(cleaned_json.get("routing_policy").is_none()); + } + + #[test] + fn extract_routing_policy_invalid_json_returns_error() { + let body = b"not valid json"; + let result = extract_routing_policy(body, false); + assert!(result.is_err()); + assert!(result.unwrap_err().contains("Failed to parse JSON")); + } + + #[test] + fn extract_routing_policy_empty_array() { + 
let policy = r#""routing_policy": []"#; + let body = make_chat_body(policy); + let (_, prefs) = extract_routing_policy(&body, false).unwrap(); + + let prefs = prefs.expect("empty array is valid"); + assert_eq!(prefs.len(), 0); + } + + #[test] + fn extract_routing_policy_preserves_other_fields() { + let policy = r#""routing_policy": [{"model": "gpt-4o", "routing_preferences": [{"name": "test", "description": "test"}]}], "temperature": 0.5, "max_tokens": 100"#; + let body = make_chat_body(policy); + let (cleaned, prefs) = extract_routing_policy(&body, false).unwrap(); + + assert!(prefs.is_some()); + let cleaned_json: serde_json::Value = serde_json::from_slice(&cleaned).unwrap(); + assert_eq!(cleaned_json["temperature"], 0.5); + assert_eq!(cleaned_json["max_tokens"], 100); + assert!(cleaned_json.get("routing_policy").is_none()); + } + + #[test] + fn routing_decision_response_serialization() { + let response = RoutingDecisionResponse { + model: "openai/gpt-4o".to_string(), + route: Some("code_generation".to_string()), + trace_id: "abc123".to_string(), + }; + let json = serde_json::to_string(&response).unwrap(); + let parsed: serde_json::Value = serde_json::from_str(&json).unwrap(); + assert_eq!(parsed["model"], "openai/gpt-4o"); + assert_eq!(parsed["route"], "code_generation"); + assert_eq!(parsed["trace_id"], "abc123"); + } + + #[test] + fn routing_decision_response_serialization_no_route() { + let response = RoutingDecisionResponse { + model: "none".to_string(), + route: None, + trace_id: "abc123".to_string(), + }; + let json = serde_json::to_string(&response).unwrap(); + let parsed: serde_json::Value = serde_json::from_str(&json).unwrap(); + assert_eq!(parsed["model"], "none"); + assert!(parsed["route"].is_null()); + } +} diff --git a/demos/llm_routing/model_routing_service/demo.sh b/demos/llm_routing/model_routing_service/demo.sh index 3e9b0584..0c3fdc5d 100755 --- a/demos/llm_routing/model_routing_service/demo.sh +++ 
b/demos/llm_routing/model_routing_service/demo.sh @@ -62,4 +62,59 @@ curl -s "$PLANO_URL/routing/v1/messages" \ }' | python3 -m json.tool echo "" +# --- Example 5: Inline routing policy in request body --- +echo "--- 5. Inline routing_policy (no config needed) ---" +echo "" +curl -s "$PLANO_URL/routing/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [ + {"role": "user", "content": "Write a quicksort implementation in Go"} + ], + "routing_policy": [ + { + "model": "openai/gpt-4o", + "routing_preferences": [ + {"name": "coding", "description": "code generation, writing functions, debugging"} + ] + }, + { + "model": "openai/gpt-4o-mini", + "routing_preferences": [ + {"name": "general", "description": "general questions, simple lookups, casual conversation"} + ] + } + ] + }' | python3 -m json.tool +echo "" + +# --- Example 6: Inline routing policy with Anthropic format --- +echo "--- 6. Inline routing_policy (Anthropic format) ---" +echo "" +curl -s "$PLANO_URL/routing/v1/messages" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "What is the weather like today?"} + ], + "routing_policy": [ + { + "model": "openai/gpt-4o", + "routing_preferences": [ + {"name": "coding", "description": "code generation, writing functions, debugging"} + ] + }, + { + "model": "openai/gpt-4o-mini", + "routing_preferences": [ + {"name": "general", "description": "general questions, simple lookups, casual conversation"} + ] + } + ] + }' | python3 -m json.tool +echo "" + echo "=== Demo Complete ===" From 5189f7907a94fc8fbb43ae0b658116866fc48b5f Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Tue, 10 Mar 2026 12:27:31 -0700 Subject: [PATCH 3/8] add k8s deploy guide (#816) --- docs/source/resources/deployment.rst | 188 +++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) diff --git a/docs/source/resources/deployment.rst 
b/docs/source/resources/deployment.rst index 71452ea3..7b8b0554 100644 --- a/docs/source/resources/deployment.rst +++ b/docs/source/resources/deployment.rst @@ -100,6 +100,194 @@ You can also use the CLI with Docker mode: planoai up plano_config.yaml --docker planoai down --docker +Kubernetes Deployment +--------------------- + +Plano runs as a single container in Kubernetes. The container bundles Envoy, WASM plugins, and brightstaff, managed by supervisord internally. Deploy it as a standard Kubernetes Deployment with your ``plano_config.yaml`` mounted via a ConfigMap and API keys injected via a Secret. + +.. note:: + All environment variables referenced in your ``plano_config.yaml`` (e.g. ``$OPENAI_API_KEY``) must be set in the container environment. Use Kubernetes Secrets for API keys. + +Step 1: Create the Config +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Store your ``plano_config.yaml`` in a ConfigMap: + +.. code-block:: bash + + kubectl create configmap plano-config --from-file=plano_config.yaml=./plano_config.yaml + +Step 2: Create API Key Secrets +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Store your LLM provider API keys in a Secret: + +.. code-block:: bash + + kubectl create secret generic plano-secrets \ + --from-literal=OPENAI_API_KEY=sk-... \ + --from-literal=ANTHROPIC_API_KEY=sk-ant-... + +Step 3: Deploy Plano +~~~~~~~~~~~~~~~~~~~~ + +Create a ``plano-deployment.yaml``: + +.. 
code-block:: yaml + + apiVersion: apps/v1 + kind: Deployment + metadata: + name: plano + labels: + app: plano + spec: + replicas: 1 + selector: + matchLabels: + app: plano + template: + metadata: + labels: + app: plano + spec: + containers: + - name: plano + image: katanemo/plano:0.4.11 + ports: + - containerPort: 12000 # LLM gateway (chat completions, model routing) + name: llm-gateway + envFrom: + - secretRef: + name: plano-secrets + env: + - name: LOG_LEVEL + value: "info" + volumeMounts: + - name: plano-config + mountPath: /app/plano_config.yaml + subPath: plano_config.yaml + readOnly: true + readinessProbe: + httpGet: + path: /healthz + port: 12000 + initialDelaySeconds: 5 + periodSeconds: 10 + livenessProbe: + httpGet: + path: /healthz + port: 12000 + initialDelaySeconds: 10 + periodSeconds: 30 + resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "1000m" + volumes: + - name: plano-config + configMap: + name: plano-config + --- + apiVersion: v1 + kind: Service + metadata: + name: plano + spec: + selector: + app: plano + ports: + - name: llm-gateway + port: 12000 + targetPort: 12000 + +Apply it: + +.. code-block:: bash + + kubectl apply -f plano-deployment.yaml + +Step 4: Verify +~~~~~~~~~~~~~~ + +.. code-block:: bash + + # Check pod status + kubectl get pods -l app=plano + + # Check logs + kubectl logs -l app=plano -f + + # Test routing (port-forward for local testing) + kubectl port-forward svc/plano 12000:12000 + + curl -s -H "Content-Type: application/json" \ + -d '{"messages":[{"role":"user","content":"tell me a joke"}], "model":"none"}' \ + http://localhost:12000/v1/chat/completions | jq .model + +Updating Configuration +~~~~~~~~~~~~~~~~~~~~~~ + +To update ``plano_config.yaml``, replace the ConfigMap and restart the pod: + +.. 
code-block:: bash + + kubectl create configmap plano-config \ + --from-file=plano_config.yaml=./plano_config.yaml \ + --dry-run=client -o yaml | kubectl apply -f - + + kubectl rollout restart deployment/plano + +Enabling OTEL Tracing +~~~~~~~~~~~~~~~~~~~~~ + +Plano emits OpenTelemetry traces for every request — including routing decisions, model selection, and upstream latency. To export traces to an OTEL collector in your cluster, add the ``tracing`` section to your ``plano_config.yaml``: + +.. code-block:: yaml + + tracing: + opentracing_grpc_endpoint: "http://otel-collector.monitoring:4317" + random_sampling: 100 # percentage of requests to trace (1-100) + trace_arch_internal: true # include internal Plano spans + span_attributes: + header_prefixes: # capture request headers as span attributes + - "x-" + static: # add static attributes to all spans + environment: "production" + service: "plano" + +Set the ``OTEL_TRACING_GRPC_ENDPOINT`` environment variable or configure it directly in the config. Plano propagates the ``traceparent`` header end-to-end, so traces correlate across your upstream and downstream services. + +Environment Variables Reference +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following environment variables can be set on the container: + +.. list-table:: + :header-rows: 1 + :widths: 30 50 20 + + * - Variable + - Description + - Default + * - ``LOG_LEVEL`` + - Log verbosity (``debug``, ``info``, ``warn``, ``error``) + - ``info`` + * - ``OPENAI_API_KEY`` + - OpenAI API key (if referenced in config) + - + * - ``ANTHROPIC_API_KEY`` + - Anthropic API key (if referenced in config) + - + * - ``OTEL_TRACING_GRPC_ENDPOINT`` + - OTEL collector endpoint for trace export + - ``http://localhost:4317`` + +Any environment variable referenced in ``plano_config.yaml`` with ``$VAR_NAME`` syntax will be substituted at startup. Use Kubernetes Secrets for sensitive values and ConfigMaps or ``env`` entries for non-sensitive configuration. 
+ Runtime Tests ------------- From 66100976590bff14c1a4b66f6a090c950a0aad3d Mon Sep 17 00:00:00 2001 From: Musa Date: Tue, 10 Mar 2026 20:54:14 -0700 Subject: [PATCH 4/8] Support for Codex via Plano (#808) * Add Codex CLI support; xAI response improvements * Add native Plano running check and update CLI agent error handling * adding PR suggestions for transformations and code quality * message extraction logic in ResponsesAPIRequest * xAI support for Responses API by routing to native endpoint + refactor code --- cli/planoai/core.py | 129 +++- cli/planoai/main.py | 47 +- cli/uv.lock | 2 +- crates/brightstaff/src/handlers/llm.rs | 8 +- crates/brightstaff/src/state/mod.rs | 99 +++ crates/hermesllm/src/apis/openai.rs | 2 +- crates/hermesllm/src/apis/openai_responses.rs | 199 +++++- .../responses_api_streaming_buffer.rs | 66 +- crates/hermesllm/src/clients/endpoints.rs | 19 +- crates/hermesllm/src/providers/id.rs | 21 +- crates/hermesllm/src/providers/request.rs | 76 +++ .../src/transforms/request/from_openai.rs | 645 +++++++++++++++--- .../response_streaming/to_openai_streaming.rs | 17 +- crates/llm_gateway/src/stream_context.rs | 3 +- demos/README.md | 1 + demos/llm_routing/codex_router/README.md | 92 +++ demos/llm_routing/codex_router/config.yaml | 38 ++ .../codex_router/pretty_model_resolution.sh | 33 + 18 files changed, 1297 insertions(+), 200 deletions(-) create mode 100644 demos/llm_routing/codex_router/README.md create mode 100644 demos/llm_routing/codex_router/config.yaml create mode 100644 demos/llm_routing/codex_router/pretty_model_resolution.sh diff --git a/cli/planoai/core.py b/cli/planoai/core.py index e9ddc7bd..174f37c0 100644 --- a/cli/planoai/core.py +++ b/cli/planoai/core.py @@ -10,7 +10,6 @@ from planoai.consts import ( PLANO_DOCKER_IMAGE, PLANO_DOCKER_NAME, ) -import subprocess from planoai.docker_cli import ( docker_container_status, docker_remove_container, @@ -147,26 +146,48 @@ def stop_docker_container(service=PLANO_DOCKER_NAME): 
log.info(f"Failed to shut down services: {str(e)}") -def start_cli_agent(plano_config_file=None, settings_json="{}"): - """Start a CLI client connected to Plano.""" - - with open(plano_config_file, "r") as file: - plano_config = file.read() - plano_config_yaml = yaml.safe_load(plano_config) - - # Get egress listener configuration - egress_config = plano_config_yaml.get("listeners", {}).get("egress_traffic", {}) - host = egress_config.get("host", "127.0.0.1") - port = egress_config.get("port", 12000) - - # Parse additional settings from command line +def _parse_cli_agent_settings(settings_json: str) -> dict: try: - additional_settings = json.loads(settings_json) if settings_json else {} + return json.loads(settings_json) if settings_json else {} except json.JSONDecodeError: log.error("Settings must be valid JSON") sys.exit(1) - # Set up environment variables + +def _resolve_cli_agent_endpoint(plano_config_yaml: dict) -> tuple[str, int]: + listeners = plano_config_yaml.get("listeners") + + if isinstance(listeners, dict): + egress_config = listeners.get("egress_traffic", {}) + host = egress_config.get("host") or egress_config.get("address") or "0.0.0.0" + port = egress_config.get("port", 12000) + return host, port + + if isinstance(listeners, list): + for listener in listeners: + if listener.get("type") in ["model", "model_listener"]: + host = listener.get("host") or listener.get("address") or "0.0.0.0" + port = listener.get("port", 12000) + return host, port + + return "0.0.0.0", 12000 + + +def _apply_non_interactive_env(env: dict, additional_settings: dict) -> None: + if additional_settings.get("NON_INTERACTIVE_MODE", False): + env.update( + { + "CI": "true", + "FORCE_COLOR": "0", + "NODE_NO_READLINE": "1", + "TERM": "dumb", + } + ) + + +def _start_claude_cli_agent( + host: str, port: int, plano_config_yaml: dict, additional_settings: dict +) -> None: env = os.environ.copy() env.update( { @@ -186,7 +207,6 @@ def start_cli_agent(plano_config_file=None, 
settings_json="{}"): "ANTHROPIC_SMALL_FAST_MODEL" ] else: - # Check if arch.claude.code.small.fast alias exists in model_aliases model_aliases = plano_config_yaml.get("model_aliases", {}) if "arch.claude.code.small.fast" in model_aliases: env["ANTHROPIC_SMALL_FAST_MODEL"] = "arch.claude.code.small.fast" @@ -196,23 +216,10 @@ def start_cli_agent(plano_config_file=None, settings_json="{}"): ) log.info("Or provide ANTHROPIC_SMALL_FAST_MODEL in --settings JSON") - # Non-interactive mode configuration from additional_settings only - if additional_settings.get("NON_INTERACTIVE_MODE", False): - env.update( - { - "CI": "true", - "FORCE_COLOR": "0", - "NODE_NO_READLINE": "1", - "TERM": "dumb", - } - ) + _apply_non_interactive_env(env, additional_settings) - # Build claude command arguments claude_args = [] - - # Add settings if provided, excluding those already handled as environment variables if additional_settings: - # Filter out settings that are already processed as environment variables claude_settings = { k: v for k, v in additional_settings.items() @@ -221,10 +228,8 @@ def start_cli_agent(plano_config_file=None, settings_json="{}"): if claude_settings: claude_args.append(f"--settings={json.dumps(claude_settings)}") - # Use claude from PATH claude_path = "claude" log.info(f"Connecting Claude Code Agent to Plano at {host}:{port}") - try: subprocess.run([claude_path] + claude_args, env=env, check=True) except subprocess.CalledProcessError as e: @@ -235,3 +240,61 @@ def start_cli_agent(plano_config_file=None, settings_json="{}"): f"{claude_path} not found. 
Make sure Claude Code is installed: npm install -g @anthropic-ai/claude-code" ) sys.exit(1) + + +def _start_codex_cli_agent(host: str, port: int, additional_settings: dict) -> None: + env = os.environ.copy() + env.update( + { + "OPENAI_API_KEY": "test", # Use test token for plano + "OPENAI_BASE_URL": f"http://{host}:{port}/v1", + "NO_PROXY": host, + "DISABLE_TELEMETRY": "true", + } + ) + _apply_non_interactive_env(env, additional_settings) + + codex_model = additional_settings.get("CODEX_MODEL", "gpt-5.3-codex") + codex_path = "codex" + codex_args = ["--model", codex_model] + + log.info( + f"Connecting Codex CLI Agent to Plano at {host}:{port} (default model: {codex_model})" + ) + try: + subprocess.run([codex_path] + codex_args, env=env, check=True) + except subprocess.CalledProcessError as e: + log.error(f"Error starting codex: {e}") + sys.exit(1) + except FileNotFoundError: + log.error( + f"{codex_path} not found. Make sure Codex CLI is installed: npm install -g @openai/codex" + ) + sys.exit(1) + + +def start_cli_agent( + plano_config_file=None, cli_agent_type="claude", settings_json="{}" +): + """Start a CLI client connected to Plano.""" + + with open(plano_config_file, "r") as file: + plano_config = file.read() + plano_config_yaml = yaml.safe_load(plano_config) + + host, port = _resolve_cli_agent_endpoint(plano_config_yaml) + + additional_settings = _parse_cli_agent_settings(settings_json) + + if cli_agent_type == "claude": + _start_claude_cli_agent(host, port, plano_config_yaml, additional_settings) + return + + if cli_agent_type == "codex": + _start_codex_cli_agent(host, port, additional_settings) + return + + log.error( + f"Unsupported cli agent type '{cli_agent_type}'. 
Supported values: claude, codex" + ) + sys.exit(1) diff --git a/cli/planoai/main.py b/cli/planoai/main.py index a63f294e..e9cdc0a0 100644 --- a/cli/planoai/main.py +++ b/cli/planoai/main.py @@ -1,3 +1,4 @@ +import json import os import multiprocessing import subprocess @@ -31,6 +32,7 @@ from planoai.trace_cmd import trace as trace_cmd, start_trace_listener_backgroun from planoai.consts import ( DEFAULT_OTEL_TRACING_GRPC_ENDPOINT, DEFAULT_NATIVE_OTEL_TRACING_GRPC_ENDPOINT, + NATIVE_PID_FILE, PLANO_DOCKER_IMAGE, PLANO_DOCKER_NAME, ) @@ -40,6 +42,30 @@ from planoai.versioning import check_version_status, get_latest_version, get_ver log = getLogger(__name__) +def _is_native_plano_running() -> bool: + if not os.path.exists(NATIVE_PID_FILE): + return False + try: + with open(NATIVE_PID_FILE, "r") as f: + pids = json.load(f) + except (OSError, json.JSONDecodeError): + return False + + envoy_pid = pids.get("envoy_pid") + brightstaff_pid = pids.get("brightstaff_pid") + if not isinstance(envoy_pid, int) or not isinstance(brightstaff_pid, int): + return False + + for pid in (envoy_pid, brightstaff_pid): + try: + os.kill(pid, 0) + except ProcessLookupError: + return False + except PermissionError: + continue + return True + + def _is_port_in_use(port: int) -> bool: """Check if a TCP port is already bound on localhost.""" import socket @@ -523,7 +549,7 @@ def logs(debug, follow, docker): @click.command() -@click.argument("type", type=click.Choice(["claude"]), required=True) +@click.argument("type", type=click.Choice(["claude", "codex"]), required=True) @click.argument("file", required=False) # Optional file argument @click.option( "--path", default=".", help="Path to the directory containing plano_config.yaml" @@ -536,14 +562,19 @@ def logs(debug, follow, docker): def cli_agent(type, file, path, settings): """Start a CLI agent connected to Plano. 
- CLI_AGENT: The type of CLI agent to start (currently only 'claude' is supported) + CLI_AGENT: The type of CLI agent to start ('claude' or 'codex') """ - # Check if plano docker container is running - plano_status = docker_container_status(PLANO_DOCKER_NAME) - if plano_status != "running": - log.error(f"plano docker container is not running (status: {plano_status})") - log.error("Please start plano using the 'planoai up' command.") + native_running = _is_native_plano_running() + docker_running = False + if not native_running: + docker_running = docker_container_status(PLANO_DOCKER_NAME) == "running" + + if not (native_running or docker_running): + log.error("Plano is not running.") + log.error( + "Start Plano first using 'planoai up ' (native or --docker mode)." + ) sys.exit(1) # Determine plano_config.yaml path @@ -553,7 +584,7 @@ def cli_agent(type, file, path, settings): sys.exit(1) try: - start_cli_agent(plano_config_file, settings) + start_cli_agent(plano_config_file, type, settings) except SystemExit: # Re-raise SystemExit to preserve exit codes raise diff --git a/cli/uv.lock b/cli/uv.lock index 45ccf82e..9d85bf85 100644 --- a/cli/uv.lock +++ b/cli/uv.lock @@ -337,7 +337,7 @@ wheels = [ [[package]] name = "planoai" -version = "0.4.7" +version = "0.4.9" source = { editable = "." 
} dependencies = [ { name = "click" }, diff --git a/crates/brightstaff/src/handlers/llm.rs b/crates/brightstaff/src/handlers/llm.rs index b03d4d29..67afebff 100644 --- a/crates/brightstaff/src/handlers/llm.rs +++ b/crates/brightstaff/src/handlers/llm.rs @@ -198,6 +198,7 @@ async fn llm_chat_inner( let temperature = client_request.get_temperature(); let is_streaming_request = client_request.is_streaming(); let alias_resolved_model = resolve_model_alias(&model_from_request, &model_aliases); + let (provider_id, _) = get_provider_info(&llm_providers, &alias_resolved_model).await; // Validate that the requested model exists in configuration // This matches the validation in llm_gateway routing.rs @@ -249,7 +250,11 @@ async fn llm_chat_inner( if client_request.remove_metadata_key("plano_preference_config") { debug!("removed plano_preference_config from metadata"); } - + if let Some(ref client_api_kind) = client_api { + let upstream_api = + provider_id.compatible_api_for_client(client_api_kind, is_streaming_request); + client_request.normalize_for_upstream(provider_id, &upstream_api); + } // === v1/responses state management: Determine upstream API and combine input if needed === // Do this BEFORE routing since routing consumes the request // Only process state if state_storage is configured @@ -496,7 +501,6 @@ async fn llm_chat_inner( .into_response()), } } - /// Resolves model aliases by looking up the requested model in the model_aliases map. /// Returns the target model if an alias is found, otherwise returns the original model. 
fn resolve_model_alias( diff --git a/crates/brightstaff/src/state/mod.rs b/crates/brightstaff/src/state/mod.rs index ce3ec8ae..3d59f359 100644 --- a/crates/brightstaff/src/state/mod.rs +++ b/crates/brightstaff/src/state/mod.rs @@ -130,6 +130,7 @@ pub fn extract_input_items(input: &InputParam) -> Vec { }]), })] } + InputParam::SingleItem(item) => vec![item.clone()], InputParam::Items(items) => items.clone(), } } @@ -146,3 +147,101 @@ pub async fn retrieve_and_combine_input( let combined_input = storage.merge(&prev_state, current_input); Ok(combined_input) } + +#[cfg(test)] +mod tests { + use super::extract_input_items; + use hermesllm::apis::openai_responses::{ + InputContent, InputItem, InputMessage, InputParam, MessageContent, MessageRole, + }; + + #[test] + fn test_extract_input_items_converts_text_to_user_message_item() { + let extracted = extract_input_items(&InputParam::Text("hello world".to_string())); + assert_eq!(extracted.len(), 1); + + let InputItem::Message(message) = &extracted[0] else { + panic!("expected InputItem::Message"); + }; + assert!(matches!(message.role, MessageRole::User)); + + let MessageContent::Items(items) = &message.content else { + panic!("expected MessageContent::Items"); + }; + assert_eq!(items.len(), 1); + + let InputContent::InputText { text } = &items[0] else { + panic!("expected InputContent::InputText"); + }; + assert_eq!(text, "hello world"); + } + + #[test] + fn test_extract_input_items_preserves_single_item() { + let item = InputItem::Message(InputMessage { + role: MessageRole::Assistant, + content: MessageContent::Items(vec![InputContent::InputText { + text: "assistant note".to_string(), + }]), + }); + + let extracted = extract_input_items(&InputParam::SingleItem(item.clone())); + assert_eq!(extracted.len(), 1); + let InputItem::Message(message) = &extracted[0] else { + panic!("expected InputItem::Message"); + }; + assert!(matches!(message.role, MessageRole::Assistant)); + let MessageContent::Items(items) = &message.content 
else { + panic!("expected MessageContent::Items"); + }; + let InputContent::InputText { text } = &items[0] else { + panic!("expected InputContent::InputText"); + }; + assert_eq!(text, "assistant note"); + } + + #[test] + fn test_extract_input_items_preserves_items_list() { + let items = vec![ + InputItem::Message(InputMessage { + role: MessageRole::User, + content: MessageContent::Items(vec![InputContent::InputText { + text: "first".to_string(), + }]), + }), + InputItem::Message(InputMessage { + role: MessageRole::Assistant, + content: MessageContent::Items(vec![InputContent::InputText { + text: "second".to_string(), + }]), + }), + ]; + + let extracted = extract_input_items(&InputParam::Items(items.clone())); + assert_eq!(extracted.len(), items.len()); + + let InputItem::Message(first) = &extracted[0] else { + panic!("expected first item to be message"); + }; + assert!(matches!(first.role, MessageRole::User)); + let MessageContent::Items(first_items) = &first.content else { + panic!("expected MessageContent::Items"); + }; + let InputContent::InputText { text: first_text } = &first_items[0] else { + panic!("expected InputContent::InputText"); + }; + assert_eq!(first_text, "first"); + + let InputItem::Message(second) = &extracted[1] else { + panic!("expected second item to be message"); + }; + assert!(matches!(second.role, MessageRole::Assistant)); + let MessageContent::Items(second_items) = &second.content else { + panic!("expected MessageContent::Items"); + }; + let InputContent::InputText { text: second_text } = &second_items[0] else { + panic!("expected InputContent::InputText"); + }; + assert_eq!(second_text, "second"); + } +} diff --git a/crates/hermesllm/src/apis/openai.rs b/crates/hermesllm/src/apis/openai.rs index 53eee442..33f55b29 100644 --- a/crates/hermesllm/src/apis/openai.rs +++ b/crates/hermesllm/src/apis/openai.rs @@ -108,7 +108,7 @@ pub struct ChatCompletionsRequest { pub top_p: Option, pub top_logprobs: Option, pub user: Option, - // pub 
web_search: Option, // GOOD FIRST ISSUE: Future support for web search + pub web_search_options: Option, // VLLM-specific parameters (used by Arch-Function) pub top_k: Option, diff --git a/crates/hermesllm/src/apis/openai_responses.rs b/crates/hermesllm/src/apis/openai_responses.rs index 65f4dfa0..eac8a452 100644 --- a/crates/hermesllm/src/apis/openai_responses.rs +++ b/crates/hermesllm/src/apis/openai_responses.rs @@ -116,6 +116,8 @@ pub enum InputParam { Text(String), /// Array of input items (messages, references, outputs, etc.) Items(Vec), + /// Single input item (some clients send object instead of array) + SingleItem(InputItem), } /// Input item - can be a message, item reference, function call output, etc. @@ -130,12 +132,20 @@ pub enum InputItem { item_type: String, id: String, }, + /// Function call emitted by model in prior turn + FunctionCall { + #[serde(rename = "type")] + item_type: String, + name: String, + arguments: String, + call_id: String, + }, /// Function call output FunctionCallOutput { #[serde(rename = "type")] item_type: String, call_id: String, - output: String, + output: serde_json::Value, }, } @@ -166,6 +176,7 @@ pub enum MessageRole { Assistant, System, Developer, + Tool, } /// Input content types @@ -173,6 +184,7 @@ pub enum MessageRole { #[serde(tag = "type", rename_all = "snake_case")] pub enum InputContent { /// Text input + #[serde(rename = "input_text", alias = "text", alias = "output_text")] InputText { text: String }, /// Image input via URL InputImage { @@ -180,6 +192,7 @@ pub enum InputContent { detail: Option, }, /// File input via URL + #[serde(rename = "input_file", alias = "file")] InputFile { file_url: String }, /// Audio input InputAudio { @@ -207,10 +220,11 @@ pub struct AudioConfig { } /// Text configuration +#[skip_serializing_none] #[derive(Debug, Clone, Serialize, Deserialize)] pub struct TextConfig { /// Text format configuration - pub format: TextFormat, + pub format: Option, } /// Text format @@ -285,6 +299,7 @@ 
pub enum Tool { filters: Option, }, /// Web search tool + #[serde(rename = "web_search", alias = "web_search_preview")] WebSearchPreview { domains: Option>, search_context_size: Option, @@ -298,6 +313,12 @@ pub enum Tool { display_height_px: Option, display_number: Option, }, + /// Custom tool (provider/SDK-specific tool contract) + Custom { + name: Option, + description: Option, + format: Option, + }, } /// Ranking options for file search @@ -1015,6 +1036,30 @@ pub struct ListInputItemsResponse { // ProviderRequest Implementation // ============================================================================ +fn append_input_content_text(buffer: &mut String, content: &InputContent) { + match content { + InputContent::InputText { text } => buffer.push_str(text), + InputContent::InputImage { .. } => buffer.push_str("[Image]"), + InputContent::InputFile { .. } => buffer.push_str("[File]"), + InputContent::InputAudio { .. } => buffer.push_str("[Audio]"), + } +} + +fn append_content_items_text(buffer: &mut String, content_items: &[InputContent]) { + for content in content_items { + // Preserve existing behavior: each content item is prefixed with a space. 
+ buffer.push(' '); + append_input_content_text(buffer, content); + } +} + +fn append_message_content_text(buffer: &mut String, content: &MessageContent) { + match content { + MessageContent::Text(text) => buffer.push_str(text), + MessageContent::Items(content_items) => append_content_items_text(buffer, content_items), + } +} + impl ProviderRequest for ResponsesAPIRequest { fn model(&self) -> &str { &self.model @@ -1031,36 +1076,27 @@ impl ProviderRequest for ResponsesAPIRequest { fn extract_messages_text(&self) -> String { match &self.input { InputParam::Text(text) => text.clone(), - InputParam::Items(items) => { - items.iter().fold(String::new(), |acc, item| { - match item { - InputItem::Message(msg) => { - let content_text = match &msg.content { - MessageContent::Text(text) => text.clone(), - MessageContent::Items(content_items) => { - content_items.iter().fold(String::new(), |acc, content| { - acc + " " - + &match content { - InputContent::InputText { text } => text.clone(), - InputContent::InputImage { .. } => { - "[Image]".to_string() - } - InputContent::InputFile { .. } => { - "[File]".to_string() - } - InputContent::InputAudio { .. } => { - "[Audio]".to_string() - } - } - }) - } - }; - acc + " " + &content_text - } - // Skip non-message items (references, outputs, etc.) - _ => acc, + InputParam::SingleItem(item) => { + // Normalize single-item input for extraction behavior parity. + match item { + InputItem::Message(msg) => { + let mut extracted = String::new(); + append_message_content_text(&mut extracted, &msg.content); + extracted } - }) + _ => String::new(), + } + } + InputParam::Items(items) => { + let mut extracted = String::new(); + for item in items { + if let InputItem::Message(msg) = item { + // Preserve existing behavior: each message is prefixed with a space. 
+ extracted.push(' '); + append_message_content_text(&mut extracted, &msg.content); + } + } + extracted } } } @@ -1068,6 +1104,20 @@ impl ProviderRequest for ResponsesAPIRequest { fn get_recent_user_message(&self) -> Option { match &self.input { InputParam::Text(text) => Some(text.clone()), + InputParam::SingleItem(item) => match item { + InputItem::Message(msg) if matches!(msg.role, MessageRole::User) => { + match &msg.content { + MessageContent::Text(text) => Some(text.clone()), + MessageContent::Items(content_items) => { + content_items.iter().find_map(|content| match content { + InputContent::InputText { text } => Some(text.clone()), + _ => None, + }) + } + } + } + _ => None, + }, InputParam::Items(items) => { items.iter().rev().find_map(|item| { match item { @@ -1097,6 +1147,9 @@ impl ProviderRequest for ResponsesAPIRequest { .iter() .filter_map(|tool| match tool { Tool::Function { name, .. } => Some(name.clone()), + Tool::Custom { + name: Some(name), .. + } => Some(name.clone()), // Other tool types don't have user-defined names _ => None, }) @@ -1366,6 +1419,7 @@ impl crate::providers::streaming_response::ProviderStreamResponse for ResponsesA #[cfg(test)] mod tests { use super::*; + use serde_json::json; #[test] fn test_response_output_text_delta_deserialization() { @@ -1506,4 +1560,87 @@ mod tests { _ => panic!("Expected ResponseCompleted event"), } } + + #[test] + fn test_request_deserializes_custom_tool() { + let request = json!({ + "model": "gpt-5.3-codex", + "input": "apply the patch", + "tools": [ + { + "type": "custom", + "name": "run_patch", + "description": "Apply patch text", + "format": { + "kind": "patch", + "version": "v1" + } + } + ] + }); + + let bytes = serde_json::to_vec(&request).unwrap(); + let parsed = ResponsesAPIRequest::try_from(bytes.as_slice()).unwrap(); + let tools = parsed.tools.expect("tools should be present"); + assert_eq!(tools.len(), 1); + + match &tools[0] { + Tool::Custom { + name, + description, + format, + } => { + 
assert_eq!(name.as_deref(), Some("run_patch")); + assert_eq!(description.as_deref(), Some("Apply patch text")); + assert!(format.is_some()); + } + _ => panic!("expected custom tool"), + } + } + + #[test] + fn test_request_deserializes_web_search_tool_alias() { + let request = json!({ + "model": "gpt-5.3-codex", + "input": "find repository info", + "tools": [ + { + "type": "web_search", + "domains": ["github.com"], + "search_context_size": "medium" + } + ] + }); + + let bytes = serde_json::to_vec(&request).unwrap(); + let parsed = ResponsesAPIRequest::try_from(bytes.as_slice()).unwrap(); + let tools = parsed.tools.expect("tools should be present"); + assert_eq!(tools.len(), 1); + + match &tools[0] { + Tool::WebSearchPreview { + domains, + search_context_size, + .. + } => { + assert_eq!(domains.as_ref().map(Vec::len), Some(1)); + assert_eq!(search_context_size.as_deref(), Some("medium")); + } + _ => panic!("expected web search preview tool"), + } + } + + #[test] + fn test_request_deserializes_text_config_without_format() { + let request = json!({ + "model": "gpt-5.3-codex", + "input": "hello", + "text": {} + }); + + let bytes = serde_json::to_vec(&request).unwrap(); + let parsed = ResponsesAPIRequest::try_from(bytes.as_slice()).unwrap(); + assert!(parsed.text.is_some()); + assert!(parsed.text.unwrap().format.is_none()); + } } diff --git a/crates/hermesllm/src/apis/streaming_shapes/responses_api_streaming_buffer.rs b/crates/hermesllm/src/apis/streaming_shapes/responses_api_streaming_buffer.rs index 2aeb34ac..92589ccf 100644 --- a/crates/hermesllm/src/apis/streaming_shapes/responses_api_streaming_buffer.rs +++ b/crates/hermesllm/src/apis/streaming_shapes/responses_api_streaming_buffer.rs @@ -74,6 +74,7 @@ pub struct ResponsesAPIStreamBuffer { /// Lifecycle state flags created_emitted: bool, in_progress_emitted: bool, + finalized: bool, /// Track which output items we've added output_items_added: HashMap, // output_index -> item_id @@ -109,6 +110,7 @@ impl 
ResponsesAPIStreamBuffer { upstream_response_metadata: None, created_emitted: false, in_progress_emitted: false, + finalized: false, output_items_added: HashMap::new(), text_content: HashMap::new(), function_arguments: HashMap::new(), @@ -236,7 +238,7 @@ impl ResponsesAPIStreamBuffer { }), store: Some(true), text: Some(TextConfig { - format: TextFormat::Text, + format: Some(TextFormat::Text), }), audio: None, modalities: None, @@ -255,8 +257,38 @@ impl ResponsesAPIStreamBuffer { /// Finalize the response by emitting all *.done events and response.completed. /// Call this when the stream is complete (after seeing [DONE] or end_of_stream). pub fn finalize(&mut self) { + // Idempotent finalize: avoid duplicate response.completed loops. + if self.finalized { + return; + } + self.finalized = true; + let mut events = Vec::new(); + // Ensure lifecycle prelude is emitted even if finalize is triggered + // by finish_reason before any prior delta was processed. + if !self.created_emitted { + if self.response_id.is_none() { + self.response_id = Some(format!( + "resp_{}", + uuid::Uuid::new_v4().to_string().replace("-", "") + )); + self.created_at = Some( + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs() as i64, + ); + self.model = Some("unknown".to_string()); + } + events.push(self.create_response_created_event()); + self.created_emitted = true; + } + if !self.in_progress_emitted { + events.push(self.create_response_in_progress_event()); + self.in_progress_emitted = true; + } + // Emit done events for all accumulated content // Text content done events @@ -443,6 +475,12 @@ impl SseStreamBufferTrait for ResponsesAPIStreamBuffer { } }; + // Explicit completion marker from transform layer. + if matches!(stream_event.as_ref(), ResponsesAPIStreamEvent::Done { .. 
}) { + self.finalize(); + return; + } + let mut events = Vec::new(); // Capture upstream metadata from ResponseCreated or ResponseInProgress if present @@ -789,4 +827,30 @@ mod tests { println!("✓ NO completion events (partial stream, no [DONE])"); println!("✓ Arguments accumulated: '{{\"location\":\"'\n"); } + + #[test] + fn test_finish_reason_without_done_still_finalizes_once() { + let raw_input = r#"data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1234567890,"model":"gpt-4o","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello"},"finish_reason":null}]} + +data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1234567890,"model":"gpt-4o","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}"#; + + let client_api = SupportedAPIsFromClient::OpenAIResponsesAPI(OpenAIApi::Responses); + let upstream_api = SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions); + + let stream_iter = SseStreamIter::try_from(raw_input.as_bytes()).unwrap(); + let mut buffer = ResponsesAPIStreamBuffer::new(); + + for raw_event in stream_iter { + let transformed_event = + SseEvent::try_from((raw_event, &client_api, &upstream_api)).unwrap(); + buffer.add_transformed_event(transformed_event); + } + + let output = String::from_utf8_lossy(&buffer.to_bytes()).to_string(); + let completed_count = output.matches("event: response.completed").count(); + assert_eq!( + completed_count, 1, + "response.completed should be emitted exactly once" + ); + } } diff --git a/crates/hermesllm/src/clients/endpoints.rs b/crates/hermesllm/src/clients/endpoints.rs index eff96cc5..23e14604 100644 --- a/crates/hermesllm/src/clients/endpoints.rs +++ b/crates/hermesllm/src/clients/endpoints.rs @@ -184,8 +184,8 @@ impl SupportedAPIsFromClient { SupportedAPIsFromClient::OpenAIResponsesAPI(_) => { // For Responses API, check if provider supports it, otherwise translate to chat/completions match provider_id { - // OpenAI and compatible providers that 
support /v1/responses - ProviderId::OpenAI => route_by_provider("/responses"), + // Providers that support /v1/responses natively + ProviderId::OpenAI | ProviderId::XAI => route_by_provider("/responses"), // All other providers: translate to /chat/completions _ => route_by_provider("/chat/completions"), } @@ -654,4 +654,19 @@ mod tests { "/custom/azure/path/gpt-4-deployment/chat/completions?api-version=2025-01-01-preview" ); } + + #[test] + fn test_responses_api_targets_xai_native_responses_endpoint() { + let api = SupportedAPIsFromClient::OpenAIResponsesAPI(OpenAIApi::Responses); + assert_eq!( + api.target_endpoint_for_provider( + &ProviderId::XAI, + "/v1/responses", + "grok-4-1-fast-reasoning", + false, + None + ), + "/v1/responses" + ); + } } diff --git a/crates/hermesllm/src/providers/id.rs b/crates/hermesllm/src/providers/id.rs index fff73f15..11008711 100644 --- a/crates/hermesllm/src/providers/id.rs +++ b/crates/hermesllm/src/providers/id.rs @@ -166,10 +166,11 @@ impl ProviderId { SupportedAPIsFromClient::OpenAIChatCompletions(_), ) => SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions), - // OpenAI Responses API - only OpenAI supports this - (ProviderId::OpenAI, SupportedAPIsFromClient::OpenAIResponsesAPI(_)) => { - SupportedUpstreamAPIs::OpenAIResponsesAPI(OpenAIApi::Responses) - } + // OpenAI Responses API - OpenAI and xAI support this natively + ( + ProviderId::OpenAI | ProviderId::XAI, + SupportedAPIsFromClient::OpenAIResponsesAPI(_), + ) => SupportedUpstreamAPIs::OpenAIResponsesAPI(OpenAIApi::Responses), // Amazon Bedrock natively supports Bedrock APIs (ProviderId::AmazonBedrock, SupportedAPIsFromClient::OpenAIChatCompletions(_)) => { @@ -328,4 +329,16 @@ mod tests { "AmazonBedrock should have models (mapped to amazon)" ); } + + #[test] + fn test_xai_uses_responses_api_for_responses_clients() { + use crate::clients::endpoints::{SupportedAPIsFromClient, SupportedUpstreamAPIs}; + + let client_api = 
SupportedAPIsFromClient::OpenAIResponsesAPI(OpenAIApi::Responses); + let upstream = ProviderId::XAI.compatible_api_for_client(&client_api, false); + assert!(matches!( + upstream, + SupportedUpstreamAPIs::OpenAIResponsesAPI(OpenAIApi::Responses) + )); + } } diff --git a/crates/hermesllm/src/providers/request.rs b/crates/hermesllm/src/providers/request.rs index e97e8a68..92688133 100644 --- a/crates/hermesllm/src/providers/request.rs +++ b/crates/hermesllm/src/providers/request.rs @@ -5,6 +5,7 @@ use crate::apis::amazon_bedrock::{ConverseRequest, ConverseStreamRequest}; use crate::apis::openai_responses::ResponsesAPIRequest; use crate::clients::endpoints::SupportedAPIsFromClient; use crate::clients::endpoints::SupportedUpstreamAPIs; +use crate::ProviderId; use serde_json::Value; use std::collections::HashMap; @@ -70,6 +71,25 @@ impl ProviderRequestType { Self::ResponsesAPIRequest(r) => r.set_messages(messages), } } + + /// Apply provider-specific request normalization before sending upstream. + pub fn normalize_for_upstream( + &mut self, + provider_id: ProviderId, + upstream_api: &SupportedUpstreamAPIs, + ) { + if provider_id == ProviderId::XAI + && matches!( + upstream_api, + SupportedUpstreamAPIs::OpenAIChatCompletions(_) + ) + { + if let Self::ChatCompletionsRequest(req) = self { + // xAI's legacy live-search shape is deprecated on chat/completions. 
+ req.web_search_options = None; + } + } + } } impl ProviderRequest for ProviderRequestType { @@ -787,6 +807,62 @@ mod tests { } } + #[test] + fn test_normalize_for_upstream_xai_clears_chat_web_search_options() { + use crate::apis::openai::{Message, MessageContent, OpenAIApi, Role}; + + let mut request = ProviderRequestType::ChatCompletionsRequest(ChatCompletionsRequest { + model: "grok-4".to_string(), + messages: vec![Message { + role: Role::User, + content: Some(MessageContent::Text("hello".to_string())), + name: None, + tool_calls: None, + tool_call_id: None, + }], + web_search_options: Some(serde_json::json!({"search_context_size":"medium"})), + ..Default::default() + }); + + request.normalize_for_upstream( + ProviderId::XAI, + &SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions), + ); + + let ProviderRequestType::ChatCompletionsRequest(req) = request else { + panic!("expected chat request"); + }; + assert!(req.web_search_options.is_none()); + } + + #[test] + fn test_normalize_for_upstream_non_xai_keeps_chat_web_search_options() { + use crate::apis::openai::{Message, MessageContent, OpenAIApi, Role}; + + let mut request = ProviderRequestType::ChatCompletionsRequest(ChatCompletionsRequest { + model: "gpt-4o".to_string(), + messages: vec![Message { + role: Role::User, + content: Some(MessageContent::Text("hello".to_string())), + name: None, + tool_calls: None, + tool_call_id: None, + }], + web_search_options: Some(serde_json::json!({"search_context_size":"medium"})), + ..Default::default() + }); + + request.normalize_for_upstream( + ProviderId::OpenAI, + &SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions), + ); + + let ProviderRequestType::ChatCompletionsRequest(req) = request else { + panic!("expected chat request"); + }; + assert!(req.web_search_options.is_some()); + } + #[test] fn test_responses_api_to_anthropic_messages_conversion() { use crate::apis::anthropic::AnthropicApi::Messages; diff --git 
a/crates/hermesllm/src/transforms/request/from_openai.rs b/crates/hermesllm/src/transforms/request/from_openai.rs index ddc3b1ca..f2e8ab0d 100644 --- a/crates/hermesllm/src/transforms/request/from_openai.rs +++ b/crates/hermesllm/src/transforms/request/from_openai.rs @@ -10,7 +10,8 @@ use crate::apis::anthropic::{ ToolResultContent, }; use crate::apis::openai::{ - ChatCompletionsRequest, Message, MessageContent, Role, Tool, ToolChoice, ToolChoiceType, + ChatCompletionsRequest, FunctionCall as OpenAIFunctionCall, Message, MessageContent, Role, + Tool, ToolCall as OpenAIToolCall, ToolChoice, ToolChoiceType, }; use crate::apis::openai_responses::{ @@ -65,6 +66,14 @@ impl TryFrom for Vec { Ok(messages) } + InputParam::SingleItem(item) => { + // Some clients send a single object instead of an array. + let nested = ResponsesInputConverter { + input: InputParam::Items(vec![item]), + instructions: converter.instructions, + }; + Vec::::try_from(nested) + } InputParam::Items(items) => { // Convert input items to messages let mut converted_messages = Vec::new(); @@ -82,82 +91,145 @@ impl TryFrom for Vec { // Convert each input item for item in items { - if let InputItem::Message(input_msg) = item { - let role = match input_msg.role { - MessageRole::User => Role::User, - MessageRole::Assistant => Role::Assistant, - MessageRole::System => Role::System, - MessageRole::Developer => Role::System, // Map developer to system - }; + match item { + InputItem::Message(input_msg) => { + let role = match input_msg.role { + MessageRole::User => Role::User, + MessageRole::Assistant => Role::Assistant, + MessageRole::System => Role::System, + MessageRole::Developer => Role::System, // Map developer to system + MessageRole::Tool => Role::Tool, + }; - // Convert content based on MessageContent type - let content = match &input_msg.content { - crate::apis::openai_responses::MessageContent::Text(text) => { - // Simple text content - MessageContent::Text(text.clone()) - } - 
crate::apis::openai_responses::MessageContent::Items(content_items) => { - // Check if it's a single text item (can use simple text format) - if content_items.len() == 1 { - if let InputContent::InputText { text } = &content_items[0] { - MessageContent::Text(text.clone()) + // Convert content based on MessageContent type + let content = match &input_msg.content { + crate::apis::openai_responses::MessageContent::Text(text) => { + // Simple text content + MessageContent::Text(text.clone()) + } + crate::apis::openai_responses::MessageContent::Items( + content_items, + ) => { + // Check if it's a single text item (can use simple text format) + if content_items.len() == 1 { + if let InputContent::InputText { text } = &content_items[0] + { + MessageContent::Text(text.clone()) + } else { + // Single non-text item - use parts format + MessageContent::Parts( + content_items + .iter() + .filter_map(|c| match c { + InputContent::InputText { text } => { + Some(crate::apis::openai::ContentPart::Text { + text: text.clone(), + }) + } + InputContent::InputImage { image_url, .. } => { + Some(crate::apis::openai::ContentPart::ImageUrl { + image_url: crate::apis::openai::ImageUrl { + url: image_url.clone(), + detail: None, + }, + }) + } + InputContent::InputFile { .. } => None, // Skip files for now + InputContent::InputAudio { .. } => None, // Skip audio for now + }) + .collect(), + ) + } } else { - // Single non-text item - use parts format + // Multiple content items - convert to parts MessageContent::Parts( - content_items.iter() + content_items + .iter() .filter_map(|c| match c { InputContent::InputText { text } => { - Some(crate::apis::openai::ContentPart::Text { text: text.clone() }) + Some(crate::apis::openai::ContentPart::Text { + text: text.clone(), + }) } InputContent::InputImage { image_url, .. } => { Some(crate::apis::openai::ContentPart::ImageUrl { image_url: crate::apis::openai::ImageUrl { url: image_url.clone(), detail: None, - } + }, }) } InputContent::InputFile { .. 
} => None, // Skip files for now InputContent::InputAudio { .. } => None, // Skip audio for now }) - .collect() + .collect(), ) } - } else { - // Multiple content items - convert to parts - MessageContent::Parts( - content_items - .iter() - .filter_map(|c| match c { - InputContent::InputText { text } => { - Some(crate::apis::openai::ContentPart::Text { - text: text.clone(), - }) - } - InputContent::InputImage { image_url, .. } => Some( - crate::apis::openai::ContentPart::ImageUrl { - image_url: crate::apis::openai::ImageUrl { - url: image_url.clone(), - detail: None, - }, - }, - ), - InputContent::InputFile { .. } => None, // Skip files for now - InputContent::InputAudio { .. } => None, // Skip audio for now - }) - .collect(), - ) + } + }; + + converted_messages.push(Message { + role, + content: Some(content), + name: None, + tool_call_id: None, + tool_calls: None, + }); + } + InputItem::FunctionCallOutput { + item_type: _, + call_id, + output, + } => { + // Preserve tool result so upstream models do not re-issue the same tool call. + let output_text = match output { + serde_json::Value::String(s) => s.clone(), + other => serde_json::to_string(&other).unwrap_or_default(), + }; + converted_messages.push(Message { + role: Role::Tool, + content: Some(MessageContent::Text(output_text)), + name: None, + tool_call_id: Some(call_id), + tool_calls: None, + }); + } + InputItem::FunctionCall { + item_type: _, + name, + arguments, + call_id, + } => { + let tool_call = OpenAIToolCall { + id: call_id, + call_type: "function".to_string(), + function: OpenAIFunctionCall { name, arguments }, + }; + + // Prefer attaching tool_calls to the preceding assistant message when present. 
+ if let Some(last) = converted_messages.last_mut() { + if matches!(last.role, Role::Assistant) { + if let Some(existing) = &mut last.tool_calls { + existing.push(tool_call); + } else { + last.tool_calls = Some(vec![tool_call]); + } + continue; } } - }; - converted_messages.push(Message { - role, - content: Some(content), - name: None, - tool_call_id: None, - tool_calls: None, - }); + converted_messages.push(Message { + role: Role::Assistant, + content: None, + name: None, + tool_call_id: None, + tool_calls: Some(vec![tool_call]), + }); + } + InputItem::ItemReference { .. } => { + // Item references/unknown entries are metadata-like and can be skipped + // for chat-completions conversion. + } } } @@ -397,6 +469,170 @@ impl TryFrom for ChatCompletionsRequest { type Error = TransformError; fn try_from(req: ResponsesAPIRequest) -> Result { + fn normalize_function_parameters( + parameters: Option, + fallback_extra: Option, + ) -> serde_json::Value { + // ChatCompletions function tools require JSON Schema with top-level type=object. + let mut base = serde_json::json!({ + "type": "object", + "properties": {}, + }); + + if let Some(serde_json::Value::Object(mut obj)) = parameters { + // Enforce a valid object schema shape regardless of upstream tool format. 
+ obj.insert( + "type".to_string(), + serde_json::Value::String("object".to_string()), + ); + if !obj.contains_key("properties") { + obj.insert( + "properties".to_string(), + serde_json::Value::Object(serde_json::Map::new()), + ); + } + base = serde_json::Value::Object(obj); + } + + if let Some(extra) = fallback_extra { + if let serde_json::Value::Object(ref mut map) = base { + map.insert("x-custom-format".to_string(), extra); + } + } + + base + } + + let mut converted_chat_tools: Vec = Vec::new(); + let mut web_search_options: Option = None; + + if let Some(tools) = req.tools.clone() { + for (idx, tool) in tools.into_iter().enumerate() { + match tool { + ResponsesTool::Function { + name, + description, + parameters, + strict, + } => converted_chat_tools.push(Tool { + tool_type: "function".to_string(), + function: crate::apis::openai::Function { + name, + description, + parameters: normalize_function_parameters(parameters, None), + strict, + }, + }), + ResponsesTool::WebSearchPreview { + search_context_size, + user_location, + .. 
+ } => { + if web_search_options.is_none() { + let user_location_value = user_location.map(|loc| { + let mut approx = serde_json::Map::new(); + if let Some(city) = loc.city { + approx.insert( + "city".to_string(), + serde_json::Value::String(city), + ); + } + if let Some(country) = loc.country { + approx.insert( + "country".to_string(), + serde_json::Value::String(country), + ); + } + if let Some(region) = loc.region { + approx.insert( + "region".to_string(), + serde_json::Value::String(region), + ); + } + if let Some(timezone) = loc.timezone { + approx.insert( + "timezone".to_string(), + serde_json::Value::String(timezone), + ); + } + + serde_json::json!({ + "type": loc.location_type, + "approximate": serde_json::Value::Object(approx), + }) + }); + + let mut web_search = serde_json::Map::new(); + if let Some(size) = search_context_size { + web_search.insert( + "search_context_size".to_string(), + serde_json::Value::String(size), + ); + } + if let Some(location) = user_location_value { + web_search.insert("user_location".to_string(), location); + } + web_search_options = Some(serde_json::Value::Object(web_search)); + } + } + ResponsesTool::Custom { + name, + description, + format, + } => { + // Custom tools do not have a strict ChatCompletions equivalent for all + // providers. Map them to a permissive function tool for compatibility. + let tool_name = name.unwrap_or_else(|| format!("custom_tool_{}", idx + 1)); + let parameters = normalize_function_parameters( + Some(serde_json::json!({ + "type": "object", + "properties": { + "input": { "type": "string" } + }, + "required": ["input"], + "additionalProperties": true, + })), + format, + ); + + converted_chat_tools.push(Tool { + tool_type: "function".to_string(), + function: crate::apis::openai::Function { + name: tool_name, + description, + parameters, + strict: Some(false), + }, + }); + } + ResponsesTool::FileSearch { .. 
} => { + return Err(TransformError::UnsupportedConversion( + "FileSearch tool is not supported in ChatCompletions API. Only function/custom/web search tools are supported in this conversion." + .to_string(), + )); + } + ResponsesTool::CodeInterpreter => { + return Err(TransformError::UnsupportedConversion( + "CodeInterpreter tool is not supported in ChatCompletions API conversion." + .to_string(), + )); + } + ResponsesTool::Computer { .. } => { + return Err(TransformError::UnsupportedConversion( + "Computer tool is not supported in ChatCompletions API conversion." + .to_string(), + )); + } + } + } + } + + let tools = if converted_chat_tools.is_empty() { + None + } else { + Some(converted_chat_tools) + }; + // Convert input to messages using the shared converter let converter = ResponsesInputConverter { input: req.input, @@ -418,57 +654,24 @@ impl TryFrom for ChatCompletionsRequest { service_tier: req.service_tier, top_logprobs: req.top_logprobs.map(|t| t as u32), modalities: req.modalities.map(|mods| { - mods.into_iter().map(|m| { - match m { + mods.into_iter() + .map(|m| match m { Modality::Text => "text".to_string(), Modality::Audio => "audio".to_string(), - } - }).collect() + }) + .collect() }), - stream_options: req.stream_options.map(|opts| { - crate::apis::openai::StreamOptions { + stream_options: req + .stream_options + .map(|opts| crate::apis::openai::StreamOptions { include_usage: opts.include_usage, - } + }), + reasoning_effort: req.reasoning_effort.map(|effort| match effort { + ReasoningEffort::Low => "low".to_string(), + ReasoningEffort::Medium => "medium".to_string(), + ReasoningEffort::High => "high".to_string(), }), - reasoning_effort: req.reasoning_effort.map(|effort| { - match effort { - ReasoningEffort::Low => "low".to_string(), - ReasoningEffort::Medium => "medium".to_string(), - ReasoningEffort::High => "high".to_string(), - } - }), - tools: req.tools.map(|tools| { - tools.into_iter().map(|tool| { - - // Only convert Function tools - other types 
are not supported in ChatCompletions - match tool { - ResponsesTool::Function { name, description, parameters, strict } => Ok(Tool { - tool_type: "function".to_string(), - function: crate::apis::openai::Function { - name, - description, - parameters: parameters.unwrap_or_else(|| serde_json::json!({ - "type": "object", - "properties": {} - })), - strict, - } - }), - ResponsesTool::FileSearch { .. } => Err(TransformError::UnsupportedConversion( - "FileSearch tool is not supported in ChatCompletions API. Only function tools are supported.".to_string() - )), - ResponsesTool::WebSearchPreview { .. } => Err(TransformError::UnsupportedConversion( - "WebSearchPreview tool is not supported in ChatCompletions API. Only function tools are supported.".to_string() - )), - ResponsesTool::CodeInterpreter => Err(TransformError::UnsupportedConversion( - "CodeInterpreter tool is not supported in ChatCompletions API. Only function tools are supported.".to_string() - )), - ResponsesTool::Computer { .. } => Err(TransformError::UnsupportedConversion( - "Computer tool is not supported in ChatCompletions API. Only function tools are supported.".to_string() - )), - } - }).collect::, _>>() - }).transpose()?, + tools, tool_choice: req.tool_choice.map(|choice| { match choice { ResponsesToolChoice::String(s) => { @@ -481,11 +684,14 @@ impl TryFrom for ChatCompletionsRequest { } ResponsesToolChoice::Named { function, .. 
} => ToolChoice::Function { choice_type: "function".to_string(), - function: crate::apis::openai::FunctionChoice { name: function.name } - } + function: crate::apis::openai::FunctionChoice { + name: function.name, + }, + }, } }), parallel_tool_calls: req.parallel_tool_calls, + web_search_options, ..Default::default() }) } @@ -1027,4 +1233,235 @@ mod tests { panic!("Expected text content block"); } } + + #[test] + fn test_responses_custom_tool_maps_to_function_tool_for_chat_completions() { + use crate::apis::openai_responses::{ + InputParam, ResponsesAPIRequest, Tool as ResponsesTool, + }; + + let req = ResponsesAPIRequest { + model: "gpt-5.3-codex".to_string(), + input: InputParam::Text("use custom tool".to_string()), + tools: Some(vec![ResponsesTool::Custom { + name: Some("run_patch".to_string()), + description: Some("Apply structured patch".to_string()), + format: Some(serde_json::json!({ + "kind": "patch", + "version": "v1" + })), + }]), + include: None, + parallel_tool_calls: None, + store: None, + instructions: None, + stream: None, + stream_options: None, + conversation: None, + tool_choice: None, + max_output_tokens: None, + temperature: None, + top_p: None, + metadata: None, + previous_response_id: None, + modalities: None, + audio: None, + text: None, + reasoning_effort: None, + truncation: None, + user: None, + max_tool_calls: None, + service_tier: None, + background: None, + top_logprobs: None, + }; + + let converted = ChatCompletionsRequest::try_from(req).expect("conversion should succeed"); + let tools = converted.tools.expect("tools should be present"); + assert_eq!(tools.len(), 1); + assert_eq!(tools[0].tool_type, "function"); + assert_eq!(tools[0].function.name, "run_patch"); + assert_eq!( + tools[0].function.description.as_deref(), + Some("Apply structured patch") + ); + } + + #[test] + fn test_responses_web_search_maps_to_chat_web_search_options() { + use crate::apis::openai_responses::{ + InputParam, ResponsesAPIRequest, Tool as ResponsesTool, 
UserLocation, + }; + + let req = ResponsesAPIRequest { + model: "gpt-5.3-codex".to_string(), + input: InputParam::Text("find project docs".to_string()), + tools: Some(vec![ResponsesTool::WebSearchPreview { + domains: Some(vec!["docs.planoai.dev".to_string()]), + search_context_size: Some("medium".to_string()), + user_location: Some(UserLocation { + location_type: "approximate".to_string(), + city: Some("San Francisco".to_string()), + country: Some("US".to_string()), + region: Some("CA".to_string()), + timezone: Some("America/Los_Angeles".to_string()), + }), + }]), + include: None, + parallel_tool_calls: None, + store: None, + instructions: None, + stream: None, + stream_options: None, + conversation: None, + tool_choice: None, + max_output_tokens: None, + temperature: None, + top_p: None, + metadata: None, + previous_response_id: None, + modalities: None, + audio: None, + text: None, + reasoning_effort: None, + truncation: None, + user: None, + max_tool_calls: None, + service_tier: None, + background: None, + top_logprobs: None, + }; + + let converted = ChatCompletionsRequest::try_from(req).expect("conversion should succeed"); + assert!(converted.web_search_options.is_some()); + } + + #[test] + fn test_responses_function_call_output_maps_to_tool_message() { + use crate::apis::openai_responses::{ + InputItem, InputParam, ResponsesAPIRequest, Tool as ResponsesTool, + }; + + let req = ResponsesAPIRequest { + model: "gpt-5.3-codex".to_string(), + input: InputParam::Items(vec![InputItem::FunctionCallOutput { + item_type: "function_call_output".to_string(), + call_id: "call_123".to_string(), + output: serde_json::json!({"status":"ok","stdout":"hello"}), + }]), + tools: Some(vec![ResponsesTool::Function { + name: "exec_command".to_string(), + description: Some("Execute a shell command".to_string()), + parameters: Some(serde_json::json!({ + "type": "object", + "properties": { + "cmd": { "type": "string" } + }, + "required": ["cmd"] + })), + strict: Some(false), + }]), + 
include: None, + parallel_tool_calls: None, + store: None, + instructions: None, + stream: None, + stream_options: None, + conversation: None, + tool_choice: None, + max_output_tokens: None, + temperature: None, + top_p: None, + metadata: None, + previous_response_id: None, + modalities: None, + audio: None, + text: None, + reasoning_effort: None, + truncation: None, + user: None, + max_tool_calls: None, + service_tier: None, + background: None, + top_logprobs: None, + }; + + let converted = ChatCompletionsRequest::try_from(req).expect("conversion should succeed"); + assert_eq!(converted.messages.len(), 1); + assert!(matches!(converted.messages[0].role, Role::Tool)); + assert_eq!( + converted.messages[0].tool_call_id.as_deref(), + Some("call_123") + ); + } + + #[test] + fn test_responses_function_call_and_output_preserve_call_id_link() { + use crate::apis::openai_responses::{ + InputItem, InputMessage, MessageContent as ResponsesMessageContent, MessageRole, + ResponsesAPIRequest, + }; + + let req = ResponsesAPIRequest { + model: "gpt-5.3-codex".to_string(), + input: InputParam::Items(vec![ + InputItem::Message(InputMessage { + role: MessageRole::Assistant, + content: ResponsesMessageContent::Items(vec![]), + }), + InputItem::FunctionCall { + item_type: "function_call".to_string(), + name: "exec_command".to_string(), + arguments: "{\"cmd\":\"pwd\"}".to_string(), + call_id: "toolu_abc123".to_string(), + }, + InputItem::FunctionCallOutput { + item_type: "function_call_output".to_string(), + call_id: "toolu_abc123".to_string(), + output: serde_json::Value::String("ok".to_string()), + }, + ]), + tools: None, + include: None, + parallel_tool_calls: None, + store: None, + instructions: None, + stream: None, + stream_options: None, + conversation: None, + tool_choice: None, + max_output_tokens: None, + temperature: None, + top_p: None, + metadata: None, + previous_response_id: None, + modalities: None, + audio: None, + text: None, + reasoning_effort: None, + truncation: 
None, + user: None, + max_tool_calls: None, + service_tier: None, + background: None, + top_logprobs: None, + }; + + let converted = ChatCompletionsRequest::try_from(req).expect("conversion should succeed"); + assert_eq!(converted.messages.len(), 2); + + assert!(matches!(converted.messages[0].role, Role::Assistant)); + let tool_calls = converted.messages[0] + .tool_calls + .as_ref() + .expect("assistant tool_calls should be present"); + assert_eq!(tool_calls.len(), 1); + assert_eq!(tool_calls[0].id, "toolu_abc123"); + + assert!(matches!(converted.messages[1].role, Role::Tool)); + assert_eq!( + converted.messages[1].tool_call_id.as_deref(), + Some("toolu_abc123") + ); + } } diff --git a/crates/hermesllm/src/transforms/response_streaming/to_openai_streaming.rs b/crates/hermesllm/src/transforms/response_streaming/to_openai_streaming.rs index 328317bc..4aa719af 100644 --- a/crates/hermesllm/src/transforms/response_streaming/to_openai_streaming.rs +++ b/crates/hermesllm/src/transforms/response_streaming/to_openai_streaming.rs @@ -512,19 +512,12 @@ impl TryFrom for ResponsesAPIStreamEvent { } } - // Handle finish_reason - this is a completion signal - // Return an empty delta that the buffer can use to detect completion + // Handle finish_reason - this is a completion signal. + // Emit an explicit Done marker so the buffering layer can finalize + // even if an upstream [DONE] marker is missing/delayed. 
if choice.finish_reason.is_some() { - // Return a minimal text delta to signal completion - // The buffer will handle the finish_reason and generate response.completed - return Ok(ResponsesAPIStreamEvent::ResponseOutputTextDelta { - item_id: "".to_string(), // Buffer will fill this - output_index: choice.index as i32, - content_index: 0, - delta: "".to_string(), // Empty delta signals completion - logprobs: vec![], - obfuscation: None, - sequence_number: 0, // Buffer will fill this + return Ok(ResponsesAPIStreamEvent::Done { + sequence_number: 0, // Buffer will assign final sequence }); } diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index 547ba166..7a353bcb 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -1046,7 +1046,8 @@ impl HttpContext for StreamContext { ); match ProviderRequestType::try_from((deserialized_client_request, upstream)) { - Ok(request) => { + Ok(mut request) => { + request.normalize_for_upstream(self.get_provider_id(), upstream); debug!( "request_id={}: upstream request payload: {}", self.request_identifier(), diff --git a/demos/README.md b/demos/README.md index a2613454..6e467a33 100644 --- a/demos/README.md +++ b/demos/README.md @@ -16,6 +16,7 @@ This directory contains demos showcasing Plano's capabilities as an AI-native pr | [Preference-Based Routing](llm_routing/preference_based_routing/) | Routes prompts to LLMs based on user-defined preferences and task type (e.g. code generation vs. 
understanding) | | [Model Alias Routing](llm_routing/model_alias_routing/) | Maps semantic aliases (`arch.summarize.v1`) to provider-specific models for centralized governance | | [Claude Code Router](llm_routing/claude_code_router/) | Extends Claude Code with multi-provider access and preference-aligned routing for coding tasks | +| [Codex Router](llm_routing/codex_router/) | Extends Codex CLI with multi-provider access and preference-aligned routing for coding tasks | ## Agent Orchestration diff --git a/demos/llm_routing/codex_router/README.md b/demos/llm_routing/codex_router/README.md new file mode 100644 index 00000000..d3662581 --- /dev/null +++ b/demos/llm_routing/codex_router/README.md @@ -0,0 +1,92 @@ +# Codex Router - Multi-Model Access with Intelligent Routing + +Plano extends Codex CLI to access multiple LLM providers through a single interface. This gives you: + +1. **Access to Models**: Connect to OpenAI, Anthropic, xAI, Gemini, and local models via Ollama +2. **Intelligent Routing via Preferences for Coding Tasks**: Configure which models handle specific development tasks: + - Code generation and implementation + - Code understanding and analysis + - Debugging and optimization + - Architecture and system design + +Uses a [1.5B preference-aligned router LLM](https://arxiv.org/abs/2506.16655) to automatically select the best model based on your request type. 
+
+## Benefits
+
+- **Single Interface**: Access multiple LLM providers through the same Codex CLI
+- **Task-Aware Routing**: Requests are analyzed and routed to models based on task type (code generation vs code understanding)
+- **Provider Flexibility**: Add or remove providers without changing your workflow
+- **Routing Transparency**: See which model handles each request and why
+
+## Quick Start
+
+### Prerequisites
+
+```bash
+# Install Codex CLI
+npm install -g @openai/codex
+
+# Install Plano CLI
+pip install planoai
+```
+
+### Step 1: Open the Demo
+
+```bash
+git clone https://github.com/katanemo/arch.git
+cd arch/demos/llm_routing/codex_router
+```
+
+### Step 2: Set API Keys
+
+```bash
+export OPENAI_API_KEY="your-openai-key-here"
+export ANTHROPIC_API_KEY="your-anthropic-key-here"
+export GROK_API_KEY="your-xai-key-here"
+export GEMINI_API_KEY="your-gemini-key-here"
+```
+
+### Step 3: Start Plano
+
+```bash
+planoai up
+# or: uvx planoai up
+```
+
+### Step 4: Launch Codex Through Plano
+
+```bash
+planoai cli-agent codex
+# or: uvx planoai cli-agent codex
+```
+
+By default, `planoai cli-agent codex` starts Codex with `gpt-5.3-codex`. With this demo config:
+
+- `project understanding` prompts are routed to `grok-4-1-fast-non-reasoning`
+- `code generation` prompts are routed to `gpt-5.3-codex`
+
+## Monitor Routing Decisions
+
+In a second terminal:
+
+```bash
+sh pretty_model_resolution.sh
+```
+
+This shows each request model and the final model selected by Plano's router.
+
+## Configuration Highlights
+
+`config.yaml` demonstrates:
+
+- OpenAI default model for Codex sessions (`gpt-5.3-codex`)
+- Routing preference for project understanding (`grok-4-1-fast-non-reasoning`)
+- Additional providers (Anthropic, xAI, Ollama local) to show cross-provider routing support
+
+## Optional Overrides
+
+Set a different Codex session model:
+
+```bash
+planoai cli-agent codex --settings='{"CODEX_MODEL":"gpt-5-2025-08-07"}'
+```
diff --git a/demos/llm_routing/codex_router/config.yaml b/demos/llm_routing/codex_router/config.yaml
new file mode 100644
index 00000000..7cafe641
--- /dev/null
+++ b/demos/llm_routing/codex_router/config.yaml
@@ -0,0 +1,38 @@
+version: v0.3.0
+
+listeners:
+  - type: model
+    name: model_listener
+    port: 12000
+
+model_providers:
+  # OpenAI models used by Codex defaults and preference routing
+  - model: openai/gpt-5.3-codex
+    default: true
+    access_key: $OPENAI_API_KEY
+    routing_preferences:
+      - name: code generation
+        description: generating new code snippets, functions, or boilerplate based on user prompts or requirements
+
+  - model: xai/grok-4-1-fast-non-reasoning
+    access_key: $GROK_API_KEY
+    routing_preferences:
+      - name: project understanding
+        description: understand repository structure, codebase, and code files, readmes, and other documentation
+
+  # Additional providers (optional): Codex can route to any configured model
+  # - model: anthropic/claude-sonnet-4-5
+  #   access_key: $ANTHROPIC_API_KEY
+
+  # - model: xai/grok-4-1-fast-non-reasoning
+  #   access_key: $GROK_API_KEY
+
+  - model: ollama/llama3.1
+    base_url: http://localhost:11434
+
+model_aliases:
+  arch.codex.default:
+    target: gpt-5.3-codex
+
+tracing:
+  random_sampling: 100
diff --git a/demos/llm_routing/codex_router/pretty_model_resolution.sh b/demos/llm_routing/codex_router/pretty_model_resolution.sh
new file mode 100644
index 00000000..b6187e65
--- /dev/null
+++ b/demos/llm_routing/codex_router/pretty_model_resolution.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash +# Pretty-print Plano MODEL_RESOLUTION lines from docker logs +# - hides Arch-Router +# - prints timestamp +# - colors MODEL_RESOLUTION red +# - colors req_model cyan +# - colors resolved_model magenta +# - removes provider and streaming + +docker logs -f plano 2>&1 \ +| awk ' +/MODEL_RESOLUTION:/ && $0 !~ /Arch-Router/ { + # extract timestamp between first [ and ] + ts="" + if (match($0, /\[[0-9-]+ [0-9:.]+\]/)) { + ts=substr($0, RSTART+1, RLENGTH-2) + } + + # split out after MODEL_RESOLUTION: + n = split($0, parts, /MODEL_RESOLUTION: */) + line = parts[2] + + # remove provider and streaming fields + sub(/ *provider='\''[^'\'']+'\''/, "", line) + sub(/ *streaming=(true|false)/, "", line) + + # highlight fields + gsub(/req_model='\''[^'\'']+'\''/, "\033[36m&\033[0m", line) + gsub(/resolved_model='\''[^'\'']+'\''/, "\033[35m&\033[0m", line) + + # print timestamp + MODEL_RESOLUTION + printf "\033[90m[%s]\033[0m \033[31mMODEL_RESOLUTION\033[0m: %s\n", ts, line +}' From b4313d93a480f2470a5630451305697999fed8b1 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Wed, 11 Mar 2026 12:49:36 -0700 Subject: [PATCH 5/8] Run demos without Docker (#809) --- demos/advanced/currency_exchange/run_demo.sh | 23 ++--- .../multi_turn_rag/docker-compose.yaml | 11 --- demos/advanced/multi_turn_rag/pyproject.toml | 12 +++ demos/advanced/multi_turn_rag/run_demo.sh | 29 ++++--- demos/advanced/multi_turn_rag/start_agents.sh | 24 ++++++ demos/advanced/stock_quote/run_demo.sh | 23 ++--- .../multi_agent_crewai_langchain/README.md | 27 ++++-- .../multi_agent_crewai_langchain/config.yaml | 4 +- .../docker-compose.yaml | 25 +----- .../multi_agent_crewai_langchain/run_demo.sh | 33 ++++---- .../start_agents.sh | 30 +++++++ .../travel_agents/README.md | 29 +++++-- .../travel_agents/docker-compose.yaml | 32 +------ .../travel_agents/run_demo.sh | 28 +++++-- .../travel_agents/start_agents.sh | 30 +++++++ demos/filter_chains/http_filter/README.md | 26 +++--- 
demos/filter_chains/http_filter/config.yaml | 8 +- .../http_filter/docker-compose.yaml | 14 +--- demos/filter_chains/http_filter/run_demo.sh | 28 +++++-- .../filter_chains/http_filter/start_agents.sh | 84 +++++-------------- demos/filter_chains/mcp_filter/README.md | 20 +++-- .../mcp_filter/docker-compose.yaml | 14 +--- demos/filter_chains/mcp_filter/run_demo.sh | 28 +++++-- demos/getting_started/llm_gateway/README.md | 11 ++- demos/getting_started/llm_gateway/run_demo.sh | 23 ++--- .../weather_forecast/README.md | 19 ++++- .../weather_forecast/docker-compose.yaml | 10 --- .../weather_forecast/run_demo.sh | 29 ++++--- .../weather_forecast/start_agents.sh | 24 ++++++ demos/integrations/ollama/run_demo.sh | 52 ++++++++---- .../spotify_bearer_auth/run_demo.sh | 23 ++--- .../preference_based_routing/README.md | 18 ++-- .../preference_based_routing/run_demo.sh | 22 ++--- tests/e2e/run_e2e_tests.sh | 11 ++- tests/e2e/run_prompt_gateway_tests.sh | 11 ++- 35 files changed, 488 insertions(+), 347 deletions(-) create mode 100644 demos/advanced/multi_turn_rag/pyproject.toml create mode 100755 demos/advanced/multi_turn_rag/start_agents.sh create mode 100755 demos/agent_orchestration/multi_agent_crewai_langchain/start_agents.sh create mode 100755 demos/agent_orchestration/travel_agents/start_agents.sh mode change 100644 => 100755 demos/filter_chains/http_filter/start_agents.sh create mode 100755 demos/getting_started/weather_forecast/start_agents.sh diff --git a/demos/advanced/currency_exchange/run_demo.sh b/demos/advanced/currency_exchange/run_demo.sh index 6623dee5..e430a1cd 100644 --- a/demos/advanced/currency_exchange/run_demo.sh +++ b/demos/advanced/currency_exchange/run_demo.sh @@ -18,22 +18,24 @@ start_demo() { echo ".env file created with OPENAI_API_KEY." 
fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (AnythingLLM, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM, Jaeger)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - - # Step 4: Start developer services - echo "Starting Network Agent using Docker Compose..." - docker compose up -d # Run in detached mode } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping Network Agent using Docker Compose..." - docker compose down + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true - # Step 2: Stop Plano + # Stop Plano echo "Stopping Plano..." planoai down } @@ -42,6 +44,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - # Default action is to bring the demo up - start_demo + start_demo "$1" fi diff --git a/demos/advanced/multi_turn_rag/docker-compose.yaml b/demos/advanced/multi_turn_rag/docker-compose.yaml index 1c3ed73c..f36987e4 100644 --- a/demos/advanced/multi_turn_rag/docker-compose.yaml +++ b/demos/advanced/multi_turn_rag/docker-compose.yaml @@ -1,15 +1,4 @@ services: - rag_energy_source_agent: - build: - context: . 
- dockerfile: Dockerfile - ports: - - "18083:80" - healthcheck: - test: ["CMD", "curl" ,"http://localhost:80/healthz"] - interval: 5s - retries: 20 - anythingllm: image: mintplexlabs/anythingllm restart: always diff --git a/demos/advanced/multi_turn_rag/pyproject.toml b/demos/advanced/multi_turn_rag/pyproject.toml new file mode 100644 index 00000000..05824bd6 --- /dev/null +++ b/demos/advanced/multi_turn_rag/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "multi-turn-rag" +version = "0.1.0" +requires-python = ">=3.12" +dependencies = [ + "fastapi", + "uvicorn", + "pydantic>=2.8", + "httpx>=0.27", + "openai>=1.51", + "python-dotenv>=1.0", +] diff --git a/demos/advanced/multi_turn_rag/run_demo.sh b/demos/advanced/multi_turn_rag/run_demo.sh index f9434aa2..5bec6368 100644 --- a/demos/advanced/multi_turn_rag/run_demo.sh +++ b/demos/advanced/multi_turn_rag/run_demo.sh @@ -18,22 +18,32 @@ start_demo() { echo ".env file created with OPENAI_API_KEY." fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (AnythingLLM) + # UI services must start before Plano to avoid OTEL port conflicts + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - # Step 4: Start Network Agent - echo "Starting HR Agent using Docker Compose..." - docker compose up -d # Run in detached mode + # Step 5: Start agents natively + echo "Starting agents..." + bash start_agents.sh & } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping HR Agent using Docker Compose..." - docker compose down -v + # Stop agents + echo "Stopping agents..." + pkill -f start_agents.sh 2>/dev/null || true - # Step 2: Stop Plano + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true + + # Stop Plano echo "Stopping Plano..." 
planoai down } @@ -42,6 +52,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - # Default action is to bring the demo up - start_demo + start_demo "$1" fi diff --git a/demos/advanced/multi_turn_rag/start_agents.sh b/demos/advanced/multi_turn_rag/start_agents.sh new file mode 100755 index 00000000..00b7f1b1 --- /dev/null +++ b/demos/advanced/multi_turn_rag/start_agents.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -e + +PIDS=() + +log() { echo "$(date '+%F %T') - $*"; } + +cleanup() { + log "Stopping agents..." + for PID in "${PIDS[@]}"; do + kill $PID 2>/dev/null && log "Stopped process $PID" + done + exit 0 +} + +trap cleanup EXIT INT TERM + +log "Starting rag_energy_source_agent on port 18083..." +uv run uvicorn main:app --host 0.0.0.0 --port 18083 & +PIDS+=($!) + +for PID in "${PIDS[@]}"; do + wait "$PID" +done diff --git a/demos/advanced/stock_quote/run_demo.sh b/demos/advanced/stock_quote/run_demo.sh index 6623dee5..e430a1cd 100644 --- a/demos/advanced/stock_quote/run_demo.sh +++ b/demos/advanced/stock_quote/run_demo.sh @@ -18,22 +18,24 @@ start_demo() { echo ".env file created with OPENAI_API_KEY." fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (AnythingLLM, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM, Jaeger)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - - # Step 4: Start developer services - echo "Starting Network Agent using Docker Compose..." - docker compose up -d # Run in detached mode } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping Network Agent using Docker Compose..." - docker compose down + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true - # Step 2: Stop Plano + # Stop Plano echo "Stopping Plano..." 
planoai down } @@ -42,6 +44,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - # Default action is to bring the demo up - start_demo + start_demo "$1" fi diff --git a/demos/agent_orchestration/multi_agent_crewai_langchain/README.md b/demos/agent_orchestration/multi_agent_crewai_langchain/README.md index e2fe23fb..97d71e7f 100644 --- a/demos/agent_orchestration/multi_agent_crewai_langchain/README.md +++ b/demos/agent_orchestration/multi_agent_crewai_langchain/README.md @@ -41,21 +41,36 @@ cd demos/agent_orchestration/multi_agent_crewai_langchain ./run_demo.sh ``` -This starts Plano natively and brings up via Docker Compose: +This starts Plano natively and runs agents as local processes: - **CrewAI Flight Agent** (port 10520) - flight search - **LangChain Weather Agent** (port 10510) - weather forecasts -- **AnythingLLM** (port 3001) - chat interface -- **Jaeger** (port 16686) - distributed tracing Plano runs natively on the host (ports 12000, 8001). +To also start AnythingLLM (chat UI), Jaeger (tracing), and other optional services: + +```bash +./run_demo.sh --with-ui +``` + +This additionally starts: +- **AnythingLLM** (port 3001) - chat interface +- **Jaeger** (port 16686) - distributed tracing + ### Try It Out -1. **Open the Chat Interface** +1. **Using curl** + ```bash + curl -X POST http://localhost:8001/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "gpt-4o", "messages": [{"role": "user", "content": "What is the weather in San Francisco?"}]}' + ``` + +2. **Using AnythingLLM (requires `--with-ui`)** - Navigate to [http://localhost:3001](http://localhost:3001) - Create an account (stored locally) -2. **Ask Multi-Agent Questions** +3. **Ask Multi-Agent Questions** ``` "What's the weather in San Francisco and can you find flights from Seattle to San Francisco?" ``` @@ -65,7 +80,7 @@ Plano runs natively on the host (ports 12000, 8001). - Routes the flight part to the CrewAI agent - Combines responses seamlessly -3. 
**View Distributed Traces** +4. **View Distributed Traces (requires `--with-ui`)** - Open [http://localhost:16686](http://localhost:16686) (Jaeger UI) - See how requests flow through both agents diff --git a/demos/agent_orchestration/multi_agent_crewai_langchain/config.yaml b/demos/agent_orchestration/multi_agent_crewai_langchain/config.yaml index b3a204f3..ef522337 100644 --- a/demos/agent_orchestration/multi_agent_crewai_langchain/config.yaml +++ b/demos/agent_orchestration/multi_agent_crewai_langchain/config.yaml @@ -2,9 +2,9 @@ version: v0.3.0 agents: - id: weather_agent - url: http://langchain-weather-agent:10510 + url: http://localhost:10510 - id: flight_agent - url: http://crewai-flight-agent:10520 + url: http://localhost:10520 model_providers: - model: openai/gpt-4o diff --git a/demos/agent_orchestration/multi_agent_crewai_langchain/docker-compose.yaml b/demos/agent_orchestration/multi_agent_crewai_langchain/docker-compose.yaml index 2d9c180b..74954562 100644 --- a/demos/agent_orchestration/multi_agent_crewai_langchain/docker-compose.yaml +++ b/demos/agent_orchestration/multi_agent_crewai_langchain/docker-compose.yaml @@ -1,27 +1,5 @@ services: - crewai-flight-agent: - build: - dockerfile: Dockerfile - restart: always - ports: - - "10520:10520" - environment: - - LLM_GATEWAY_ENDPOINT=http://host.docker.internal:12000/v1 - - AEROAPI_KEY=${AEROAPI_KEY:?AEROAPI_KEY environment variable is required but not set} - - PYTHONUNBUFFERED=1 - command: ["python", "-u", "crewai/flight_agent.py"] - - langchain-weather-agent: - build: - dockerfile: Dockerfile - restart: always - ports: - - "10510:10510" - environment: - - LLM_GATEWAY_ENDPOINT=http://host.docker.internal:12000/v1 - command: ["python", "-u", "langchain/weather_agent.py"] - anythingllm: image: mintplexlabs/anythingllm restart: always @@ -36,6 +14,8 @@ services: - GENERIC_OPEN_AI_MODEL_PREF=gpt-4o-mini - GENERIC_OPEN_AI_MODEL_TOKEN_LIMIT=128000 - GENERIC_OPEN_AI_API_KEY=sk-placeholder + extra_hosts: + - 
"host.docker.internal:host-gateway" jaeger: build: @@ -44,3 +24,4 @@ services: ports: - "16686:16686" # Jaeger UI - "4317:4317" # OTLP gRPC receiver + - "4318:4318" # OTLP HTTP receiver diff --git a/demos/agent_orchestration/multi_agent_crewai_langchain/run_demo.sh b/demos/agent_orchestration/multi_agent_crewai_langchain/run_demo.sh index b7dc0fad..35bbbbdd 100755 --- a/demos/agent_orchestration/multi_agent_crewai_langchain/run_demo.sh +++ b/demos/agent_orchestration/multi_agent_crewai_langchain/run_demo.sh @@ -12,33 +12,38 @@ start_demo() { echo "Error: OPENAI_API_KEY environment variable is not set for the demo." exit 1 fi - if [ -z "$AEROAPI_KEY" ]; then - echo "Error: AEROAPI_KEY environment variable is not set for the demo." - exit 1 - fi echo "Creating .env file..." echo "OPENAI_API_KEY=$OPENAI_API_KEY" > .env - echo "AEROAPI_KEY=$AEROAPI_KEY" >> .env echo ".env file created with API keys." fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (AnythingLLM, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM, Jaeger)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - # Step 4: Start agents and services - echo "Starting agents using Docker Compose..." - docker compose up -d + # Step 5: Start agents natively + echo "Starting agents..." + bash start_agents.sh & } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping Docker Compose services..." - docker compose down + # Stop agents + echo "Stopping agents..." + pkill -f start_agents.sh 2>/dev/null || true - # Step 2: Stop Plano + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true + + # Stop Plano echo "Stopping Plano..." 
planoai down } @@ -47,5 +52,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - start_demo + start_demo "$1" fi diff --git a/demos/agent_orchestration/multi_agent_crewai_langchain/start_agents.sh b/demos/agent_orchestration/multi_agent_crewai_langchain/start_agents.sh new file mode 100755 index 00000000..78d2fecb --- /dev/null +++ b/demos/agent_orchestration/multi_agent_crewai_langchain/start_agents.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +PIDS=() + +log() { echo "$(date '+%F %T') - $*"; } + +cleanup() { + log "Stopping agents..." + for PID in "${PIDS[@]}"; do + kill $PID 2>/dev/null && log "Stopped process $PID" + done + exit 0 +} + +trap cleanup EXIT INT TERM + +export LLM_GATEWAY_ENDPOINT=http://localhost:12000/v1 + +log "Starting langchain weather_agent on port 10510..." +uv run python langchain/weather_agent.py & +PIDS+=($!) + +log "Starting crewai flight_agent on port 10520..." +uv run python crewai/flight_agent.py & +PIDS+=($!) + +for PID in "${PIDS[@]}"; do + wait "$PID" +done diff --git a/demos/agent_orchestration/travel_agents/README.md b/demos/agent_orchestration/travel_agents/README.md index d6468612..7886539d 100644 --- a/demos/agent_orchestration/travel_agents/README.md +++ b/demos/agent_orchestration/travel_agents/README.md @@ -23,9 +23,10 @@ All agents use Plano's agent orchestration LLM to intelligently route user reque ## Prerequisites - [Plano CLI](https://docs.planoai.dev/get_started/quickstart.html#prerequisites) installed (`pip install planoai`) -- Docker and Docker Compose (for agent services) +- [uv](https://docs.astral.sh/uv/) installed (for running agents natively) - [OpenAI API key](https://platform.openai.com/api-keys) - [FlightAware AeroAPI key](https://www.flightaware.com/aeroapi/portal) +- Docker and Docker Compose (optional, only needed for `--with-ui`) > **Note:** You'll need to obtain a FlightAware AeroAPI key for live flight data. 
Visit [https://www.flightaware.com/aeroapi/portal](https://www.flightaware.com/aeroapi/portal) to get your API key. @@ -46,16 +47,34 @@ export OPENAI_API_KEY="your OpenAI api key" ./run_demo.sh ``` -This starts Plano natively and brings up via Docker Compose: +This starts Plano natively and runs agents as local processes: - Weather Agent on port 10510 - Flight Agent on port 10520 -- Open WebUI on port 8080 Plano runs natively on the host (port 8001). +To also start Open WebUI, Jaeger tracing, and other optional services, pass `--with-ui`: + +```bash +./run_demo.sh --with-ui +``` + +This additionally starts: +- Open WebUI on port 8080 +- Jaeger tracing UI on port 16686 + ### 4. Test the System -Use Open WebUI at http://localhost:8080 +**Option A: Using curl** +```bash +curl -X POST http://localhost:8001/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "gpt-5.2", "messages": [{"role": "user", "content": "What is the weather in Istanbul?"}]}' +``` + +**Option B: Using Open WebUI (requires `--with-ui`)** + +Navigate to http://localhost:8080 > **Note:** The Open WebUI may take a few minutes to start up and be fully ready. Please wait for the container to finish initializing before accessing the interface. Once ready, make sure to select the **gpt-5.2** model from the model dropdown menu in the UI. @@ -102,7 +121,7 @@ Each agent: 3. Generates response using GPT-5.2 4. Streams response back to user -Both agents run as Docker containers and communicate with Plano running natively on the host. +Both agents run as native local processes and communicate with Plano running natively on the host. 
## Observability diff --git a/demos/agent_orchestration/travel_agents/docker-compose.yaml b/demos/agent_orchestration/travel_agents/docker-compose.yaml index f0fb78e5..66edbdc3 100644 --- a/demos/agent_orchestration/travel_agents/docker-compose.yaml +++ b/demos/agent_orchestration/travel_agents/docker-compose.yaml @@ -1,32 +1,5 @@ services: - weather-agent: - build: - context: . - dockerfile: Dockerfile - container_name: weather-agent - restart: always - ports: - - "10510:10510" - environment: - - LLM_GATEWAY_ENDPOINT=http://host.docker.internal:12000/v1 - command: ["uv", "run", "python", "src/travel_agents/weather_agent.py"] - extra_hosts: - - "host.docker.internal:host-gateway" - flight-agent: - build: - context: . - dockerfile: Dockerfile - container_name: flight-agent - restart: always - ports: - - "10520:10520" - environment: - - LLM_GATEWAY_ENDPOINT=http://host.docker.internal:12000/v1 - - AEROAPI_KEY=${AEROAPI_KEY:? AEROAPI_KEY environment variable is required but not set} - command: ["uv", "run", "python", "src/travel_agents/flight_agent.py"] - extra_hosts: - - "host.docker.internal:host-gateway" open-web-ui: image: dyrnq/open-webui:main restart: always @@ -40,9 +13,8 @@ services: - ENABLE_TITLE_GENERATION=false - ENABLE_TAGS_GENERATION=false - ENABLE_AUTOCOMPLETE_GENERATION=false - depends_on: - - weather-agent - - flight-agent + extra_hosts: + - "host.docker.internal:host-gateway" jaeger: build: context: ../../shared/jaeger diff --git a/demos/agent_orchestration/travel_agents/run_demo.sh b/demos/agent_orchestration/travel_agents/run_demo.sh index b7dc0fad..643a0aa2 100755 --- a/demos/agent_orchestration/travel_agents/run_demo.sh +++ b/demos/agent_orchestration/travel_agents/run_demo.sh @@ -23,22 +23,32 @@ start_demo() { echo ".env file created with API keys." 
fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (Open WebUI, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (Open WebUI, Jaeger)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - # Step 4: Start agents and services - echo "Starting agents using Docker Compose..." - docker compose up -d + # Step 5: Start agents natively + echo "Starting agents..." + bash start_agents.sh & } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping Docker Compose services..." - docker compose down + # Stop agents + echo "Stopping agents..." + pkill -f start_agents.sh 2>/dev/null || true - # Step 2: Stop Plano + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true + + # Stop Plano echo "Stopping Plano..." planoai down } @@ -47,5 +57,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - start_demo + start_demo "$1" fi diff --git a/demos/agent_orchestration/travel_agents/start_agents.sh b/demos/agent_orchestration/travel_agents/start_agents.sh new file mode 100755 index 00000000..4f2e32a7 --- /dev/null +++ b/demos/agent_orchestration/travel_agents/start_agents.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +PIDS=() + +log() { echo "$(date '+%F %T') - $*"; } + +cleanup() { + log "Stopping agents..." + for PID in "${PIDS[@]}"; do + kill $PID 2>/dev/null && log "Stopped process $PID" + done + exit 0 +} + +trap cleanup EXIT INT TERM + +export LLM_GATEWAY_ENDPOINT=http://localhost:12000/v1 + +log "Starting weather_agent on port 10510..." +uv run python src/travel_agents/weather_agent.py & +PIDS+=($!) + +log "Starting flight_agent on port 10520..." +uv run python src/travel_agents/flight_agent.py & +PIDS+=($!) 
+ +for PID in "${PIDS[@]}"; do + wait "$PID" +done diff --git a/demos/filter_chains/http_filter/README.md b/demos/filter_chains/http_filter/README.md index 5e675113..86748217 100644 --- a/demos/filter_chains/http_filter/README.md +++ b/demos/filter_chains/http_filter/README.md @@ -41,23 +41,27 @@ export OPENAI_API_KEY="your-key" ./run_demo.sh ``` -This starts Plano natively and brings up via Docker Compose: -- Input Guards MCP server on port 10500 -- Query Rewriter MCP server on port 10501 -- Context Builder MCP server on port 10502 +This starts Plano natively and runs agents as local processes: +- Input Guards HTTP server on port 10500 +- Query Rewriter HTTP server on port 10501 +- Context Builder HTTP server on port 10502 - RAG Agent REST server on port 10505 -- Jaeger UI for viewing traces at http://localhost:16686 -- AnythingLLM at http://localhost:3001 for interactive queries Plano runs natively on the host (port 8001 and 12000). +To also start AnythingLLM (chat UI) and Jaeger (tracing): + +```bash +./run_demo.sh --with-ui +``` + +This additionally starts: +- Jaeger UI for viewing traces at http://localhost:16686 +- AnythingLLM at http://localhost:3001 for interactive queries + ### 2. Test the system -**Option A: Using AnythingLLM (recommended)** - -Navigate to http://localhost:3001 and send queries through the chat interface. 
- -**Option B: Using curl** +**Option A: Using curl (recommended)** ```bash curl -X POST http://localhost:8001/v1/chat/completions \ -H "Content-Type: application/json" \ diff --git a/demos/filter_chains/http_filter/config.yaml b/demos/filter_chains/http_filter/config.yaml index 117931e2..014a141a 100644 --- a/demos/filter_chains/http_filter/config.yaml +++ b/demos/filter_chains/http_filter/config.yaml @@ -2,23 +2,23 @@ version: v0.3.0 agents: - id: rag_agent - url: http://rag-agents:10505 + url: http://localhost:10505 filters: - id: input_guards - url: http://rag-agents:10500 + url: http://localhost:10500 type: http # type: mcp (default) # transport: streamable-http (default) # tool: input_guards (default - same as filter id) - id: query_rewriter - url: http://rag-agents:10501 + url: http://localhost:10501 type: http # type: mcp (default) # transport: streamable-http (default) # tool: query_rewriter (default - same as filter id) - id: context_builder - url: http://rag-agents:10502 + url: http://localhost:10502 type: http model_providers: diff --git a/demos/filter_chains/http_filter/docker-compose.yaml b/demos/filter_chains/http_filter/docker-compose.yaml index 64962bce..0361926c 100644 --- a/demos/filter_chains/http_filter/docker-compose.yaml +++ b/demos/filter_chains/http_filter/docker-compose.yaml @@ -1,16 +1,4 @@ services: - rag-agents: - build: - context: . 
- dockerfile: Dockerfile - ports: - - "10500:10500" - - "10501:10501" - - "10502:10502" - - "10505:10505" - environment: - - LLM_GATEWAY_ENDPOINT=${LLM_GATEWAY_ENDPOINT:-http://host.docker.internal:12000/v1} - - OPENAI_API_KEY=${OPENAI_API_KEY:?OPENAI_API_KEY environment variable is required but not set} jaeger: build: context: ../../shared/jaeger @@ -32,3 +20,5 @@ services: - GENERIC_OPEN_AI_MODEL_PREF=gpt-4o-mini - GENERIC_OPEN_AI_MODEL_TOKEN_LIMIT=128000 - GENERIC_OPEN_AI_API_KEY=sk-placeholder + extra_hosts: + - "host.docker.internal:host-gateway" diff --git a/demos/filter_chains/http_filter/run_demo.sh b/demos/filter_chains/http_filter/run_demo.sh index bed84f16..f203f5b1 100755 --- a/demos/filter_chains/http_filter/run_demo.sh +++ b/demos/filter_chains/http_filter/run_demo.sh @@ -18,22 +18,32 @@ start_demo() { echo ".env file created with OPENAI_API_KEY." fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (AnythingLLM, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM, Jaeger)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - # Step 4: Start services - echo "Starting services using Docker Compose..." - docker compose up -d + # Step 5: Start agents natively + echo "Starting agents..." + bash start_agents.sh & } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping Docker Compose services..." - docker compose down + # Stop agents + echo "Stopping agents..." + pkill -f start_agents.sh 2>/dev/null || true - # Step 2: Stop Plano + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true + + # Stop Plano echo "Stopping Plano..." 
planoai down } @@ -42,5 +52,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - start_demo + start_demo "$1" fi diff --git a/demos/filter_chains/http_filter/start_agents.sh b/demos/filter_chains/http_filter/start_agents.sh old mode 100644 new mode 100755 index 06cabeec..8dfdc0f4 --- a/demos/filter_chains/http_filter/start_agents.sh +++ b/demos/filter_chains/http_filter/start_agents.sh @@ -1,78 +1,38 @@ -# #!/bin/bash -# set -e - -# WAIT_FOR_PIDS=() - -# log() { -# timestamp=$(python3 -c 'from datetime import datetime; print(datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:23])') -# message="$*" -# echo "$timestamp - $message" -# } - -# cleanup() { -# log "Caught signal, terminating all user processes ..." -# for PID in "${WAIT_FOR_PIDS[@]}"; do -# if kill $PID 2> /dev/null; then -# log "killed process: $PID" -# fi -# done -# exit 1 -# } - -# trap cleanup EXIT - -# log "Starting input_guards agent on port 10500/mcp..." -# uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10500 --agent input_guards & -# WAIT_FOR_PIDS+=($!) - -# log "Starting query_rewriter agent on port 10501/mcp..." -# uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10501 --agent query_rewriter & -# WAIT_FOR_PIDS+=($!) - -# log "Starting context_builder agent on port 10502/mcp..." -# uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10502 --agent context_builder & -# WAIT_FOR_PIDS+=($!) - -# # log "Starting response_generator agent on port 10400..." -# # uv run python -m rag_agent --host 0.0.0.0 --port 10400 --agent response_generator & -# # WAIT_FOR_PIDS+=($!) - -# log "Starting response_generator agent on port 10505..." -# uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10505 --agent response_generator & -# WAIT_FOR_PIDS+=($!) 
- -# for PID in "${WAIT_FOR_PIDS[@]}"; do -# wait "$PID" -# done - - - - #!/bin/bash set -e -export PYTHONPATH=/app/src - -pids=() +PIDS=() log() { echo "$(date '+%F %T') - $*"; } -log "Starting input_guards HTTP server on :10500" +cleanup() { + log "Stopping agents..." + for PID in "${PIDS[@]}"; do + kill $PID 2>/dev/null && log "Stopped process $PID" + done + exit 0 +} + +trap cleanup EXIT INT TERM + +export PYTHONPATH=./src + +log "Starting input_guards HTTP server on port 10500..." uv run uvicorn rag_agent.input_guards:app --host 0.0.0.0 --port 10500 & -pids+=($!) +PIDS+=($!) -log "Starting query_rewriter HTTP server on :10501" +log "Starting query_rewriter HTTP server on port 10501..." uv run uvicorn rag_agent.query_rewriter:app --host 0.0.0.0 --port 10501 & -pids+=($!) +PIDS+=($!) -log "Starting context_builder HTTP server on :10502" +log "Starting context_builder HTTP server on port 10502..." uv run uvicorn rag_agent.context_builder:app --host 0.0.0.0 --port 10502 & -pids+=($!) +PIDS+=($!) -log "Starting response_generator (OpenAI-compatible) on :10505" +log "Starting response_generator (OpenAI-compatible) on port 10505..." uv run uvicorn rag_agent.rag_agent:app --host 0.0.0.0 --port 10505 & -pids+=($!) +PIDS+=($!) 
-for PID in "${pids[@]}"; do +for PID in "${PIDS[@]}"; do wait "$PID" done diff --git a/demos/filter_chains/mcp_filter/README.md b/demos/filter_chains/mcp_filter/README.md index 5e675113..798015e2 100644 --- a/demos/filter_chains/mcp_filter/README.md +++ b/demos/filter_chains/mcp_filter/README.md @@ -41,23 +41,27 @@ export OPENAI_API_KEY="your-key" ./run_demo.sh ``` -This starts Plano natively and brings up via Docker Compose: +This starts Plano natively and runs agents as local processes: - Input Guards MCP server on port 10500 - Query Rewriter MCP server on port 10501 - Context Builder MCP server on port 10502 - RAG Agent REST server on port 10505 -- Jaeger UI for viewing traces at http://localhost:16686 -- AnythingLLM at http://localhost:3001 for interactive queries Plano runs natively on the host (port 8001 and 12000). +To also start AnythingLLM (chat UI) and Jaeger (tracing): + +```bash +./run_demo.sh --with-ui +``` + +This additionally starts: +- Jaeger UI for viewing traces at http://localhost:16686 +- AnythingLLM at http://localhost:3001 for interactive queries + ### 2. Test the system -**Option A: Using AnythingLLM (recommended)** - -Navigate to http://localhost:3001 and send queries through the chat interface. - -**Option B: Using curl** +**Option A: Using curl (recommended)** ```bash curl -X POST http://localhost:8001/v1/chat/completions \ -H "Content-Type: application/json" \ diff --git a/demos/filter_chains/mcp_filter/docker-compose.yaml b/demos/filter_chains/mcp_filter/docker-compose.yaml index 64962bce..0361926c 100644 --- a/demos/filter_chains/mcp_filter/docker-compose.yaml +++ b/demos/filter_chains/mcp_filter/docker-compose.yaml @@ -1,16 +1,4 @@ services: - rag-agents: - build: - context: . 
- dockerfile: Dockerfile - ports: - - "10500:10500" - - "10501:10501" - - "10502:10502" - - "10505:10505" - environment: - - LLM_GATEWAY_ENDPOINT=${LLM_GATEWAY_ENDPOINT:-http://host.docker.internal:12000/v1} - - OPENAI_API_KEY=${OPENAI_API_KEY:?OPENAI_API_KEY environment variable is required but not set} jaeger: build: context: ../../shared/jaeger @@ -32,3 +20,5 @@ services: - GENERIC_OPEN_AI_MODEL_PREF=gpt-4o-mini - GENERIC_OPEN_AI_MODEL_TOKEN_LIMIT=128000 - GENERIC_OPEN_AI_API_KEY=sk-placeholder + extra_hosts: + - "host.docker.internal:host-gateway" diff --git a/demos/filter_chains/mcp_filter/run_demo.sh b/demos/filter_chains/mcp_filter/run_demo.sh index bed84f16..f203f5b1 100755 --- a/demos/filter_chains/mcp_filter/run_demo.sh +++ b/demos/filter_chains/mcp_filter/run_demo.sh @@ -18,22 +18,32 @@ start_demo() { echo ".env file created with OPENAI_API_KEY." fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (AnythingLLM, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM, Jaeger)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - # Step 4: Start services - echo "Starting services using Docker Compose..." - docker compose up -d + # Step 5: Start agents natively + echo "Starting agents..." + bash start_agents.sh & } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping Docker Compose services..." - docker compose down + # Stop agents + echo "Stopping agents..." + pkill -f start_agents.sh 2>/dev/null || true - # Step 2: Stop Plano + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true + + # Stop Plano echo "Stopping Plano..." 
planoai down } @@ -42,5 +52,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - start_demo + start_demo "$1" fi diff --git a/demos/getting_started/llm_gateway/README.md b/demos/getting_started/llm_gateway/README.md index e87467fc..b29397b6 100644 --- a/demos/getting_started/llm_gateway/README.md +++ b/demos/getting_started/llm_gateway/README.md @@ -7,7 +7,14 @@ This demo shows how you can use Plano gateway to manage keys and route to upstre ```sh sh run_demo.sh ``` -1. Navigate to http://localhost:3001/ +1. Test with curl (see example below) + +To also start the AnythingLLM chat UI and Jaeger tracing, pass `--with-ui`: +```sh +sh run_demo.sh --with-ui +``` + +Then navigate to http://localhost:3001/ for AnythingLLM. Following screen shows an example of interaction with Plano gateway showing dynamic routing. You can select between different LLMs using "override model" option in the chat UI. @@ -47,7 +54,7 @@ $ curl --header 'Content-Type: application/json' \ ``` # Observability -For tracing you can head over to http://localhost:16686/ to view recent traces. +For tracing, start with `--with-ui` and head over to http://localhost:16686/ to view recent traces. Following is a screenshot of tracing UI showing call received by Plano gateway and making upstream call to LLM, diff --git a/demos/getting_started/llm_gateway/run_demo.sh b/demos/getting_started/llm_gateway/run_demo.sh index b049bf31..e430a1cd 100644 --- a/demos/getting_started/llm_gateway/run_demo.sh +++ b/demos/getting_started/llm_gateway/run_demo.sh @@ -18,22 +18,24 @@ start_demo() { echo ".env file created with OPENAI_API_KEY." fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (AnythingLLM, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM, Jaeger)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." 
planoai up config.yaml - - # Step 4: Start LLM Routing - echo "Starting LLM Routing using Docker Compose..." - docker compose up -d # Run in detached mode } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping LLM Routing using Docker Compose..." - docker compose down + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true - # Step 2: Stop Plano + # Stop Plano echo "Stopping Plano..." planoai down } @@ -42,6 +44,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - # Default action is to bring the demo up - start_demo + start_demo "$1" fi diff --git a/demos/getting_started/weather_forecast/README.md b/demos/getting_started/weather_forecast/README.md index 8a9eb6c0..91fa810f 100644 --- a/demos/getting_started/weather_forecast/README.md +++ b/demos/getting_started/weather_forecast/README.md @@ -10,15 +10,26 @@ This demo shows how you can use Plano's core function calling capabilities. 3. ```sh sh run_demo.sh ``` -4. Navigate to http://localhost:3001/ -5. You can type in queries like "how is the weather?" +4. Test with curl: + ```sh + curl http://localhost:10000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "gpt-4o", "messages": [{"role": "user", "content": "how is the weather in San Francisco?"}]}' + ``` Here is a sample interaction, image -## Tracing +## Using the Chat UI and Tracing (optional) -To see a tracing dashboard, navigate to http://localhost:16686/ to open Jaeger UI. 
+To start AnythingLLM (chat UI) and other optional services, pass `--with-ui`: + +```sh +sh run_demo.sh --with-ui +``` + +- Navigate to http://localhost:3001/ for AnythingLLM +- Navigate to http://localhost:16686/ for Jaeger tracing UI ### Stopping Demo diff --git a/demos/getting_started/weather_forecast/docker-compose.yaml b/demos/getting_started/weather_forecast/docker-compose.yaml index 84074ab9..f36987e4 100644 --- a/demos/getting_started/weather_forecast/docker-compose.yaml +++ b/demos/getting_started/weather_forecast/docker-compose.yaml @@ -1,14 +1,4 @@ services: - weather_forecast_service: - build: - context: ./ - environment: - - OLTP_HOST=http://jaeger:4317 - extra_hosts: - - "host.docker.internal:host-gateway" - ports: - - "18083:80" - anythingllm: image: mintplexlabs/anythingllm restart: always diff --git a/demos/getting_started/weather_forecast/run_demo.sh b/demos/getting_started/weather_forecast/run_demo.sh index c8eb96e5..c77f2d83 100644 --- a/demos/getting_started/weather_forecast/run_demo.sh +++ b/demos/getting_started/weather_forecast/run_demo.sh @@ -72,23 +72,32 @@ start_demo() { exit 1 fi - # Step 4: Start Plano + # Step 4: Optionally start UI services (AnythingLLM, Jaeger, etc.) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ] || [ "$2" == "--with-ui" ]; then + echo "Starting UI services with $COMPOSE_FILE..." + docker compose -f "$COMPOSE_FILE" up -d + fi + + # Step 5: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - # Step 5: Start Network Agent with the chosen Docker Compose file - echo "Starting Network Agent with $COMPOSE_FILE..." - docker compose -f "$COMPOSE_FILE" up -d # Run in detached mode + # Step 6: Start agents natively + echo "Starting agents..." + bash start_agents.sh & } # Function to stop the demo stop_demo() { - echo "Stopping all Docker Compose services..." + # Stop agents + echo "Stopping agents..." 
+ pkill -f start_agents.sh 2>/dev/null || true - # Stop all services by iterating through all configurations + # Stop all Docker Compose services if running + echo "Stopping Docker Compose services..." for compose_file in ./docker-compose*.yaml; do - echo "Stopping services in $compose_file..." - docker compose -f "$compose_file" down + docker compose -f "$compose_file" down 2>/dev/null || true done # Stop Plano @@ -101,6 +110,6 @@ if [ "$1" == "down" ]; then # Call stop_demo with the second argument as the demo to stop stop_demo else - # Use the argument (jaeger, logfire, signoz) to determine the compose file - start_demo "$1" + # Use the argument (jaeger, logfire, signoz, --with-ui) to determine the compose file + start_demo "$1" "$2" fi diff --git a/demos/getting_started/weather_forecast/start_agents.sh b/demos/getting_started/weather_forecast/start_agents.sh new file mode 100755 index 00000000..548f2bf7 --- /dev/null +++ b/demos/getting_started/weather_forecast/start_agents.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -e + +PIDS=() + +log() { echo "$(date '+%F %T') - $*"; } + +cleanup() { + log "Stopping agents..." + for PID in "${PIDS[@]}"; do + kill $PID 2>/dev/null && log "Stopped process $PID" + done + exit 0 +} + +trap cleanup EXIT INT TERM + +log "Starting weather_forecast_service on port 18083..." +uv run uvicorn main:app --host 0.0.0.0 --port 18083 & +PIDS+=($!) + +for PID in "${PIDS[@]}"; do + wait "$PID" +done diff --git a/demos/integrations/ollama/run_demo.sh b/demos/integrations/ollama/run_demo.sh index 6623dee5..5bbf183b 100644 --- a/demos/integrations/ollama/run_demo.sh +++ b/demos/integrations/ollama/run_demo.sh @@ -7,33 +7,58 @@ start_demo() { if [ -f ".env" ]; then echo ".env file already exists. Skipping creation." else - # Step 2: Create `.env` file and set OpenAI key + # Step 2: Create `.env` file and set API keys if [ -z "$OPENAI_API_KEY" ]; then echo "Error: OPENAI_API_KEY environment variable is not set for the demo." 
exit 1 fi + if [ -z "$ANTHROPIC_API_KEY" ]; then + echo "Warning: ANTHROPIC_API_KEY environment variable is not set. Anthropic features may not work." + fi echo "Creating .env file..." echo "OPENAI_API_KEY=$OPENAI_API_KEY" > .env - echo ".env file created with OPENAI_API_KEY." + if [ -n "$ANTHROPIC_API_KEY" ]; then + echo "ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY" >> .env + fi + echo ".env file created with API keys." fi - # Step 3: Start Plano - echo "Starting Plano with config.yaml..." - planoai up config.yaml + # Step 3: Optionally start UI services (AnythingLLM, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM, Jaeger)..." + docker compose up -d + fi - # Step 4: Start developer services - echo "Starting Network Agent using Docker Compose..." - docker compose up -d # Run in detached mode + # Step 4: Start Plano + echo "Starting Plano with arch_config_with_aliases.yaml..." + planoai up arch_config_with_aliases.yaml + + echo "" + echo "Plano started successfully." + echo "Please run the following CURL command to test model alias routing. Additional instructions are in the README.md file." + echo "" + echo "curl -sS -X POST \"http://localhost:12000/v1/chat/completions\" \ + -H \"Authorization: Bearer test-key\" \ + -H \"Content-Type: application/json\" \ + -d '{ + \"model\": \"arch.summarize.v1\", + \"max_tokens\": 50, + \"messages\": [ + { \"role\": \"user\", + \"content\": \"Hello, please respond with exactly: Hello from alias arch.summarize.v1!\" + } + ] + }' | jq ." } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping Network Agent using Docker Compose..." - docker compose down + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true - # Step 2: Stop Plano + # Stop Plano echo "Stopping Plano..." 
planoai down } @@ -42,6 +67,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - # Default action is to bring the demo up - start_demo + start_demo "$1" fi diff --git a/demos/integrations/spotify_bearer_auth/run_demo.sh b/demos/integrations/spotify_bearer_auth/run_demo.sh index 6623dee5..e430a1cd 100644 --- a/demos/integrations/spotify_bearer_auth/run_demo.sh +++ b/demos/integrations/spotify_bearer_auth/run_demo.sh @@ -18,22 +18,24 @@ start_demo() { echo ".env file created with OPENAI_API_KEY." fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (AnythingLLM, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM, Jaeger)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - - # Step 4: Start developer services - echo "Starting Network Agent using Docker Compose..." - docker compose up -d # Run in detached mode } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping Network Agent using Docker Compose..." - docker compose down + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true - # Step 2: Stop Plano + # Stop Plano echo "Stopping Plano..." planoai down } @@ -42,6 +44,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - # Default action is to bring the demo up - start_demo + start_demo "$1" fi diff --git a/demos/llm_routing/preference_based_routing/README.md b/demos/llm_routing/preference_based_routing/README.md index 03d28cee..9d71971c 100644 --- a/demos/llm_routing/preference_based_routing/README.md +++ b/demos/llm_routing/preference_based_routing/README.md @@ -10,19 +10,27 @@ cd demos/llm_routing/preference_based_routing ./run_demo.sh ``` -Or manually: +To also start AnythingLLM (chat UI) and Jaeger (tracing): -1. 
Start Plano ```bash -planoai up config.yaml +./run_demo.sh --with-ui ``` -2. Start AnythingLLM +Then open AnythingLLM at http://localhost:3001/ + +Or start manually: + +1. (Optional) Start AnythingLLM and Jaeger ```bash docker compose up -d ``` -3. open AnythingLLM http://localhost:3001/ +2. Start Plano +```bash +planoai up config.yaml +``` + +3. Test with curl or open AnythingLLM http://localhost:3001/ # Testing out preference based routing diff --git a/demos/llm_routing/preference_based_routing/run_demo.sh b/demos/llm_routing/preference_based_routing/run_demo.sh index c9525c26..30e0c67b 100755 --- a/demos/llm_routing/preference_based_routing/run_demo.sh +++ b/demos/llm_routing/preference_based_routing/run_demo.sh @@ -24,22 +24,24 @@ start_demo() { echo ".env file created with API keys." fi - # Step 3: Start Plano + # Step 3: Optionally start UI services (AnythingLLM, Jaeger) + # Jaeger must start before Plano so it can bind the OTEL port (4317) + if [ "$1" == "--with-ui" ]; then + echo "Starting UI services (AnythingLLM, Jaeger)..." + docker compose up -d + fi + + # Step 4: Start Plano echo "Starting Plano with config.yaml..." planoai up config.yaml - - # Step 4: Start services - echo "Starting services using Docker Compose..." - docker compose up -d } # Function to stop the demo stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping Docker Compose services..." - docker compose down + # Stop Docker Compose services if running + docker compose down 2>/dev/null || true - # Step 2: Stop Plano + # Stop Plano echo "Stopping Plano..." 
planoai down } @@ -48,5 +50,5 @@ stop_demo() { if [ "$1" == "down" ]; then stop_demo else - start_demo + start_demo "$1" fi diff --git a/tests/e2e/run_e2e_tests.sh b/tests/e2e/run_e2e_tests.sh index c24931f4..a164b7f9 100644 --- a/tests/e2e/run_e2e_tests.sh +++ b/tests/e2e/run_e2e_tests.sh @@ -21,10 +21,11 @@ trap 'print_debug' INT TERM ERR log starting > ../build.log -log building and running function_calling demo +log starting weather_forecast agent natively log =========================================== cd ../../demos/getting_started/weather_forecast/ -docker compose up weather_forecast_service --build -d +bash start_agents.sh & +AGENTS_PID=$! cd - log building and installing plano cli @@ -78,8 +79,6 @@ log running e2e tests for openai responses api client log ======================================== uv run pytest test_openai_responses_api_client_with_state.py -log shutting down the weather_forecast demo +log shutting down the weather_forecast agent log ======================================= -cd ../../demos/getting_started/weather_forecast -docker compose down -cd - +kill $AGENTS_PID 2>/dev/null || true diff --git a/tests/e2e/run_prompt_gateway_tests.sh b/tests/e2e/run_prompt_gateway_tests.sh index 58d850d8..1e947813 100755 --- a/tests/e2e/run_prompt_gateway_tests.sh +++ b/tests/e2e/run_prompt_gateway_tests.sh @@ -32,10 +32,11 @@ cd - # Re-sync e2e deps uv sync -# Start weather_forecast service (needed for prompt_gateway tests) -log "building and running weather_forecast service" +# Start weather_forecast service natively (needed for prompt_gateway tests) +log "starting weather_forecast agent natively" cd ../../demos/getting_started/weather_forecast/ -docker compose up weather_forecast_service --build -d +bash start_agents.sh & +AGENTS_PID=$! 
cd - # Start gateway with prompt_gateway config @@ -52,6 +53,4 @@ uv run pytest test_prompt_gateway.py # Cleanup log "shutting down" planoai down --docker || true -cd ../../demos/getting_started/weather_forecast -docker compose down -cd - +kill $AGENTS_PID 2>/dev/null || true From 5400b0a2fa476dfc629bf0dfab95e826a640a67b Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Wed, 11 Mar 2026 15:28:50 -0700 Subject: [PATCH 6/8] add instructions on hosting arch-router locally (#819) --- .../preference_based_routing/README.md | 31 +++++ docs/source/guides/llm_router.rst | 123 ++++++++++++++++++ 2 files changed, 154 insertions(+) diff --git a/demos/llm_routing/preference_based_routing/README.md b/demos/llm_routing/preference_based_routing/README.md index 9d71971c..009002fd 100644 --- a/demos/llm_routing/preference_based_routing/README.md +++ b/demos/llm_routing/preference_based_routing/README.md @@ -32,6 +32,37 @@ planoai up config.yaml 3. Test with curl or open AnythingLLM http://localhost:3001/ +## Running with local Arch-Router (via Ollama) + +By default, Plano uses a hosted Arch-Router endpoint. To self-host Arch-Router locally using Ollama: + +1. Install [Ollama](https://ollama.ai) and pull the model: +```bash +ollama pull hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M +``` + +2. Make sure Ollama is running (`ollama serve` or the macOS app). + +3. Start Plano with the local config: +```bash +planoai up plano_config_local.yaml +``` + +4. Test routing: +```bash +curl -s "http://localhost:12000/routing/v1/messages" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "Create a REST API endpoint in Rust using actix-web"} + ] + }' +``` + +You should see the router select the appropriate model based on the routing preferences defined in `plano_config_local.yaml`. + # Testing out preference based routing We have defined two routes 1. code generation and 2. 
code understanding diff --git a/docs/source/guides/llm_router.rst b/docs/source/guides/llm_router.rst index 188b1e30..41c51b4a 100644 --- a/docs/source/guides/llm_router.rst +++ b/docs/source/guides/llm_router.rst @@ -228,6 +228,129 @@ In summary, Arch-Router demonstrates: - **Production-Ready Performance**: Optimized for low-latency, high-throughput applications in multi-model environments. +Self-hosting Arch-Router +------------------------ + +By default, Plano uses a hosted Arch-Router endpoint. To run Arch-Router locally, you can serve the model yourself using either **Ollama** or **vLLM**. + +Using Ollama (recommended for local development) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +1. **Install Ollama** + + Download and install from `ollama.ai `_. + +2. **Pull and serve Arch-Router** + + .. code-block:: bash + + ollama pull hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M + ollama serve + + This downloads the quantized GGUF model from HuggingFace and starts serving on ``http://localhost:11434``. + +3. **Configure Plano to use local Arch-Router** + + .. code-block:: yaml + + routing: + model: Arch-Router + llm_provider: arch-router + + model_providers: + - name: arch-router + model: arch/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M + base_url: http://localhost:11434 + + - model: openai/gpt-5.2 + access_key: $OPENAI_API_KEY + default: true + + - model: anthropic/claude-sonnet-4-5 + access_key: $ANTHROPIC_API_KEY + routing_preferences: + - name: creative writing + description: creative content generation, storytelling, and writing assistance + +4. **Verify the model is running** + + .. code-block:: bash + + curl http://localhost:11434/v1/models + + You should see ``Arch-Router-1.5B`` listed in the response. + +Using vLLM (recommended for production / EC2) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +vLLM provides higher throughput and GPU optimizations suitable for production deployments. + +1. **Install vLLM** + + .. 
code-block:: bash + + pip install vllm + +2. **Download the model weights** + + The GGUF weights are downloaded automatically from HuggingFace on first use. To pre-download: + + .. code-block:: bash + + pip install huggingface_hub + huggingface-cli download katanemo/Arch-Router-1.5B.gguf + +3. **Start the vLLM server** + + After downloading, find the GGUF file and Jinja template in the HuggingFace cache: + + .. code-block:: bash + + # Find the downloaded files + SNAPSHOT_DIR=$(ls -d ~/.cache/huggingface/hub/models--katanemo--Arch-Router-1.5B.gguf/snapshots/*/ | head -1) + + vllm serve ${SNAPSHOT_DIR}Arch-Router-1.5B-Q4_K_M.gguf \ + --host 0.0.0.0 \ + --port 10000 \ + --load-format gguf \ + --chat-template ${SNAPSHOT_DIR}template.jinja \ + --tokenizer katanemo/Arch-Router-1.5B \ + --served-model-name Arch-Router \ + --gpu-memory-utilization 0.3 \ + --tensor-parallel-size 1 \ + --enable-prefix-caching + +4. **Configure Plano to use the vLLM endpoint** + + .. code-block:: yaml + + routing: + model: Arch-Router + llm_provider: arch-router + + model_providers: + - name: arch-router + model: Arch-Router + base_url: http://:10000 + + - model: openai/gpt-5.2 + access_key: $OPENAI_API_KEY + default: true + + - model: anthropic/claude-sonnet-4-5 + access_key: $ANTHROPIC_API_KEY + routing_preferences: + - name: creative writing + description: creative content generation, storytelling, and writing assistance + +5. **Verify the server is running** + + .. 
code-block:: bash + + curl http://localhost:10000/health + curl http://localhost:10000/v1/models + + Combining Routing Methods ------------------------- From 2f52774c0ecf6de6c9df657418020de468aef1a1 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Fri, 13 Mar 2026 00:18:41 -0700 Subject: [PATCH 7/8] Add Claude Code skills and streamline CLAUDE.md (#823) * add claude code skills and streamline CLAUDE.md * remove claude code attribution from PR skill * update pr skill --- .claude/skills/build-wasm/SKILL.md | 12 +++ .claude/skills/check/SKILL.md | 12 +++ .claude/skills/new-provider/SKILL.md | 17 +++ .claude/skills/pr/SKILL.md | 16 +++ .claude/skills/release/SKILL.md | 28 +++++ .claude/skills/test-python/SKILL.md | 9 ++ CLAUDE.md | 152 ++++++++++----------------- 7 files changed, 147 insertions(+), 99 deletions(-) create mode 100644 .claude/skills/build-wasm/SKILL.md create mode 100644 .claude/skills/check/SKILL.md create mode 100644 .claude/skills/new-provider/SKILL.md create mode 100644 .claude/skills/pr/SKILL.md create mode 100644 .claude/skills/release/SKILL.md create mode 100644 .claude/skills/test-python/SKILL.md diff --git a/.claude/skills/build-wasm/SKILL.md b/.claude/skills/build-wasm/SKILL.md new file mode 100644 index 00000000..dffff783 --- /dev/null +++ b/.claude/skills/build-wasm/SKILL.md @@ -0,0 +1,12 @@ +--- +name: build-wasm +description: Build the WASM plugins for Envoy. Use when WASM plugin code changes. +--- + +Build the WASM plugins: + +``` +cd crates && cargo build --release --target=wasm32-wasip1 -p llm_gateway -p prompt_gateway +``` + +If the build fails, diagnose and fix the errors. diff --git a/.claude/skills/check/SKILL.md b/.claude/skills/check/SKILL.md new file mode 100644 index 00000000..4d2427e2 --- /dev/null +++ b/.claude/skills/check/SKILL.md @@ -0,0 +1,12 @@ +--- +name: check +description: Run Rust fmt, clippy, and unit tests. Use after making Rust code changes. +--- + +Run all local checks in order: + +1. 
`cd crates && cargo fmt --all -- --check` — if formatting fails, run `cargo fmt --all` to fix it +2. `cd crates && cargo clippy --locked --all-targets --all-features -- -D warnings` — fix any warnings +3. `cd crates && cargo test --lib` — ensure all unit tests pass + +Report a summary of what passed/failed. diff --git a/.claude/skills/new-provider/SKILL.md b/.claude/skills/new-provider/SKILL.md new file mode 100644 index 00000000..74ba0f6d --- /dev/null +++ b/.claude/skills/new-provider/SKILL.md @@ -0,0 +1,17 @@ +--- +name: new-provider +description: Add a new LLM provider to hermesllm. Use when integrating a new AI provider. +disable-model-invocation: true +user-invocable: true +--- + +Add a new LLM provider to hermesllm. The user will provide the provider name as $ARGUMENTS. + +1. Add a new variant to `ProviderId` enum in `crates/hermesllm/src/providers/id.rs` +2. Implement string parsing in the `TryFrom<&str>` impl for the new provider +3. If the provider uses a non-OpenAI API format, create request/response types in `crates/hermesllm/src/apis/` +4. Add variant to `ProviderRequestType` and `ProviderResponseType` enums and update all match arms +5. Add model list to `crates/hermesllm/src/providers/provider_models.yaml` +6. Update `SupportedUpstreamAPIs` mapping if needed + +After making changes, run `cd crates && cargo test --lib` to verify everything compiles and tests pass. diff --git a/.claude/skills/pr/SKILL.md b/.claude/skills/pr/SKILL.md new file mode 100644 index 00000000..43e4b46f --- /dev/null +++ b/.claude/skills/pr/SKILL.md @@ -0,0 +1,16 @@ +--- +name: pr +description: Create a feature branch and open a pull request for the current changes. +disable-model-invocation: true +user-invocable: true +--- + +Create a pull request for the current changes: + +1. Determine the GitHub username via `gh api user --jq .login`. If the login is `adilhafeez`, use `adil` instead. +2. Create a feature branch using format `/` — infer the feature name from the changes +3. 
Run `cd crates && cargo fmt --all -- --check` and `cd crates && cargo clippy --locked --all-targets --all-features -- -D warnings` to verify Rust code is clean +4. Commit all changes with a short, concise commit message (one line, no Co-Authored-By) +5. Push the branch and create a PR targeting `main` + +Keep the PR title short (under 70 chars). Include a brief summary in the body. Never include a "Test plan" section or any "Generated with Claude Code" attribution. diff --git a/.claude/skills/release/SKILL.md b/.claude/skills/release/SKILL.md new file mode 100644 index 00000000..80510004 --- /dev/null +++ b/.claude/skills/release/SKILL.md @@ -0,0 +1,28 @@ +--- +name: release +description: Bump the Plano version across all required files. Use when preparing a release. +disable-model-invocation: true +user-invocable: true +--- + +Prepare a release version bump. The user may provide the new version number as $ARGUMENTS (e.g., `/release 0.4.12`), or a bump type (`major`, `minor`, `patch`). + +If no argument is provided, read the current version from `cli/planoai/__init__.py`, auto-increment the patch version (e.g., `0.4.11` → `0.4.12`), and confirm with the user before proceeding. + +Update the version string in ALL of these files: + +- `.github/workflows/ci.yml` +- `cli/planoai/__init__.py` +- `cli/planoai/consts.py` +- `cli/pyproject.toml` +- `build_filter_image.sh` +- `config/validate_plano_config.sh` +- `docs/source/conf.py` +- `docs/source/get_started/quickstart.rst` +- `docs/source/resources/deployment.rst` +- `apps/www/src/components/Hero.tsx` +- `demos/llm_routing/preference_based_routing/README.md` + +Do NOT change version strings in `*.lock` files or `Cargo.lock`. + +After making changes, show a summary of all files modified and the old → new version. 
diff --git a/.claude/skills/test-python/SKILL.md b/.claude/skills/test-python/SKILL.md new file mode 100644 index 00000000..2aa40ded --- /dev/null +++ b/.claude/skills/test-python/SKILL.md @@ -0,0 +1,9 @@ +--- +name: test-python +description: Run Python CLI tests. Use after making changes to cli/ code. +--- + +1. `cd cli && uv sync` — ensure dependencies are installed +2. `cd cli && uv run pytest -v` — run all tests + +If tests fail, diagnose and fix the issues. diff --git a/CLAUDE.md b/CLAUDE.md index 71c94303..58b2191f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,152 +1,106 @@ # CLAUDE.md -This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. - -## Project Overview - Plano is an AI-native proxy server and data plane for agentic applications, built on Envoy proxy. It centralizes agent orchestration, LLM routing, observability, and safety guardrails as an out-of-process dataplane. ## Build & Test Commands -### Rust (crates/) - ```bash -# Build WASM plugins (must target wasm32-wasip1) +# Rust — WASM plugins (must target wasm32-wasip1) cd crates && cargo build --release --target=wasm32-wasip1 -p llm_gateway -p prompt_gateway -# Build brightstaff binary (native target) +# Rust — brightstaff binary (native target) cd crates && cargo build --release -p brightstaff -# Run unit tests +# Rust — tests, format, lint cd crates && cargo test --lib - -# Format check cd crates && cargo fmt --all -- --check - -# Lint cd crates && cargo clippy --locked --all-targets --all-features -- -D warnings -``` -### Python CLI (cli/) +# Python CLI +cd cli && uv sync && uv run pytest -v -```bash -cd cli && uv sync # Install dependencies -cd cli && uv run pytest -v # Run tests -cd cli && uv run planoai --help # Run CLI -``` +# JS/TS (Turbo monorepo) +npm run build && npm run lint && npm run typecheck -### JavaScript/TypeScript (apps/, packages/) - -```bash -npm run build # Build all (via Turbo) -npm run lint # Lint all -npm run dev # Dev servers 
-npm run typecheck # Type check -``` - -### Pre-commit (runs fmt, clippy, cargo test, black, yaml checks) - -```bash +# Pre-commit (fmt, clippy, cargo test, black, yaml) pre-commit run --all-files -``` -### Docker - -```bash +# Docker docker build -t katanemo/plano:latest . ``` -### E2E Tests (tests/e2e/) - -E2E tests require a built Docker image and API keys. They run via `tests/e2e/run_e2e_tests.sh` which executes four test suites: `test_prompt_gateway.py`, `test_model_alias_routing.py`, `test_openai_responses_api_client.py`, and `test_openai_responses_api_client_with_state.py`. +E2E tests require a Docker image and API keys: `tests/e2e/run_e2e_tests.sh` ## Architecture -### Core Data Flow - -Requests flow through Envoy proxy with two WASM filter plugins, backed by a native Rust binary: - ``` Client → Envoy (prompt_gateway.wasm → llm_gateway.wasm) → Agents/LLM Providers ↕ brightstaff (native binary: state, routing, signals, tracing) ``` -### Rust Crates (crates/) +### Crates (crates/) -All crates share a Cargo workspace. Two compile to `wasm32-wasip1` for Envoy, the rest are native: - -- **prompt_gateway** (WASM) — Proxy-WASM filter for prompt/message processing, guardrails, and filter chains +- **prompt_gateway** (WASM) — Proxy-WASM filter for prompt processing, guardrails, filter chains - **llm_gateway** (WASM) — Proxy-WASM filter for LLM request/response handling and routing -- **brightstaff** (native binary) — Core application server: handlers, router, signals, state management, tracing -- **common** (library) — Shared across all crates: configuration, LLM provider abstractions, HTTP utilities, routing logic, rate limiting, tokenizer, PII detection, tracing -- **hermesllm** (library) — Translates LLM API formats between providers (OpenAI, Anthropic, Gemini, Mistral, Grok, AWS Bedrock, Azure, together.ai). 
Key types: `ProviderId`, `ProviderRequest`, `ProviderResponse`, `ProviderStreamResponse` +- **brightstaff** (native) — Core server: handlers, router, signals, state, tracing +- **common** (lib) — Shared: config, HTTP, routing, rate limiting, tokenizer, PII, tracing +- **hermesllm** (lib) — LLM API translation between providers. Key types: `ProviderId`, `ProviderRequest`, `ProviderResponse`, `ProviderStreamResponse` ### Python CLI (cli/planoai/) -The `planoai` CLI manages the Plano lifecycle. Key commands: -- `planoai up ` — Validate config, check API keys, start Docker container -- `planoai down` — Stop container -- `planoai build` — Build Docker image from repo root -- `planoai logs` — Stream access/debug logs -- `planoai trace` — OTEL trace collection and analysis -- `planoai init` — Initialize new project -- `planoai cli_agent` — Start a CLI agent connected to Plano -- `planoai generate_prompt_targets` — Generate prompt_targets from python methods +Entry point: `main.py`. Built with `rich-click`. Commands: `up`, `down`, `build`, `logs`, `trace`, `init`, `cli_agent`, `generate_prompt_targets`. -Entry point: `cli/planoai/main.py`. Container lifecycle in `core.py`. Docker operations in `docker_cli.py`. +### Config (config/) -### Configuration System (config/) +- `plano_config_schema.yaml` — JSON Schema for validating user configs +- `envoy.template.yaml` — Jinja2 template → Envoy config +- `supervisord.conf` — Process supervisor for Envoy + brightstaff -- `plano_config_schema.yaml` — JSON Schema (draft-07) for validating user config files -- `envoy.template.yaml` — Jinja2 template rendered into Envoy proxy config -- `supervisord.conf` — Process supervisor for Envoy + brightstaff in the container +### JS Apps (apps/, packages/) -User configs define: `agents` (id + url), `model_providers` (model + access_key), `listeners` (type: agent/model/prompt, with router strategy), `filters` (filter chains), and `tracing` settings. +Turbo monorepo with Next.js 16 / React 19. 
Not part of the core proxy. -### JavaScript Apps (apps/, packages/) +## WASM Plugin Rules -Turbo monorepo with Next.js 16 / React 19 applications and shared packages (UI components, Tailwind config, TypeScript config). Not part of the core proxy — these are web applications. +Code in `prompt_gateway` and `llm_gateway` runs in Envoy's WASM sandbox: + +- **No std networking/filesystem** — use proxy-wasm host calls only +- **No tokio/async** — synchronous, callback-driven. `Action::Pause` / `Action::Continue` for flow control +- **Lifecycle**: `RootContext` → `on_configure`, `create_http_context`; `HttpContext` → `on_http_request/response_headers/body` +- **HTTP callouts**: `dispatch_http_call()` → store context in `callouts: RefCell>` → match in `on_http_call_response()` +- **Config**: `Rc`-wrapped, loaded once in `on_configure()` via `serde_yaml::from_slice()` +- **Dependencies must be no_std compatible** (e.g., `governor` with `features = ["no_std"]`) +- **Crate type**: `cdylib` → produces `.wasm` + +## Adding a New LLM Provider + +1. Add variant to `ProviderId` in `crates/hermesllm/src/providers/id.rs` + `TryFrom<&str>` +2. Create request/response types in `crates/hermesllm/src/apis/` if non-OpenAI format +3. Add variant to `ProviderRequestType`/`ProviderResponseType` enums, update all match arms +4. Add models to `crates/hermesllm/src/providers/provider_models.yaml` +5. 
Update `SupportedUpstreamAPIs` mapping if needed ## Release Process -To prepare a release (e.g., bumping from `0.4.6` to `0.4.7`), update the version string in all of the following files: +Update version (e.g., `0.4.11` → `0.4.12`) in all of these files: -**CI Workflow:** -- `.github/workflows/ci.yml` — docker build/save tags +- `.github/workflows/ci.yml`, `build_filter_image.sh`, `config/validate_plano_config.sh` +- `cli/planoai/__init__.py`, `cli/planoai/consts.py`, `cli/pyproject.toml` +- `docs/source/conf.py`, `docs/source/get_started/quickstart.rst`, `docs/source/resources/deployment.rst` +- `apps/www/src/components/Hero.tsx`, `demos/llm_routing/preference_based_routing/README.md` -**CLI:** -- `cli/planoai/__init__.py` — `__version__` -- `cli/planoai/consts.py` — `PLANO_DOCKER_IMAGE` default -- `cli/pyproject.toml` — `version` - -**Build & Config:** -- `build_filter_image.sh` — docker build tag -- `config/validate_plano_config.sh` — docker image tag - -**Docs:** -- `docs/source/conf.py` — `release` -- `docs/source/get_started/quickstart.rst` — install commands and example output -- `docs/source/resources/deployment.rst` — docker image tag - -**Website & Demos:** -- `apps/www/src/components/Hero.tsx` — version badge -- `demos/llm_routing/preference_based_routing/README.md` — example output - -**Important:** Do NOT change `0.4.6` references in `*.lock` files or `Cargo.lock` — those refer to the `colorama` and `http-body` dependency versions, not Plano. - -Commit message format: `release X.Y.Z` +Do NOT change version strings in `*.lock` files or `Cargo.lock`. Commit message: `release X.Y.Z` ## Workflow Preferences -- **Git commits:** Do NOT add `Co-Authored-By` lines. Keep commit messages short and concise (one line, no verbose descriptions). NEVER commit and push directly to `main`—always use a feature branch and PR. -- **Git branches:** Use the format `/` when creating branches for PRs. Determine the username from `gh api user --jq .login`. 
-- **GitHub issues:** When a GitHub issue URL is pasted, fetch all requirements and context from the issue first. The end goal is always a PR with all tests passing. +- **Commits:** No `Co-Authored-By`. Short one-line messages. Never push directly to `main` — always feature branch + PR. +- **Branches:** Use `adil/` format. +- **Issues:** When a GitHub issue URL is pasted, fetch all context first. Goal is always a PR with passing tests. ## Key Conventions -- Rust edition 2021, formatted with `cargo fmt`, linted with `cargo clippy -D warnings` -- Python formatted with Black -- WASM plugins must target `wasm32-wasip1` — they run inside Envoy, not as native binaries -- The Docker image bundles Envoy + WASM plugins + brightstaff + Python CLI into a single container managed by supervisord -- API keys come from environment variables or `.env` files, never hardcoded +- Rust edition 2021, `cargo fmt`, `cargo clippy -D warnings` +- Python: Black. Rust errors: `thiserror` with `#[from]` +- API keys from env vars or `.env`, never hardcoded +- Provider dispatch: `ProviderRequestType`/`ProviderResponseType` enums implementing `ProviderRequest`/`ProviderResponse` traits From 785bf7e021297665b95d51933412b874e1e6a2f5 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Fri, 13 Mar 2026 00:28:35 -0700 Subject: [PATCH 8/8] add build-cli and build-brightstaff skills (#824) --- .claude/skills/build-brightstaff/SKILL.md | 12 ++++++++++++ .claude/skills/build-cli/SKILL.md | 10 ++++++++++ 2 files changed, 22 insertions(+) create mode 100644 .claude/skills/build-brightstaff/SKILL.md create mode 100644 .claude/skills/build-cli/SKILL.md diff --git a/.claude/skills/build-brightstaff/SKILL.md b/.claude/skills/build-brightstaff/SKILL.md new file mode 100644 index 00000000..6fc97b19 --- /dev/null +++ b/.claude/skills/build-brightstaff/SKILL.md @@ -0,0 +1,12 @@ +--- +name: build-brightstaff +description: Build the brightstaff native binary. Use when brightstaff code changes. 
+--- + +Build brightstaff: + +``` +cd crates && cargo build --release -p brightstaff +``` + +If the build fails, diagnose and fix the errors. diff --git a/.claude/skills/build-cli/SKILL.md b/.claude/skills/build-cli/SKILL.md new file mode 100644 index 00000000..0e2aec7f --- /dev/null +++ b/.claude/skills/build-cli/SKILL.md @@ -0,0 +1,10 @@ +--- +name: build-cli +description: Build and install the Python CLI (planoai). Use after making changes to cli/ code to install locally. +--- + +1. `cd cli && uv sync` — ensure dependencies are installed +2. `cd cli && uv tool install --editable .` — install the CLI locally +3. Verify the installation: `cd cli && uv run planoai --help` + +If the build or install fails, diagnose and fix the issues.