From e015637f222fff415c0ccdd05db3efd52a06a5ca Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Mon, 9 Mar 2026 16:05:50 -0700 Subject: [PATCH] add routing service fixes https://github.com/katanemo/plano/issues/810 --- crates/brightstaff/src/handlers/mod.rs | 1 + .../brightstaff/src/handlers/router_chat.rs | 9 +- .../src/handlers/routing_service.rs | 163 ++++++++++++++++++ crates/brightstaff/src/main.rs | 23 ++- .../model_routing_service/README.md | 92 ++++++++++ .../model_routing_service/config.yaml | 27 +++ .../llm_routing/model_routing_service/demo.sh | 65 +++++++ 7 files changed, 375 insertions(+), 5 deletions(-) create mode 100644 crates/brightstaff/src/handlers/routing_service.rs create mode 100644 demos/llm_routing/model_routing_service/README.md create mode 100644 demos/llm_routing/model_routing_service/config.yaml create mode 100755 demos/llm_routing/model_routing_service/demo.sh diff --git a/crates/brightstaff/src/handlers/mod.rs b/crates/brightstaff/src/handlers/mod.rs index 0bbd3454..9c602e93 100644 --- a/crates/brightstaff/src/handlers/mod.rs +++ b/crates/brightstaff/src/handlers/mod.rs @@ -7,6 +7,7 @@ pub mod models; pub mod pipeline_processor; pub mod response_handler; pub mod router_chat; +pub mod routing_service; pub mod utils; #[cfg(test)] diff --git a/crates/brightstaff/src/handlers/router_chat.rs b/crates/brightstaff/src/handlers/router_chat.rs index d71734fa..345632fc 100644 --- a/crates/brightstaff/src/handlers/router_chat.rs +++ b/crates/brightstaff/src/handlers/router_chat.rs @@ -10,6 +10,7 @@ use crate::tracing::routing; pub struct RoutingResult { pub model_name: String, + pub route_name: Option, } pub struct RoutingError { @@ -133,9 +134,12 @@ pub async fn router_chat_get_upstream_model( match routing_result { Ok(route) => match route { - Some((_, model_name)) => { + Some((route_name, model_name)) => { current_span.record("route.selected_model", model_name.as_str()); - Ok(RoutingResult { model_name }) + Ok(RoutingResult { + model_name, + 
route_name: Some(route_name), + }) } None => { // No route determined, return sentinel value "none" @@ -145,6 +149,7 @@ pub async fn router_chat_get_upstream_model( Ok(RoutingResult { model_name: "none".to_string(), + route_name: None, }) } }, diff --git a/crates/brightstaff/src/handlers/routing_service.rs b/crates/brightstaff/src/handlers/routing_service.rs new file mode 100644 index 00000000..32f37a08 --- /dev/null +++ b/crates/brightstaff/src/handlers/routing_service.rs @@ -0,0 +1,163 @@ +use bytes::Bytes; +use common::configuration::SpanAttributes; +use common::consts::{REQUEST_ID_HEADER, TRACE_PARENT_HEADER}; +use common::errors::BrightStaffError; +use hermesllm::clients::SupportedAPIsFromClient; +use hermesllm::ProviderRequestType; +use http_body_util::combinators::BoxBody; +use http_body_util::{BodyExt, Full}; +use hyper::{Request, Response, StatusCode}; +use std::sync::Arc; +use tracing::{debug, info, info_span, warn, Instrument}; + +use crate::handlers::router_chat::router_chat_get_upstream_model; +use crate::router::llm_router::RouterService; +use crate::tracing::{collect_custom_trace_attributes, operation_component, set_service_name}; + +#[derive(serde::Serialize)] +struct RoutingDecisionResponse { + model: String, + route: Option, + trace_id: String, +} + +pub async fn routing_decision( + request: Request, + router_service: Arc, + request_path: String, + span_attributes: Arc>, +) -> Result>, hyper::Error> { + let request_headers = request.headers().clone(); + let request_id: String = request_headers + .get(REQUEST_ID_HEADER) + .and_then(|h| h.to_str().ok()) + .map(|s| s.to_string()) + .unwrap_or_else(|| uuid::Uuid::new_v4().to_string()); + + let custom_attrs = + collect_custom_trace_attributes(&request_headers, span_attributes.as_ref().as_ref()); + + let request_span = info_span!( + "routing_decision", + component = "routing", + request_id = %request_id, + http.method = %request.method(), + http.path = %request_path, + ); + + routing_decision_inner( + 
request, + router_service, + request_id, + request_path, + request_headers, + custom_attrs, + ) + .instrument(request_span) + .await +} + +async fn routing_decision_inner( + request: Request, + router_service: Arc, + request_id: String, + request_path: String, + request_headers: hyper::HeaderMap, + custom_attrs: std::collections::HashMap, +) -> Result>, hyper::Error> { + set_service_name(operation_component::ROUTING); + opentelemetry::trace::get_active_span(|span| { + for (key, value) in &custom_attrs { + span.set_attribute(opentelemetry::KeyValue::new(key.clone(), value.clone())); + } + }); + + // Extract or generate traceparent + let traceparent: String = match request_headers + .get(TRACE_PARENT_HEADER) + .and_then(|h| h.to_str().ok()) + .map(|s| s.to_string()) + { + Some(tp) => tp, + None => { + let trace_id = uuid::Uuid::new_v4().to_string().replace("-", ""); + let generated_tp = format!("00-{}-0000000000000000-01", trace_id); + warn!( + generated_traceparent = %generated_tp, + "TRACE_PARENT header missing, generated new traceparent" + ); + generated_tp + } + }; + + // Extract trace_id from traceparent (format: 00-{trace_id}-{span_id}-{flags}) + let trace_id = traceparent + .split('-') + .nth(1) + .unwrap_or("unknown") + .to_string(); + + // Parse request body + let chat_request_bytes = request.collect().await?.to_bytes(); + + debug!( + body = %String::from_utf8_lossy(&chat_request_bytes), + "routing decision request body received" + ); + + let client_request = match ProviderRequestType::try_from(( + &chat_request_bytes[..], + &SupportedAPIsFromClient::from_endpoint(request_path.as_str()).unwrap(), + )) { + Ok(request) => request, + Err(err) => { + warn!(error = %err, "failed to parse request for routing decision"); + return Ok(BrightStaffError::InvalidRequest(format!( + "Failed to parse request: {}", + err + )) + .into_response()); + } + }; + + // Call the existing routing logic + let routing_result = router_chat_get_upstream_model( + router_service, + 
client_request, + &traceparent, + &request_path, + &request_id, + ) + .await; + + match routing_result { + Ok(result) => { + let response = RoutingDecisionResponse { + model: result.model_name, + route: result.route_name, + trace_id, + }; + + info!( + model = %response.model, + route = ?response.route, + "routing decision completed" + ); + + let json = serde_json::to_string(&response).unwrap(); + let body = Full::new(Bytes::from(json)) + .map_err(|never| match never {}) + .boxed(); + + Ok(Response::builder() + .status(StatusCode::OK) + .header("Content-Type", "application/json") + .body(body) + .unwrap()) + } + Err(err) => { + warn!(error = %err.message, "routing decision failed"); + Ok(BrightStaffError::InternalServerError(err.message).into_response()) + } + } +} diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs index 97345556..51c9127f 100644 --- a/crates/brightstaff/src/main.rs +++ b/crates/brightstaff/src/main.rs @@ -2,6 +2,7 @@ use brightstaff::handlers::agent_chat_completions::agent_chat; use brightstaff::handlers::function_calling::function_calling_chat_handler; use brightstaff::handlers::llm::llm_chat; use brightstaff::handlers::models::list_models; +use brightstaff::handlers::routing_service::routing_decision; use brightstaff::router::llm_router::RouterService; use brightstaff::router::plano_orchestrator::OrchestratorService; use brightstaff::state::memory::MemoryConversationalStorage; @@ -194,7 +195,7 @@ async fn main() -> Result<(), Box> { let state_storage = state_storage.clone(); async move { - let path = req.uri().path(); + let path = req.uri().path().to_string(); // Check if path starts with /agents if path.starts_with("/agents") { // Check if it matches one of the agent API paths @@ -217,7 +218,23 @@ async fn main() -> Result<(), Box> { .await; } } - match (req.method(), path) { + if let Some(stripped_path) = path.strip_prefix("/routing") { + let stripped_path = stripped_path.to_string(); + if matches!( + 
stripped_path.as_str(), + CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH + ) { + return routing_decision( + req, + router_service, + stripped_path, + span_attributes, + ) + .with_context(parent_cx) + .await; + } + } + match (req.method(), path.as_str()) { ( &Method::POST, CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH, @@ -270,7 +287,7 @@ async fn main() -> Result<(), Box> { Ok(response) } _ => { - debug!(method = %req.method(), path = %req.uri().path(), "no route found"); + debug!(method = %req.method(), path = %path, "no route found"); let mut not_found = Response::new(empty()); *not_found.status_mut() = StatusCode::NOT_FOUND; Ok(not_found) diff --git a/demos/llm_routing/model_routing_service/README.md b/demos/llm_routing/model_routing_service/README.md new file mode 100644 index 00000000..85d56abf --- /dev/null +++ b/demos/llm_routing/model_routing_service/README.md @@ -0,0 +1,92 @@ +# Model Routing Service Demo + +This demo shows how to use the `/routing/v1/*` endpoints to get routing decisions without proxying requests to an LLM. The endpoint accepts standard LLM request formats and returns which model Plano's router would select. + +## Setup + +Make sure you have Plano CLI installed (`pip install planoai` or `uv tool install planoai`). 
+ +```bash +export OPENAI_API_KEY="your-openai-api-key" +export ANTHROPIC_API_KEY="your-anthropic-api-key" +``` + +Start Plano: +```bash +cd demos/llm_routing/model_routing_service +planoai up config.yaml +``` + +## Run the demo + +```bash +./demo.sh +``` + +## Endpoints + +All three LLM API formats are supported: + +| Endpoint | Format | +|---|---| +| `POST /routing/v1/chat/completions` | OpenAI Chat Completions | +| `POST /routing/v1/messages` | Anthropic Messages | +| `POST /routing/v1/responses` | OpenAI Responses API | + +## Example + +```bash +curl http://localhost:12000/routing/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [{"role": "user", "content": "Write a Python function for binary search"}] + }' +``` + +Response: +```json +{ + "model": "anthropic/claude-sonnet-4-20250514", + "route": "code_generation", + "trace_id": "c16d1096c1af4a17abb48fb182918a88" +} +``` + +The response tells you which model would handle this request and which route was matched, without actually making the LLM call. + +## Demo Output + +``` +=== Model Routing Service Demo === + +--- 1. Code generation query (OpenAI format) --- +{ + "model": "anthropic/claude-sonnet-4-20250514", + "route": "code_generation", + "trace_id": "c16d1096c1af4a17abb48fb182918a88" +} + +--- 2. Complex reasoning query (OpenAI format) --- +{ + "model": "openai/gpt-4o", + "route": "complex_reasoning", + "trace_id": "30795e228aff4d7696f082ed01b75ad4" +} + +--- 3. Simple query - no routing match (OpenAI format) --- +{ + "model": "none", + "route": null, + "trace_id": "ae0b6c3b220d499fb5298ac63f4eac0e" +} + +--- 4. 
Code generation query (Anthropic format) --- +{ + "model": "anthropic/claude-sonnet-4-20250514", + "route": "code_generation", + "trace_id": "26be822bbdf14a3ba19fe198e55ea4a9" +} + +=== Demo Complete === +``` diff --git a/demos/llm_routing/model_routing_service/config.yaml b/demos/llm_routing/model_routing_service/config.yaml new file mode 100644 index 00000000..7b98b25b --- /dev/null +++ b/demos/llm_routing/model_routing_service/config.yaml @@ -0,0 +1,27 @@ +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: complex_reasoning + description: complex reasoning tasks, multi-step analysis, or detailed explanations + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + routing_preferences: + - name: code_generation + description: generating new code, writing functions, or creating boilerplate + +tracing: + random_sampling: 100 diff --git a/demos/llm_routing/model_routing_service/demo.sh b/demos/llm_routing/model_routing_service/demo.sh new file mode 100755 index 00000000..3e9b0584 --- /dev/null +++ b/demos/llm_routing/model_routing_service/demo.sh @@ -0,0 +1,65 @@ +#!/bin/bash +set -e + +PLANO_URL="${PLANO_URL:-http://localhost:12000}" + +echo "=== Model Routing Service Demo ===" +echo "" +echo "This demo shows how to use the /routing/v1/* endpoints to get" +echo "routing decisions without actually proxying the request to an LLM." +echo "" + +# --- Example 1: OpenAI Chat Completions format --- +echo "--- 1. 
Code generation query (OpenAI format) ---" +echo "" +curl -s "$PLANO_URL/routing/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [ + {"role": "user", "content": "Write a Python function that implements binary search on a sorted array"} + ] + }' | python3 -m json.tool +echo "" + +# --- Example 2: Complex reasoning query --- +echo "--- 2. Complex reasoning query (OpenAI format) ---" +echo "" +curl -s "$PLANO_URL/routing/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [ + {"role": "user", "content": "Explain the trade-offs between microservices and monolithic architectures, considering scalability, team structure, and operational complexity"} + ] + }' | python3 -m json.tool +echo "" + +# --- Example 3: Simple query (no routing match) --- +echo "--- 3. Simple query - no routing match (OpenAI format) ---" +echo "" +curl -s "$PLANO_URL/routing/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [ + {"role": "user", "content": "What is the capital of France?"} + ] + }' | python3 -m json.tool +echo "" + +# --- Example 4: Anthropic Messages format --- +echo "--- 4. Code generation query (Anthropic format) ---" +echo "" +curl -s "$PLANO_URL/routing/v1/messages" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "Create a REST API endpoint in Rust using actix-web that handles user registration"} + ] + }' | python3 -m json.tool +echo "" + +echo "=== Demo Complete ==="