mirror of
https://github.com/katanemo/plano.git
synced 2026-07-02 15:51:02 +02:00
parent
b9f01c8471
commit
e015637f22
7 changed files with 375 additions and 5 deletions
|
|
@ -7,6 +7,7 @@ pub mod models;
|
||||||
pub mod pipeline_processor;
|
pub mod pipeline_processor;
|
||||||
pub mod response_handler;
|
pub mod response_handler;
|
||||||
pub mod router_chat;
|
pub mod router_chat;
|
||||||
|
pub mod routing_service;
|
||||||
pub mod utils;
|
pub mod utils;
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,7 @@ use crate::tracing::routing;
|
||||||
|
|
||||||
pub struct RoutingResult {
|
pub struct RoutingResult {
|
||||||
pub model_name: String,
|
pub model_name: String,
|
||||||
|
pub route_name: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct RoutingError {
|
pub struct RoutingError {
|
||||||
|
|
@ -133,9 +134,12 @@ pub async fn router_chat_get_upstream_model(
|
||||||
|
|
||||||
match routing_result {
|
match routing_result {
|
||||||
Ok(route) => match route {
|
Ok(route) => match route {
|
||||||
Some((_, model_name)) => {
|
Some((route_name, model_name)) => {
|
||||||
current_span.record("route.selected_model", model_name.as_str());
|
current_span.record("route.selected_model", model_name.as_str());
|
||||||
Ok(RoutingResult { model_name })
|
Ok(RoutingResult {
|
||||||
|
model_name,
|
||||||
|
route_name: Some(route_name),
|
||||||
|
})
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
// No route determined, return sentinel value "none"
|
// No route determined, return sentinel value "none"
|
||||||
|
|
@ -145,6 +149,7 @@ pub async fn router_chat_get_upstream_model(
|
||||||
|
|
||||||
Ok(RoutingResult {
|
Ok(RoutingResult {
|
||||||
model_name: "none".to_string(),
|
model_name: "none".to_string(),
|
||||||
|
route_name: None,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
|
||||||
163
crates/brightstaff/src/handlers/routing_service.rs
Normal file
163
crates/brightstaff/src/handlers/routing_service.rs
Normal file
|
|
@ -0,0 +1,163 @@
|
||||||
|
use bytes::Bytes;
|
||||||
|
use common::configuration::SpanAttributes;
|
||||||
|
use common::consts::{REQUEST_ID_HEADER, TRACE_PARENT_HEADER};
|
||||||
|
use common::errors::BrightStaffError;
|
||||||
|
use hermesllm::clients::SupportedAPIsFromClient;
|
||||||
|
use hermesllm::ProviderRequestType;
|
||||||
|
use http_body_util::combinators::BoxBody;
|
||||||
|
use http_body_util::{BodyExt, Full};
|
||||||
|
use hyper::{Request, Response, StatusCode};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tracing::{debug, info, info_span, warn, Instrument};
|
||||||
|
|
||||||
|
use crate::handlers::router_chat::router_chat_get_upstream_model;
|
||||||
|
use crate::router::llm_router::RouterService;
|
||||||
|
use crate::tracing::{collect_custom_trace_attributes, operation_component, set_service_name};
|
||||||
|
|
||||||
|
#[derive(serde::Serialize)]
|
||||||
|
struct RoutingDecisionResponse {
|
||||||
|
model: String,
|
||||||
|
route: Option<String>,
|
||||||
|
trace_id: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn routing_decision(
|
||||||
|
request: Request<hyper::body::Incoming>,
|
||||||
|
router_service: Arc<RouterService>,
|
||||||
|
request_path: String,
|
||||||
|
span_attributes: Arc<Option<SpanAttributes>>,
|
||||||
|
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
|
||||||
|
let request_headers = request.headers().clone();
|
||||||
|
let request_id: String = request_headers
|
||||||
|
.get(REQUEST_ID_HEADER)
|
||||||
|
.and_then(|h| h.to_str().ok())
|
||||||
|
.map(|s| s.to_string())
|
||||||
|
.unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
|
||||||
|
|
||||||
|
let custom_attrs =
|
||||||
|
collect_custom_trace_attributes(&request_headers, span_attributes.as_ref().as_ref());
|
||||||
|
|
||||||
|
let request_span = info_span!(
|
||||||
|
"routing_decision",
|
||||||
|
component = "routing",
|
||||||
|
request_id = %request_id,
|
||||||
|
http.method = %request.method(),
|
||||||
|
http.path = %request_path,
|
||||||
|
);
|
||||||
|
|
||||||
|
routing_decision_inner(
|
||||||
|
request,
|
||||||
|
router_service,
|
||||||
|
request_id,
|
||||||
|
request_path,
|
||||||
|
request_headers,
|
||||||
|
custom_attrs,
|
||||||
|
)
|
||||||
|
.instrument(request_span)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn routing_decision_inner(
|
||||||
|
request: Request<hyper::body::Incoming>,
|
||||||
|
router_service: Arc<RouterService>,
|
||||||
|
request_id: String,
|
||||||
|
request_path: String,
|
||||||
|
request_headers: hyper::HeaderMap,
|
||||||
|
custom_attrs: std::collections::HashMap<String, String>,
|
||||||
|
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
|
||||||
|
set_service_name(operation_component::ROUTING);
|
||||||
|
opentelemetry::trace::get_active_span(|span| {
|
||||||
|
for (key, value) in &custom_attrs {
|
||||||
|
span.set_attribute(opentelemetry::KeyValue::new(key.clone(), value.clone()));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Extract or generate traceparent
|
||||||
|
let traceparent: String = match request_headers
|
||||||
|
.get(TRACE_PARENT_HEADER)
|
||||||
|
.and_then(|h| h.to_str().ok())
|
||||||
|
.map(|s| s.to_string())
|
||||||
|
{
|
||||||
|
Some(tp) => tp,
|
||||||
|
None => {
|
||||||
|
let trace_id = uuid::Uuid::new_v4().to_string().replace("-", "");
|
||||||
|
let generated_tp = format!("00-{}-0000000000000000-01", trace_id);
|
||||||
|
warn!(
|
||||||
|
generated_traceparent = %generated_tp,
|
||||||
|
"TRACE_PARENT header missing, generated new traceparent"
|
||||||
|
);
|
||||||
|
generated_tp
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Extract trace_id from traceparent (format: 00-{trace_id}-{span_id}-{flags})
|
||||||
|
let trace_id = traceparent
|
||||||
|
.split('-')
|
||||||
|
.nth(1)
|
||||||
|
.unwrap_or("unknown")
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
// Parse request body
|
||||||
|
let chat_request_bytes = request.collect().await?.to_bytes();
|
||||||
|
|
||||||
|
debug!(
|
||||||
|
body = %String::from_utf8_lossy(&chat_request_bytes),
|
||||||
|
"routing decision request body received"
|
||||||
|
);
|
||||||
|
|
||||||
|
let client_request = match ProviderRequestType::try_from((
|
||||||
|
&chat_request_bytes[..],
|
||||||
|
&SupportedAPIsFromClient::from_endpoint(request_path.as_str()).unwrap(),
|
||||||
|
)) {
|
||||||
|
Ok(request) => request,
|
||||||
|
Err(err) => {
|
||||||
|
warn!(error = %err, "failed to parse request for routing decision");
|
||||||
|
return Ok(BrightStaffError::InvalidRequest(format!(
|
||||||
|
"Failed to parse request: {}",
|
||||||
|
err
|
||||||
|
))
|
||||||
|
.into_response());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Call the existing routing logic
|
||||||
|
let routing_result = router_chat_get_upstream_model(
|
||||||
|
router_service,
|
||||||
|
client_request,
|
||||||
|
&traceparent,
|
||||||
|
&request_path,
|
||||||
|
&request_id,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
match routing_result {
|
||||||
|
Ok(result) => {
|
||||||
|
let response = RoutingDecisionResponse {
|
||||||
|
model: result.model_name,
|
||||||
|
route: result.route_name,
|
||||||
|
trace_id,
|
||||||
|
};
|
||||||
|
|
||||||
|
info!(
|
||||||
|
model = %response.model,
|
||||||
|
route = ?response.route,
|
||||||
|
"routing decision completed"
|
||||||
|
);
|
||||||
|
|
||||||
|
let json = serde_json::to_string(&response).unwrap();
|
||||||
|
let body = Full::new(Bytes::from(json))
|
||||||
|
.map_err(|never| match never {})
|
||||||
|
.boxed();
|
||||||
|
|
||||||
|
Ok(Response::builder()
|
||||||
|
.status(StatusCode::OK)
|
||||||
|
.header("Content-Type", "application/json")
|
||||||
|
.body(body)
|
||||||
|
.unwrap())
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
warn!(error = %err.message, "routing decision failed");
|
||||||
|
Ok(BrightStaffError::InternalServerError(err.message).into_response())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -2,6 +2,7 @@ use brightstaff::handlers::agent_chat_completions::agent_chat;
|
||||||
use brightstaff::handlers::function_calling::function_calling_chat_handler;
|
use brightstaff::handlers::function_calling::function_calling_chat_handler;
|
||||||
use brightstaff::handlers::llm::llm_chat;
|
use brightstaff::handlers::llm::llm_chat;
|
||||||
use brightstaff::handlers::models::list_models;
|
use brightstaff::handlers::models::list_models;
|
||||||
|
use brightstaff::handlers::routing_service::routing_decision;
|
||||||
use brightstaff::router::llm_router::RouterService;
|
use brightstaff::router::llm_router::RouterService;
|
||||||
use brightstaff::router::plano_orchestrator::OrchestratorService;
|
use brightstaff::router::plano_orchestrator::OrchestratorService;
|
||||||
use brightstaff::state::memory::MemoryConversationalStorage;
|
use brightstaff::state::memory::MemoryConversationalStorage;
|
||||||
|
|
@ -194,7 +195,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
||||||
let state_storage = state_storage.clone();
|
let state_storage = state_storage.clone();
|
||||||
|
|
||||||
async move {
|
async move {
|
||||||
let path = req.uri().path();
|
let path = req.uri().path().to_string();
|
||||||
// Check if path starts with /agents
|
// Check if path starts with /agents
|
||||||
if path.starts_with("/agents") {
|
if path.starts_with("/agents") {
|
||||||
// Check if it matches one of the agent API paths
|
// Check if it matches one of the agent API paths
|
||||||
|
|
@ -217,7 +218,23 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
||||||
.await;
|
.await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
match (req.method(), path) {
|
if let Some(stripped_path) = path.strip_prefix("/routing") {
|
||||||
|
let stripped_path = stripped_path.to_string();
|
||||||
|
if matches!(
|
||||||
|
stripped_path.as_str(),
|
||||||
|
CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH
|
||||||
|
) {
|
||||||
|
return routing_decision(
|
||||||
|
req,
|
||||||
|
router_service,
|
||||||
|
stripped_path,
|
||||||
|
span_attributes,
|
||||||
|
)
|
||||||
|
.with_context(parent_cx)
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
match (req.method(), path.as_str()) {
|
||||||
(
|
(
|
||||||
&Method::POST,
|
&Method::POST,
|
||||||
CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH,
|
CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH,
|
||||||
|
|
@ -270,7 +287,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
||||||
Ok(response)
|
Ok(response)
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
debug!(method = %req.method(), path = %req.uri().path(), "no route found");
|
debug!(method = %req.method(), path = %path, "no route found");
|
||||||
let mut not_found = Response::new(empty());
|
let mut not_found = Response::new(empty());
|
||||||
*not_found.status_mut() = StatusCode::NOT_FOUND;
|
*not_found.status_mut() = StatusCode::NOT_FOUND;
|
||||||
Ok(not_found)
|
Ok(not_found)
|
||||||
|
|
|
||||||
92
demos/llm_routing/model_routing_service/README.md
Normal file
92
demos/llm_routing/model_routing_service/README.md
Normal file
|
|
@ -0,0 +1,92 @@
|
||||||
|
# Model Routing Service Demo
|
||||||
|
|
||||||
|
This demo shows how to use the `/routing/v1/*` endpoints to get routing decisions without proxying requests to an LLM. The endpoint accepts standard LLM request formats and returns which model Plano's router would select.
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
Make sure you have the Plano CLI installed (`pip install planoai` or `uv tool install planoai`).
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export OPENAI_API_KEY=<your-key>
|
||||||
|
export ANTHROPIC_API_KEY=<your-key>
|
||||||
|
```
|
||||||
|
|
||||||
|
Start Plano:
|
||||||
|
```bash
|
||||||
|
cd demos/llm_routing/model_routing_service
|
||||||
|
planoai up config.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
## Run the demo
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./demo.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## Endpoints
|
||||||
|
|
||||||
|
All three LLM API formats are supported:
|
||||||
|
|
||||||
|
| Endpoint | Format |
|
||||||
|
|---|---|
|
||||||
|
| `POST /routing/v1/chat/completions` | OpenAI Chat Completions |
|
||||||
|
| `POST /routing/v1/messages` | Anthropic Messages |
|
||||||
|
| `POST /routing/v1/responses` | OpenAI Responses API |
|
||||||
|
|
||||||
|
## Example
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl http://localhost:12000/routing/v1/chat/completions \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "gpt-4o-mini",
|
||||||
|
"messages": [{"role": "user", "content": "Write a Python function for binary search"}]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
Response:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"model": "anthropic/claude-sonnet-4-20250514",
|
||||||
|
"route": "code_generation",
|
||||||
|
"trace_id": "c16d1096c1af4a17abb48fb182918a88"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The response tells you which model would handle this request and which route was matched, without actually making the LLM call. When no routing preference matches, `model` is `"none"` and `route` is `null`.
|
||||||
|
|
||||||
|
## Demo Output
|
||||||
|
|
||||||
|
```
|
||||||
|
=== Model Routing Service Demo ===
|
||||||
|
|
||||||
|
--- 1. Code generation query (OpenAI format) ---
|
||||||
|
{
|
||||||
|
"model": "anthropic/claude-sonnet-4-20250514",
|
||||||
|
"route": "code_generation",
|
||||||
|
"trace_id": "c16d1096c1af4a17abb48fb182918a88"
|
||||||
|
}
|
||||||
|
|
||||||
|
--- 2. Complex reasoning query (OpenAI format) ---
|
||||||
|
{
|
||||||
|
"model": "openai/gpt-4o",
|
||||||
|
"route": "complex_reasoning",
|
||||||
|
"trace_id": "30795e228aff4d7696f082ed01b75ad4"
|
||||||
|
}
|
||||||
|
|
||||||
|
--- 3. Simple query - no routing match (OpenAI format) ---
|
||||||
|
{
|
||||||
|
"model": "none",
|
||||||
|
"route": null,
|
||||||
|
"trace_id": "ae0b6c3b220d499fb5298ac63f4eac0e"
|
||||||
|
}
|
||||||
|
|
||||||
|
--- 4. Code generation query (Anthropic format) ---
|
||||||
|
{
|
||||||
|
"model": "anthropic/claude-sonnet-4-20250514",
|
||||||
|
"route": "code_generation",
|
||||||
|
"trace_id": "26be822bbdf14a3ba19fe198e55ea4a9"
|
||||||
|
}
|
||||||
|
|
||||||
|
=== Demo Complete ===
|
||||||
|
```
|
||||||
27
demos/llm_routing/model_routing_service/config.yaml
Normal file
27
demos/llm_routing/model_routing_service/config.yaml
Normal file
|
|
@ -0,0 +1,27 @@
|
||||||
|
version: v0.3.0
|
||||||
|
|
||||||
|
listeners:
|
||||||
|
- type: model
|
||||||
|
name: model_listener
|
||||||
|
port: 12000
|
||||||
|
|
||||||
|
model_providers:
|
||||||
|
|
||||||
|
- model: openai/gpt-4o-mini
|
||||||
|
access_key: $OPENAI_API_KEY
|
||||||
|
default: true
|
||||||
|
|
||||||
|
- model: openai/gpt-4o
|
||||||
|
access_key: $OPENAI_API_KEY
|
||||||
|
routing_preferences:
|
||||||
|
- name: complex_reasoning
|
||||||
|
description: complex reasoning tasks, multi-step analysis, or detailed explanations
|
||||||
|
|
||||||
|
- model: anthropic/claude-sonnet-4-20250514
|
||||||
|
access_key: $ANTHROPIC_API_KEY
|
||||||
|
routing_preferences:
|
||||||
|
- name: code_generation
|
||||||
|
description: generating new code, writing functions, or creating boilerplate
|
||||||
|
|
||||||
|
tracing:
|
||||||
|
random_sampling: 100
|
||||||
65
demos/llm_routing/model_routing_service/demo.sh
Executable file
65
demos/llm_routing/model_routing_service/demo.sh
Executable file
|
|
@ -0,0 +1,65 @@
|
||||||
|
#!/bin/bash
# Demo driver for the model routing service.
#
# Sends four sample requests to the /routing/v1/* endpoints and pretty-prints
# each routing decision. Set PLANO_URL to target a non-default host/port.
#
# -e / pipefail: stop at the first failed request instead of continuing.
# -u: treat unset variables as errors.
set -euo pipefail

PLANO_URL="${PLANO_URL:-http://localhost:12000}"

# Send a JSON payload ($2) to a routing endpoint path ($1) and pretty-print the reply.
# -sS keeps curl quiet on success but still prints connection errors, so a
# stopped Plano instance is reported clearly rather than as a JSON decode error.
routing_request() {
  local endpoint="$1"
  local payload="$2"
  curl -sS "$PLANO_URL$endpoint" \
    -H "Content-Type: application/json" \
    -d "$payload" | python3 -m json.tool
}

echo "=== Model Routing Service Demo ==="
echo ""
echo "This demo shows how to use the /routing/v1/* endpoints to get"
echo "routing decisions without actually proxying the request to an LLM."
echo ""

# --- Example 1: OpenAI Chat Completions format ---
echo "--- 1. Code generation query (OpenAI format) ---"
echo ""
routing_request "/routing/v1/chat/completions" '{
    "model": "gpt-4o-mini",
    "messages": [
      {"role": "user", "content": "Write a Python function that implements binary search on a sorted array"}
    ]
  }'
echo ""

# --- Example 2: Complex reasoning query ---
echo "--- 2. Complex reasoning query (OpenAI format) ---"
echo ""
routing_request "/routing/v1/chat/completions" '{
    "model": "gpt-4o-mini",
    "messages": [
      {"role": "user", "content": "Explain the trade-offs between microservices and monolithic architectures, considering scalability, team structure, and operational complexity"}
    ]
  }'
echo ""

# --- Example 3: Simple query (no routing match) ---
echo "--- 3. Simple query - no routing match (OpenAI format) ---"
echo ""
routing_request "/routing/v1/chat/completions" '{
    "model": "gpt-4o-mini",
    "messages": [
      {"role": "user", "content": "What is the capital of France?"}
    ]
  }'
echo ""

# --- Example 4: Anthropic Messages format ---
echo "--- 4. Code generation query (Anthropic format) ---"
echo ""
routing_request "/routing/v1/messages" '{
    "model": "gpt-4o-mini",
    "max_tokens": 1024,
    "messages": [
      {"role": "user", "content": "Create a REST API endpoint in Rust using actix-web that handles user registration"}
    ]
  }'
echo ""

echo "=== Demo Complete ==="
|
||||||
Loading…
Add table
Add a link
Reference in a new issue