model routing: cost/latency ranking with ranked fallback list (#849)

This commit is contained in:
Adil Hafeez 2026-03-30 13:46:52 -07:00 committed by GitHub
parent 3a531ce22a
commit e5751d6b13
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 1524 additions and 317 deletions

View file

@ -9,6 +9,7 @@ properties:
- 0.1-beta - 0.1-beta
- 0.2.0 - 0.2.0
- v0.3.0 - v0.3.0
- v0.4.0
agents: agents:
type: array type: array
@ -470,6 +471,106 @@ properties:
additionalProperties: false additionalProperties: false
required: required:
- jailbreak - jailbreak
routing_preferences:
type: array
items:
type: object
properties:
name:
type: string
description:
type: string
models:
type: array
items:
type: string
minItems: 1
selection_policy:
type: object
properties:
prefer:
type: string
enum:
- cheapest
- fastest
- none
additionalProperties: false
required:
- prefer
additionalProperties: false
required:
- name
- description
- models
- selection_policy
model_metrics_sources:
type: array
items:
oneOf:
- type: object
properties:
type:
type: string
const: cost_metrics
url:
type: string
refresh_interval:
type: integer
minimum: 1
auth:
type: object
properties:
type:
type: string
enum:
- bearer
token:
type: string
required:
- type
- token
additionalProperties: false
required:
- type
- url
additionalProperties: false
- type: object
properties:
type:
type: string
const: prometheus_metrics
url:
type: string
query:
type: string
refresh_interval:
type: integer
minimum: 1
description: "Refresh interval in seconds"
required:
- type
- url
- query
additionalProperties: false
- type: object
properties:
type:
type: string
const: digitalocean_pricing
refresh_interval:
type: integer
minimum: 1
description: "Refresh interval in seconds"
model_aliases:
type: object
description: "Map DO catalog keys (lowercase(creator)/model_id) to Plano model names used in routing_preferences. Example: 'openai/openai-gpt-oss-120b: openai/gpt-4o'"
additionalProperties:
type: string
required:
- type
additionalProperties: false
additionalProperties: false additionalProperties: false
required: required:
- version - version

View file

@ -1,6 +1,16 @@
#!/bin/bash #!/bin/bash
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Repository root is resolved as the parent of the directory that holds this script.
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
# Directory checked below for a pyproject.toml and handed to `uv run --directory`.
CLI_DIR="$REPO_ROOT/cli"
# Use uv run if available and cli/ has a pyproject.toml, otherwise fall back to bare python
if command -v uv &> /dev/null && [ -f "$CLI_DIR/pyproject.toml" ]; then
# NOTE(review): PYTHON_CMD is a plain string that is expanded unquoted when the generator is
# invoked, so a CLI_DIR containing whitespace would be split into several arguments — confirm
# that checkout paths are space-free or switch to a bash array.
PYTHON_CMD="uv run --directory $CLI_DIR python"
else
# Fallback: whatever `python` resolves to on PATH.
PYTHON_CMD="python"
fi
failed_files=() failed_files=()
for file in $(find . -name config.yaml -o -name plano_config_full_reference.yaml); do for file in $(find . -name config.yaml -o -name plano_config_full_reference.yaml); do
@ -14,7 +24,7 @@ for file in $(find . -name config.yaml -o -name plano_config_full_reference.yaml
ENVOY_CONFIG_TEMPLATE_FILE="envoy.template.yaml" \ ENVOY_CONFIG_TEMPLATE_FILE="envoy.template.yaml" \
PLANO_CONFIG_FILE_RENDERED="$rendered_file" \ PLANO_CONFIG_FILE_RENDERED="$rendered_file" \
ENVOY_CONFIG_FILE_RENDERED="/dev/null" \ ENVOY_CONFIG_FILE_RENDERED="/dev/null" \
python -m planoai.config_generator 2>&1 > /dev/null $PYTHON_CMD -m planoai.config_generator 2>&1 > /dev/null
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Validation failed for $file" echo "Validation failed for $file"

View file

@ -119,7 +119,7 @@ async fn llm_chat_inner(
temperature, temperature,
tool_names, tool_names,
user_message_preview, user_message_preview,
inline_routing_policy, inline_routing_preferences,
client_api, client_api,
provider_id, provider_id,
} = parsed; } = parsed;
@ -261,7 +261,7 @@ async fn llm_chat_inner(
&traceparent, &traceparent,
&request_path, &request_path,
&request_id, &request_id,
inline_routing_policy, inline_routing_preferences,
) )
.await .await
} }
@ -323,7 +323,7 @@ struct PreparedRequest {
temperature: Option<f32>, temperature: Option<f32>,
tool_names: Option<Vec<String>>, tool_names: Option<Vec<String>>,
user_message_preview: Option<String>, user_message_preview: Option<String>,
inline_routing_policy: Option<Vec<common::configuration::ModelUsagePreference>>, inline_routing_preferences: Option<Vec<common::configuration::TopLevelRoutingPreference>>,
client_api: Option<SupportedAPIsFromClient>, client_api: Option<SupportedAPIsFromClient>,
provider_id: hermesllm::ProviderId, provider_id: hermesllm::ProviderId,
} }
@ -352,16 +352,14 @@ async fn parse_and_validate_request(
"request body received" "request body received"
); );
// Extract routing_policy from request body if present // Extract routing_preferences from request body if present
let (chat_request_bytes, inline_routing_policy) = let (chat_request_bytes, inline_routing_preferences) =
crate::handlers::routing_service::extract_routing_policy(&raw_bytes, false).map_err( crate::handlers::routing_service::extract_routing_policy(&raw_bytes).map_err(|err| {
|err| { warn!(error = %err, "failed to parse request JSON");
warn!(error = %err, "failed to parse request JSON"); let mut r = Response::new(full(format!("Failed to parse request: {}", err)));
let mut r = Response::new(full(format!("Failed to parse request: {}", err))); *r.status_mut() = StatusCode::BAD_REQUEST;
*r.status_mut() = StatusCode::BAD_REQUEST; r
r })?;
},
)?;
let api_type = SupportedAPIsFromClient::from_endpoint(request_path).ok_or_else(|| { let api_type = SupportedAPIsFromClient::from_endpoint(request_path).ok_or_else(|| {
warn!(path = %request_path, "unsupported endpoint"); warn!(path = %request_path, "unsupported endpoint");
@ -439,7 +437,7 @@ async fn parse_and_validate_request(
temperature, temperature,
tool_names, tool_names,
user_message_preview, user_message_preview,
inline_routing_policy, inline_routing_preferences,
client_api, client_api,
provider_id, provider_id,
}) })

View file

@ -1,6 +1,6 @@
use common::configuration::ModelUsagePreference; use common::configuration::TopLevelRoutingPreference;
use hermesllm::clients::endpoints::SupportedUpstreamAPIs; use hermesllm::clients::endpoints::SupportedUpstreamAPIs;
use hermesllm::{ProviderRequest, ProviderRequestType}; use hermesllm::ProviderRequestType;
use hyper::StatusCode; use hyper::StatusCode;
use std::sync::Arc; use std::sync::Arc;
use tracing::{debug, info, warn}; use tracing::{debug, info, warn};
@ -10,7 +10,10 @@ use crate::streaming::truncate_message;
use crate::tracing::routing; use crate::tracing::routing;
pub struct RoutingResult { pub struct RoutingResult {
/// Primary model to use (first in the ranked list).
pub model_name: String, pub model_name: String,
/// Full ranked list — use subsequent entries as fallbacks on 429/5xx.
pub models: Vec<String>,
pub route_name: Option<String>, pub route_name: Option<String>,
} }
@ -39,11 +42,8 @@ pub async fn router_chat_get_upstream_model(
traceparent: &str, traceparent: &str,
request_path: &str, request_path: &str,
request_id: &str, request_id: &str,
inline_usage_preferences: Option<Vec<ModelUsagePreference>>, inline_routing_preferences: Option<Vec<TopLevelRoutingPreference>>,
) -> Result<RoutingResult, RoutingError> { ) -> Result<RoutingResult, RoutingError> {
// Clone metadata for routing before converting (which consumes client_request)
let routing_metadata = client_request.metadata().clone();
// Convert to ChatCompletionsRequest for routing (regardless of input type) // Convert to ChatCompletionsRequest for routing (regardless of input type)
let chat_request = match ProviderRequestType::try_from(( let chat_request = match ProviderRequestType::try_from((
client_request, client_request,
@ -78,22 +78,6 @@ pub async fn router_chat_get_upstream_model(
"router request" "router request"
); );
// Use inline preferences if provided, otherwise fall back to metadata extraction
let usage_preferences: Option<Vec<ModelUsagePreference>> = if inline_usage_preferences.is_some()
{
inline_usage_preferences
} else {
let usage_preferences_str: Option<String> =
routing_metadata.as_ref().and_then(|metadata| {
metadata
.get("plano_preference_config")
.map(|value| value.to_string())
});
usage_preferences_str
.as_ref()
.and_then(|s| serde_yaml::from_str(s).ok())
};
// Prepare log message with latest message from chat request // Prepare log message with latest message from chat request
let latest_message_for_log = chat_request let latest_message_for_log = chat_request
.messages .messages
@ -107,7 +91,6 @@ pub async fn router_chat_get_upstream_model(
let latest_message_for_log = truncate_message(&latest_message_for_log, 50); let latest_message_for_log = truncate_message(&latest_message_for_log, 50);
info!( info!(
has_usage_preferences = usage_preferences.is_some(),
path = %request_path, path = %request_path,
latest_message = %latest_message_for_log, latest_message = %latest_message_for_log,
"processing router request" "processing router request"
@ -121,7 +104,7 @@ pub async fn router_chat_get_upstream_model(
.determine_route( .determine_route(
&chat_request.messages, &chat_request.messages,
traceparent, traceparent,
usage_preferences, inline_routing_preferences,
request_id, request_id,
) )
.await; .await;
@ -132,10 +115,12 @@ pub async fn router_chat_get_upstream_model(
match routing_result { match routing_result {
Ok(route) => match route { Ok(route) => match route {
Some((route_name, model_name)) => { Some((route_name, ranked_models)) => {
let model_name = ranked_models.first().cloned().unwrap_or_default();
current_span.record("route.selected_model", model_name.as_str()); current_span.record("route.selected_model", model_name.as_str());
Ok(RoutingResult { Ok(RoutingResult {
model_name, model_name,
models: ranked_models,
route_name: Some(route_name), route_name: Some(route_name),
}) })
} }
@ -147,6 +132,7 @@ pub async fn router_chat_get_upstream_model(
Ok(RoutingResult { Ok(RoutingResult {
model_name: "none".to_string(), model_name: "none".to_string(),
models: vec!["none".to_string()],
route_name: None, route_name: None,
}) })
} }

View file

@ -1,5 +1,5 @@
use bytes::Bytes; use bytes::Bytes;
use common::configuration::{ModelUsagePreference, SpanAttributes}; use common::configuration::{SpanAttributes, TopLevelRoutingPreference};
use common::consts::REQUEST_ID_HEADER; use common::consts::REQUEST_ID_HEADER;
use common::errors::BrightStaffError; use common::errors::BrightStaffError;
use hermesllm::clients::SupportedAPIsFromClient; use hermesllm::clients::SupportedAPIsFromClient;
@ -15,56 +15,42 @@ use crate::handlers::llm::model_selection::router_chat_get_upstream_model;
use crate::router::llm::RouterService; use crate::router::llm::RouterService;
use crate::tracing::{collect_custom_trace_attributes, operation_component, set_service_name}; use crate::tracing::{collect_custom_trace_attributes, operation_component, set_service_name};
const ROUTING_POLICY_SIZE_WARNING_BYTES: usize = 5120; /// Extracts `routing_preferences` from a JSON body, returning the cleaned body bytes
/// and the parsed preferences. The field is removed from the JSON before re-serializing
/// Extracts `routing_policy` from a JSON body, returning the cleaned body bytes /// so downstream parsers don't see it.
/// and parsed preferences. The `routing_policy` field is removed from the JSON
/// before re-serializing so downstream parsers don't see the non-standard field.
///
/// If `warn_on_size` is true, logs a warning when the serialized policy exceeds 5KB.
pub fn extract_routing_policy( pub fn extract_routing_policy(
raw_bytes: &[u8], raw_bytes: &[u8],
warn_on_size: bool, ) -> Result<(Bytes, Option<Vec<TopLevelRoutingPreference>>), String> {
) -> Result<(Bytes, Option<Vec<ModelUsagePreference>>), String> {
let mut json_body: serde_json::Value = serde_json::from_slice(raw_bytes) let mut json_body: serde_json::Value = serde_json::from_slice(raw_bytes)
.map_err(|err| format!("Failed to parse JSON: {}", err))?; .map_err(|err| format!("Failed to parse JSON: {}", err))?;
let preferences = json_body let routing_preferences = json_body
.as_object_mut() .as_object_mut()
.and_then(|obj| obj.remove("routing_policy")) .and_then(|o| o.remove("routing_preferences"))
.and_then(|policy_value| { .and_then(
if warn_on_size { |value| match serde_json::from_value::<Vec<TopLevelRoutingPreference>>(value) {
let policy_str = serde_json::to_string(&policy_value).unwrap_or_default();
if policy_str.len() > ROUTING_POLICY_SIZE_WARNING_BYTES {
warn!(
size_bytes = policy_str.len(),
limit_bytes = ROUTING_POLICY_SIZE_WARNING_BYTES,
"routing_policy exceeds recommended size limit"
);
}
}
match serde_json::from_value::<Vec<ModelUsagePreference>>(policy_value) {
Ok(prefs) => { Ok(prefs) => {
info!( info!(
num_models = prefs.len(), num_routes = prefs.len(),
"using inline routing_policy from request body" "using inline routing_preferences from request body"
); );
Some(prefs) Some(prefs)
} }
Err(err) => { Err(err) => {
warn!(error = %err, "failed to parse routing_policy"); warn!(error = %err, "failed to parse routing_preferences");
None None
} }
} },
}); );
let bytes = Bytes::from(serde_json::to_vec(&json_body).unwrap()); let bytes = Bytes::from(serde_json::to_vec(&json_body).unwrap());
Ok((bytes, preferences)) Ok((bytes, routing_preferences))
} }
#[derive(serde::Serialize)] #[derive(serde::Serialize)]
struct RoutingDecisionResponse { struct RoutingDecisionResponse {
model: String, /// Ranked model list — use first, fall back to next on 429/5xx.
models: Vec<String>,
route: Option<String>, route: Option<String>,
trace_id: String, trace_id: String,
} }
@ -136,8 +122,9 @@ async fn routing_decision_inner(
"routing decision request body received" "routing decision request body received"
); );
// Extract routing_policy from request body before parsing as ProviderRequestType // Extract routing_preferences from body before parsing as ProviderRequestType
let (chat_request_bytes, inline_preferences) = match extract_routing_policy(&raw_bytes, true) { let (chat_request_bytes, inline_routing_preferences) = match extract_routing_policy(&raw_bytes)
{
Ok(result) => result, Ok(result) => result,
Err(err) => { Err(err) => {
warn!(error = %err, "failed to parse request JSON"); warn!(error = %err, "failed to parse request JSON");
@ -164,27 +151,27 @@ async fn routing_decision_inner(
} }
}; };
// Call the existing routing logic with inline preferences
let routing_result = router_chat_get_upstream_model( let routing_result = router_chat_get_upstream_model(
router_service, router_service,
client_request, client_request,
&traceparent, &traceparent,
&request_path, &request_path,
&request_id, &request_id,
inline_preferences, inline_routing_preferences,
) )
.await; .await;
match routing_result { match routing_result {
Ok(result) => { Ok(result) => {
let response = RoutingDecisionResponse { let response = RoutingDecisionResponse {
model: result.model_name, models: result.models,
route: result.route_name, route: result.route_name,
trace_id, trace_id,
}; };
info!( info!(
model = %response.model, primary_model = %response.models.first().map(|s| s.as_str()).unwrap_or("none"),
total_models = response.models.len(),
route = ?response.route, route = ?response.route,
"routing decision completed" "routing decision completed"
); );
@ -227,101 +214,70 @@ mod tests {
#[test] #[test]
fn extract_routing_policy_no_policy() { fn extract_routing_policy_no_policy() {
let body = make_chat_body(""); let body = make_chat_body("");
let (cleaned, prefs) = extract_routing_policy(&body, false).unwrap(); let (cleaned, prefs) = extract_routing_policy(&body).unwrap();
assert!(prefs.is_none()); assert!(prefs.is_none());
let cleaned_json: serde_json::Value = serde_json::from_slice(&cleaned).unwrap(); let cleaned_json: serde_json::Value = serde_json::from_slice(&cleaned).unwrap();
assert_eq!(cleaned_json["model"], "gpt-4o-mini"); assert_eq!(cleaned_json["model"], "gpt-4o-mini");
assert!(cleaned_json.get("routing_policy").is_none());
}
#[test]
fn extract_routing_policy_valid_policy() {
let policy = r#""routing_policy": [
{
"model": "openai/gpt-4o",
"routing_preferences": [
{"name": "coding", "description": "code generation tasks"}
]
},
{
"model": "openai/gpt-4o-mini",
"routing_preferences": [
{"name": "general", "description": "general questions"}
]
}
]"#;
let body = make_chat_body(policy);
let (cleaned, prefs) = extract_routing_policy(&body, false).unwrap();
let prefs = prefs.expect("should have parsed preferences");
assert_eq!(prefs.len(), 2);
assert_eq!(prefs[0].model, "openai/gpt-4o");
assert_eq!(prefs[0].routing_preferences[0].name, "coding");
assert_eq!(prefs[1].model, "openai/gpt-4o-mini");
assert_eq!(prefs[1].routing_preferences[0].name, "general");
// routing_policy should be stripped from cleaned body
let cleaned_json: serde_json::Value = serde_json::from_slice(&cleaned).unwrap();
assert!(cleaned_json.get("routing_policy").is_none());
assert_eq!(cleaned_json["model"], "gpt-4o-mini");
}
#[test]
fn extract_routing_policy_invalid_policy_returns_none() {
// routing_policy is present but has wrong shape
let policy = r#""routing_policy": "not-an-array""#;
let body = make_chat_body(policy);
let (cleaned, prefs) = extract_routing_policy(&body, false).unwrap();
// Invalid policy should be ignored (returns None), not error
assert!(prefs.is_none());
// routing_policy should still be stripped from cleaned body
let cleaned_json: serde_json::Value = serde_json::from_slice(&cleaned).unwrap();
assert!(cleaned_json.get("routing_policy").is_none());
} }
#[test] #[test]
fn extract_routing_policy_invalid_json_returns_error() { fn extract_routing_policy_invalid_json_returns_error() {
let body = b"not valid json"; let body = b"not valid json";
let result = extract_routing_policy(body, false); let result = extract_routing_policy(body);
assert!(result.is_err()); assert!(result.is_err());
assert!(result.unwrap_err().contains("Failed to parse JSON")); assert!(result.unwrap_err().contains("Failed to parse JSON"));
} }
#[test] #[test]
fn extract_routing_policy_empty_array() { fn extract_routing_policy_routing_preferences() {
let policy = r#""routing_policy": []"#; let policy = r#""routing_preferences": [
{
"name": "code generation",
"description": "generate new code",
"models": ["openai/gpt-4o", "openai/gpt-4o-mini"],
"selection_policy": {"prefer": "fastest"}
}
]"#;
let body = make_chat_body(policy); let body = make_chat_body(policy);
let (_, prefs) = extract_routing_policy(&body, false).unwrap(); let (cleaned, prefs) = extract_routing_policy(&body).unwrap();
let prefs = prefs.expect("empty array is valid"); let prefs = prefs.expect("should have parsed routing_preferences");
assert_eq!(prefs.len(), 0); assert_eq!(prefs.len(), 1);
assert_eq!(prefs[0].name, "code generation");
assert_eq!(prefs[0].models, vec!["openai/gpt-4o", "openai/gpt-4o-mini"]);
let cleaned_json: serde_json::Value = serde_json::from_slice(&cleaned).unwrap();
assert!(cleaned_json.get("routing_preferences").is_none());
} }
#[test] #[test]
fn extract_routing_policy_preserves_other_fields() { fn extract_routing_policy_preserves_other_fields() {
let policy = r#""routing_policy": [{"model": "gpt-4o", "routing_preferences": [{"name": "test", "description": "test"}]}], "temperature": 0.5, "max_tokens": 100"#; let policy = r#""routing_preferences": [{"name": "test", "description": "test", "models": ["gpt-4o"], "selection_policy": {"prefer": "none"}}], "temperature": 0.5, "max_tokens": 100"#;
let body = make_chat_body(policy); let body = make_chat_body(policy);
let (cleaned, prefs) = extract_routing_policy(&body, false).unwrap(); let (cleaned, prefs) = extract_routing_policy(&body).unwrap();
assert!(prefs.is_some()); assert!(prefs.is_some());
let cleaned_json: serde_json::Value = serde_json::from_slice(&cleaned).unwrap(); let cleaned_json: serde_json::Value = serde_json::from_slice(&cleaned).unwrap();
assert_eq!(cleaned_json["temperature"], 0.5); assert_eq!(cleaned_json["temperature"], 0.5);
assert_eq!(cleaned_json["max_tokens"], 100); assert_eq!(cleaned_json["max_tokens"], 100);
assert!(cleaned_json.get("routing_policy").is_none()); assert!(cleaned_json.get("routing_preferences").is_none());
} }
#[test] #[test]
fn routing_decision_response_serialization() { fn routing_decision_response_serialization() {
let response = RoutingDecisionResponse { let response = RoutingDecisionResponse {
model: "openai/gpt-4o".to_string(), models: vec![
"openai/gpt-4o-mini".to_string(),
"openai/gpt-4o".to_string(),
],
route: Some("code_generation".to_string()), route: Some("code_generation".to_string()),
trace_id: "abc123".to_string(), trace_id: "abc123".to_string(),
}; };
let json = serde_json::to_string(&response).unwrap(); let json = serde_json::to_string(&response).unwrap();
let parsed: serde_json::Value = serde_json::from_str(&json).unwrap(); let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
assert_eq!(parsed["model"], "openai/gpt-4o"); assert_eq!(parsed["models"][0], "openai/gpt-4o-mini");
assert_eq!(parsed["models"][1], "openai/gpt-4o");
assert_eq!(parsed["route"], "code_generation"); assert_eq!(parsed["route"], "code_generation");
assert_eq!(parsed["trace_id"], "abc123"); assert_eq!(parsed["trace_id"], "abc123");
} }
@ -329,13 +285,13 @@ mod tests {
#[test] #[test]
fn routing_decision_response_serialization_no_route() { fn routing_decision_response_serialization_no_route() {
let response = RoutingDecisionResponse { let response = RoutingDecisionResponse {
model: "none".to_string(), models: vec!["none".to_string()],
route: None, route: None,
trace_id: "abc123".to_string(), trace_id: "abc123".to_string(),
}; };
let json = serde_json::to_string(&response).unwrap(); let json = serde_json::to_string(&response).unwrap();
let parsed: serde_json::Value = serde_json::from_str(&json).unwrap(); let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
assert_eq!(parsed["model"], "none"); assert_eq!(parsed["models"][0], "none");
assert!(parsed["route"].is_null()); assert!(parsed["route"].is_null());
} }
} }

View file

@ -6,6 +6,7 @@ use brightstaff::handlers::llm::llm_chat;
use brightstaff::handlers::models::list_models; use brightstaff::handlers::models::list_models;
use brightstaff::handlers::routing_service::routing_decision; use brightstaff::handlers::routing_service::routing_decision;
use brightstaff::router::llm::RouterService; use brightstaff::router::llm::RouterService;
use brightstaff::router::model_metrics::ModelMetricsService;
use brightstaff::router::orchestrator::OrchestratorService; use brightstaff::router::orchestrator::OrchestratorService;
use brightstaff::state::memory::MemoryConversationalStorage; use brightstaff::state::memory::MemoryConversationalStorage;
use brightstaff::state::postgresql::PostgreSQLConversationStorage; use brightstaff::state::postgresql::PostgreSQLConversationStorage;
@ -40,6 +41,17 @@ const DEFAULT_ROUTING_MODEL_NAME: &str = "Arch-Router";
const DEFAULT_ORCHESTRATOR_LLM_PROVIDER: &str = "plano-orchestrator"; const DEFAULT_ORCHESTRATOR_LLM_PROVIDER: &str = "plano-orchestrator";
const DEFAULT_ORCHESTRATOR_MODEL_NAME: &str = "Plano-Orchestrator"; const DEFAULT_ORCHESTRATOR_MODEL_NAME: &str = "Plano-Orchestrator";
/// Parse a version string like `v0.4.0`, `v0.3.0`, `0.2.0` into a `(major, minor, patch)` tuple.
///
/// Surrounding whitespace and a leading `v` are ignored. Each dot-separated component is read
/// from its leading run of ASCII digits, so a pre-release or build suffix does not wipe out the
/// number it is attached to: `0.1-beta` parses as `(0, 1, 0)` and `v0.4.0-rc1` as `(0, 4, 0)`.
///
/// Missing components, components that do not start with a digit, and components whose digits
/// overflow `u32` are treated as 0. The function never fails; callers compare the returned
/// tuple lexicographically (e.g. `parse_semver(v) >= (0, 4, 0)`).
fn parse_semver(version: &str) -> (u32, u32, u32) {
    /// Numeric value of the leading ASCII-digit run of `part`, or 0 when there is none.
    fn leading_number(part: &str) -> u32 {
        // `find` returns a byte offset on a char boundary, so the slice below cannot panic.
        let digits_end = part
            .find(|c: char| !c.is_ascii_digit())
            .unwrap_or(part.len());
        part[..digits_end].parse::<u32>().unwrap_or(0)
    }

    let v = version.trim().trim_start_matches('v');
    // At most three components are considered; anything after the patch number's digits
    // (extra dots, `-rc1`, `+build`) is ignored by `leading_number`.
    let mut parts = v.splitn(3, '.').map(leading_number);
    let major = parts.next().unwrap_or(0);
    let minor = parts.next().unwrap_or(0);
    let patch = parts.next().unwrap_or(0);
    (major, minor, patch)
}
/// CORS pre-flight response for the models endpoint. /// CORS pre-flight response for the models endpoint.
fn cors_preflight() -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> { fn cors_preflight() -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
let mut response = Response::new(empty()); let mut response = Response::new(empty());
@ -162,8 +174,150 @@ async fn init_app_state(
.map(|p| p.name.clone()) .map(|p| p.name.clone())
.unwrap_or_else(|| DEFAULT_ROUTING_LLM_PROVIDER.to_string()); .unwrap_or_else(|| DEFAULT_ROUTING_LLM_PROVIDER.to_string());
// Validate that top-level routing_preferences requires v0.4.0+.
let config_version = parse_semver(&config.version);
let is_v040_plus = config_version >= (0, 4, 0);
if !is_v040_plus && config.routing_preferences.is_some() {
return Err(
"top-level routing_preferences requires version v0.4.0 or above. \
Update the version field or remove routing_preferences."
.into(),
);
}
// Validate that all models referenced in top-level routing_preferences exist in model_providers.
// The CLI renders model_providers with `name` = "openai/gpt-4o" and `model` = "gpt-4o",
// so we accept a match against either field.
if let Some(ref route_prefs) = config.routing_preferences {
let provider_model_names: std::collections::HashSet<&str> = config
.model_providers
.iter()
.flat_map(|p| std::iter::once(p.name.as_str()).chain(p.model.as_deref()))
.collect();
for pref in route_prefs {
for model in &pref.models {
if !provider_model_names.contains(model.as_str()) {
return Err(format!(
"routing_preferences route '{}' references model '{}' \
which is not declared in model_providers",
pref.name, model
)
.into());
}
}
}
}
// Validate and initialize ModelMetricsService if model_metrics_sources is configured.
let metrics_service: Option<Arc<ModelMetricsService>> = if let Some(ref sources) =
config.model_metrics_sources
{
use common::configuration::MetricsSource;
let cost_count = sources
.iter()
.filter(|s| matches!(s, MetricsSource::CostMetrics { .. }))
.count();
let prom_count = sources
.iter()
.filter(|s| matches!(s, MetricsSource::PrometheusMetrics { .. }))
.count();
let do_count = sources
.iter()
.filter(|s| matches!(s, MetricsSource::DigitalOceanPricing { .. }))
.count();
if cost_count > 1 {
return Err("model_metrics_sources: only one cost_metrics source is allowed".into());
}
if prom_count > 1 {
return Err(
"model_metrics_sources: only one prometheus_metrics source is allowed".into(),
);
}
if do_count > 1 {
return Err(
"model_metrics_sources: only one digitalocean_pricing source is allowed".into(),
);
}
if cost_count > 0 && do_count > 0 {
return Err(
"model_metrics_sources: cost_metrics and digitalocean_pricing cannot both be configured — use one or the other".into(),
);
}
let svc = ModelMetricsService::new(sources, reqwest::Client::new()).await;
Some(Arc::new(svc))
} else {
None
};
// Validate that selection_policy.prefer is compatible with the configured metric sources.
if let Some(ref prefs) = config.routing_preferences {
use common::configuration::{MetricsSource, SelectionPreference};
let has_cost_source = config
.model_metrics_sources
.as_deref()
.unwrap_or_default()
.iter()
.any(|s| {
matches!(
s,
MetricsSource::CostMetrics { .. } | MetricsSource::DigitalOceanPricing { .. }
)
});
let has_prometheus = config
.model_metrics_sources
.as_deref()
.unwrap_or_default()
.iter()
.any(|s| matches!(s, MetricsSource::PrometheusMetrics { .. }));
for pref in prefs {
if pref.selection_policy.prefer == SelectionPreference::Cheapest && !has_cost_source {
return Err(format!(
"routing_preferences route '{}' uses prefer: cheapest but no cost data source is configured — \
add cost_metrics or digitalocean_pricing to model_metrics_sources",
pref.name
)
.into());
}
if pref.selection_policy.prefer == SelectionPreference::Fastest && !has_prometheus {
return Err(format!(
"routing_preferences route '{}' uses prefer: fastest but no prometheus_metrics source is configured — \
add prometheus_metrics to model_metrics_sources",
pref.name
)
.into());
}
}
}
// Warn about models in routing_preferences that have no matching pricing/latency data.
if let (Some(ref prefs), Some(ref svc)) = (&config.routing_preferences, &metrics_service) {
let cost_data = svc.cost_snapshot().await;
let latency_data = svc.latency_snapshot().await;
for pref in prefs {
use common::configuration::SelectionPreference;
for model in &pref.models {
let missing = match pref.selection_policy.prefer {
SelectionPreference::Cheapest => !cost_data.contains_key(model.as_str()),
SelectionPreference::Fastest => !latency_data.contains_key(model.as_str()),
_ => false,
};
if missing {
warn!(
model = %model,
route = %pref.name,
"model has no metric data — will be ranked last"
);
}
}
}
}
let router_service = Arc::new(RouterService::new( let router_service = Arc::new(RouterService::new(
config.model_providers.clone(), config.routing_preferences.clone(),
metrics_service,
format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"), format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"),
routing_model_name, routing_model_name,
routing_llm_provider, routing_llm_provider,

View file

@ -1,15 +1,18 @@
use std::{collections::HashMap, sync::Arc}; use std::{collections::HashMap, sync::Arc};
use common::{ use common::{
configuration::{LlmProvider, ModelUsagePreference, RoutingPreference}, configuration::TopLevelRoutingPreference,
consts::{ARCH_PROVIDER_HINT_HEADER, REQUEST_ID_HEADER, TRACE_PARENT_HEADER}, consts::{ARCH_PROVIDER_HINT_HEADER, REQUEST_ID_HEADER, TRACE_PARENT_HEADER},
}; };
use super::router_model::{ModelUsagePreference, RoutingPreference};
use hermesllm::apis::openai::Message; use hermesllm::apis::openai::Message;
use hyper::header; use hyper::header;
use thiserror::Error; use thiserror::Error;
use tracing::{debug, info}; use tracing::{debug, info};
use super::http::{self, post_and_extract_content}; use super::http::{self, post_and_extract_content};
use super::model_metrics::ModelMetricsService;
use super::router_model::RouterModel; use super::router_model::RouterModel;
use crate::router::router_model_v1; use crate::router::router_model_v1;
@ -19,7 +22,8 @@ pub struct RouterService {
client: reqwest::Client, client: reqwest::Client,
router_model: Arc<dyn RouterModel>, router_model: Arc<dyn RouterModel>,
routing_provider_name: String, routing_provider_name: String,
llm_usage_defined: bool, top_level_preferences: HashMap<String, TopLevelRoutingPreference>,
metrics_service: Option<Arc<ModelMetricsService>>,
} }
#[derive(Debug, Error)] #[derive(Debug, Error)]
@ -35,29 +39,37 @@ pub type Result<T> = std::result::Result<T, RoutingError>;
impl RouterService { impl RouterService {
pub fn new( pub fn new(
providers: Vec<LlmProvider>, top_level_prefs: Option<Vec<TopLevelRoutingPreference>>,
metrics_service: Option<Arc<ModelMetricsService>>,
router_url: String, router_url: String,
routing_model_name: String, routing_model_name: String,
routing_provider_name: String, routing_provider_name: String,
) -> Self { ) -> Self {
let providers_with_usage = providers let top_level_preferences: HashMap<String, TopLevelRoutingPreference> = top_level_prefs
.iter() .map_or_else(HashMap::new, |prefs| {
.filter(|provider| provider.routing_preferences.is_some()) prefs.into_iter().map(|p| (p.name.clone(), p)).collect()
.cloned() });
.collect::<Vec<LlmProvider>>();
let llm_routes: HashMap<String, Vec<RoutingPreference>> = providers_with_usage // Build sentinel routes for RouterModelV1: route_name → first model.
// RouterModelV1 uses this to build its prompt; RouterService overrides
// the model selection via rank_models() after the route is determined.
let sentinel_routes: HashMap<String, Vec<RoutingPreference>> = top_level_preferences
.iter() .iter()
.filter_map(|provider| { .filter_map(|(name, pref)| {
provider pref.models.first().map(|first_model| {
.routing_preferences (
.as_ref() first_model.clone(),
.map(|prefs| (provider.name.clone(), prefs.clone())) vec![RoutingPreference {
name: name.clone(),
description: pref.description.clone(),
}],
)
})
}) })
.collect(); .collect();
let router_model = Arc::new(router_model_v1::RouterModelV1::new( let router_model = Arc::new(router_model_v1::RouterModelV1::new(
llm_routes, sentinel_routes,
routing_model_name, routing_model_name,
router_model_v1::MAX_TOKEN_LEN, router_model_v1::MAX_TOKEN_LEN,
)); ));
@ -67,7 +79,8 @@ impl RouterService {
client: reqwest::Client::new(), client: reqwest::Client::new(),
router_model, router_model,
routing_provider_name, routing_provider_name,
llm_usage_defined: !providers_with_usage.is_empty(), top_level_preferences,
metrics_service,
} }
} }
@ -75,24 +88,43 @@ impl RouterService {
&self, &self,
messages: &[Message], messages: &[Message],
traceparent: &str, traceparent: &str,
usage_preferences: Option<Vec<ModelUsagePreference>>, inline_routing_preferences: Option<Vec<TopLevelRoutingPreference>>,
request_id: &str, request_id: &str,
) -> Result<Option<(String, String)>> { ) -> Result<Option<(String, Vec<String>)>> {
if messages.is_empty() { if messages.is_empty() {
return Ok(None); return Ok(None);
} }
if usage_preferences // Build inline top-level map from request if present (inline overrides config).
.as_ref() let inline_top_map: Option<HashMap<String, TopLevelRoutingPreference>> =
.is_none_or(|prefs| prefs.len() < 2) inline_routing_preferences
&& !self.llm_usage_defined .map(|prefs| prefs.into_iter().map(|p| (p.name.clone(), p)).collect());
{
// No routing defined — skip the router call entirely.
if inline_top_map.is_none() && self.top_level_preferences.is_empty() {
return Ok(None); return Ok(None);
} }
// For inline overrides, build synthetic ModelUsagePreference list so RouterModelV1
// generates the correct prompt (route name + description pairs).
// For config-level prefs the sentinel routes are already baked into RouterModelV1.
let effective_usage_preferences: Option<Vec<ModelUsagePreference>> =
inline_top_map.as_ref().map(|inline_map| {
inline_map
.values()
.map(|p| ModelUsagePreference {
model: p.models.first().cloned().unwrap_or_default(),
routing_preferences: vec![RoutingPreference {
name: p.name.clone(),
description: p.description.clone(),
}],
})
.collect()
});
let router_request = self let router_request = self
.router_model .router_model
.generate_request(messages, &usage_preferences); .generate_request(messages, &effective_usage_preferences);
debug!( debug!(
model = %self.router_model.get_model_name(), model = %self.router_model.get_model_name(),
@ -132,17 +164,37 @@ impl RouterService {
return Ok(None); return Ok(None);
}; };
// Parse the route name from the router response.
let parsed = self let parsed = self
.router_model .router_model
.parse_response(&content, &usage_preferences)?; .parse_response(&content, &effective_usage_preferences)?;
let result = if let Some((route_name, _sentinel)) = parsed {
let top_pref = inline_top_map
.as_ref()
.and_then(|m| m.get(&route_name))
.or_else(|| self.top_level_preferences.get(&route_name));
if let Some(pref) = top_pref {
let ranked = match &self.metrics_service {
Some(svc) => svc.rank_models(&pref.models, &pref.selection_policy).await,
None => pref.models.clone(),
};
Some((route_name, ranked))
} else {
None
}
} else {
None
};
info!( info!(
content = %content.replace("\n", "\\n"), content = %content.replace("\n", "\\n"),
selected_model = ?parsed, selected_model = ?result,
response_time_ms = elapsed.as_millis(), response_time_ms = elapsed.as_millis(),
"arch-router determined route" "arch-router determined route"
); );
Ok(parsed) Ok(result)
} }
} }

View file

@ -1,5 +1,6 @@
pub(crate) mod http; pub(crate) mod http;
pub mod llm; pub mod llm;
pub mod model_metrics;
pub mod orchestrator; pub mod orchestrator;
pub mod orchestrator_model; pub mod orchestrator_model;
pub mod orchestrator_model_v1; pub mod orchestrator_model_v1;

View file

@ -0,0 +1,419 @@
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use common::configuration::{MetricsSource, SelectionPolicy, SelectionPreference};
use tokio::sync::RwLock;
use tracing::{info, warn};
/// DigitalOcean Gen-AI model catalog endpoint queried by `fetch_do_pricing`.
const DO_PRICING_URL: &str = "https://api.digitalocean.com/v2/gen-ai/models/catalog";
/// In-memory store of per-model metrics used to rank candidate models.
///
/// Both maps are keyed by model name and are shared (via `Arc`) with the
/// background refresh tasks spawned by `ModelMetricsService::new`.
pub struct ModelMetricsService {
// Cost scalar per model; written by `cost_metrics` and `digitalocean_pricing` sources.
cost: Arc<RwLock<HashMap<String, f64>>>,
// Latency value per model; written by `prometheus_metrics` sources.
latency: Arc<RwLock<HashMap<String, f64>>>,
}
impl ModelMetricsService {
pub async fn new(sources: &[MetricsSource], client: reqwest::Client) -> Self {
let cost_data = Arc::new(RwLock::new(HashMap::new()));
let latency_data = Arc::new(RwLock::new(HashMap::new()));
for source in sources {
match source {
MetricsSource::CostMetrics {
url,
refresh_interval,
auth,
} => {
let data = fetch_cost_metrics(url, auth.as_ref(), &client).await;
info!(models = data.len(), url = %url, "fetched cost metrics");
*cost_data.write().await = data;
if let Some(interval_secs) = refresh_interval {
let cost_clone = Arc::clone(&cost_data);
let client_clone = client.clone();
let url = url.clone();
let auth = auth.clone();
let interval = Duration::from_secs(*interval_secs);
tokio::spawn(async move {
loop {
tokio::time::sleep(interval).await;
let data =
fetch_cost_metrics(&url, auth.as_ref(), &client_clone).await;
info!(models = data.len(), url = %url, "refreshed cost metrics");
*cost_clone.write().await = data;
}
});
}
}
MetricsSource::PrometheusMetrics {
url,
query,
refresh_interval,
} => {
let data = fetch_prometheus_metrics(url, query, &client).await;
info!(models = data.len(), url = %url, "fetched prometheus latency metrics");
*latency_data.write().await = data;
if let Some(interval_secs) = refresh_interval {
let latency_clone = Arc::clone(&latency_data);
let client_clone = client.clone();
let url = url.clone();
let query = query.clone();
let interval = Duration::from_secs(*interval_secs);
tokio::spawn(async move {
loop {
tokio::time::sleep(interval).await;
let data =
fetch_prometheus_metrics(&url, &query, &client_clone).await;
info!(models = data.len(), url = %url, "refreshed prometheus latency metrics");
*latency_clone.write().await = data;
}
});
}
}
MetricsSource::DigitalOceanPricing {
refresh_interval,
model_aliases,
} => {
let aliases = model_aliases.clone().unwrap_or_default();
let data = fetch_do_pricing(&client, &aliases).await;
info!(models = data.len(), "fetched digitalocean pricing");
*cost_data.write().await = data;
if let Some(interval_secs) = refresh_interval {
let cost_clone = Arc::clone(&cost_data);
let client_clone = client.clone();
let interval = Duration::from_secs(*interval_secs);
tokio::spawn(async move {
loop {
tokio::time::sleep(interval).await;
let data = fetch_do_pricing(&client_clone, &aliases).await;
info!(models = data.len(), "refreshed digitalocean pricing");
*cost_clone.write().await = data;
}
});
}
}
}
}
ModelMetricsService {
cost: cost_data,
latency: latency_data,
}
}
/// Rank `models` by `policy`, returning them in preference order.
/// Models with no metric data are appended at the end in their original order.
pub async fn rank_models(&self, models: &[String], policy: &SelectionPolicy) -> Vec<String> {
match policy.prefer {
SelectionPreference::Cheapest => {
let data = self.cost.read().await;
for m in models {
if !data.contains_key(m.as_str()) {
warn!(model = %m, "no cost data for model — ranking last (prefer: cheapest)");
}
}
rank_by_ascending_metric(models, &data)
}
SelectionPreference::Fastest => {
let data = self.latency.read().await;
for m in models {
if !data.contains_key(m.as_str()) {
warn!(model = %m, "no latency data for model — ranking last (prefer: fastest)");
}
}
rank_by_ascending_metric(models, &data)
}
SelectionPreference::None => models.to_vec(),
}
}
/// Returns a snapshot of the current cost data. Used at startup to warn about unmatched models.
pub async fn cost_snapshot(&self) -> HashMap<String, f64> {
self.cost.read().await.clone()
}
/// Returns a snapshot of the current latency data. Used at startup to warn about unmatched models.
pub async fn latency_snapshot(&self) -> HashMap<String, f64> {
self.latency.read().await.clone()
}
}
/// Orders `models` by ascending metric value.
///
/// A model counts as having data only when `data` holds a non-NaN value for it.
/// Models without usable data are appended after the ranked ones, in their original
/// order. The sort is stable, so models with equal values keep their relative order.
fn rank_by_ascending_metric(models: &[String], data: &HashMap<String, f64>) -> Vec<String> {
    let mut ranked: Vec<(&String, f64)> = Vec::with_capacity(models.len());
    let mut unranked: Vec<&String> = Vec::new();

    // Single pass with one lookup per model.
    for model in models {
        match data.get(model.as_str()).copied() {
            Some(value) if !value.is_nan() => ranked.push((model, value)),
            _ => unranked.push(model),
        }
    }

    // `total_cmp` gives the total order `sort_by` requires; NaN was filtered out above.
    ranked.sort_by(|a, b| a.1.total_cmp(&b.1));

    ranked
        .into_iter()
        .map(|(model, _)| model)
        .chain(unranked)
        .cloned()
        .collect()
}
/// One per-model entry of the `cost_metrics` response body.
/// `fetch_cost_metrics` sums the two prices into a single cost scalar.
#[derive(serde::Deserialize)]
struct CostEntry {
input_per_million: f64,
output_per_million: f64,
}
/// Fetches per-model cost data from a custom `cost_metrics` endpoint.
///
/// The endpoint must return a JSON object mapping model name to
/// `{ "input_per_million": f64, "output_per_million": f64 }`; the returned map holds
/// the sum of the two prices for each model.
///
/// When `auth` is given and its type is `"bearer"`, the token is sent as a bearer
/// `Authorization` header; any other type is logged and the request goes out
/// unauthenticated.
///
/// Never fails: a transport error, a non-success HTTP status, or an unparsable body is
/// logged as a warning and yields an empty map.
async fn fetch_cost_metrics(
    url: &str,
    auth: Option<&common::configuration::MetricsAuth>,
    client: &reqwest::Client,
) -> HashMap<String, f64> {
    let mut req = client.get(url);
    if let Some(auth) = auth {
        if auth.auth_type == "bearer" {
            req = req.bearer_auth(&auth.token);
        } else {
            warn!(auth_type = %auth.auth_type, "unsupported auth type for cost_metrics, skipping auth");
        }
    }

    // Treat 4xx/5xx as a fetch failure instead of feeding an error payload to the parser.
    let resp = match req
        .send()
        .await
        .and_then(reqwest::Response::error_for_status)
    {
        Ok(resp) => resp,
        Err(err) => {
            warn!(error = %err, url = %url, "failed to fetch cost metrics");
            return HashMap::new();
        }
    };

    match resp.json::<HashMap<String, CostEntry>>().await {
        Ok(data) => data
            .into_iter()
            .map(|(k, v)| (k, v.input_per_million + v.output_per_million))
            .collect(),
        Err(err) => {
            warn!(error = %err, url = %url, "failed to parse cost metrics response");
            HashMap::new()
        }
    }
}
/// Top-level body of the DigitalOcean catalog response parsed by `fetch_do_pricing`.
#[derive(serde::Deserialize)]
struct DoModelList {
// Catalog entries, one per model.
data: Vec<DoModel>,
}
/// A single catalog entry; only the fields read by `fetch_do_pricing` are declared.
#[derive(serde::Deserialize)]
struct DoModel {
// Catalog identifier; used as the cost-map key unless an alias remaps it.
model_id: String,
// Entries without pricing are skipped by `fetch_do_pricing`.
pricing: Option<DoPricing>,
}
/// Pricing section of a catalog entry. Either price may be absent in the response.
#[derive(serde::Deserialize)]
struct DoPricing {
input_price_per_million: Option<f64>,
output_price_per_million: Option<f64>,
}
/// Fetches model pricing from the DigitalOcean catalog at `DO_PRICING_URL`.
///
/// Each catalog entry is keyed by its `model_id`, remapped through `aliases` when an
/// alias exists for that id. The cost scalar is `input_price_per_million +
/// output_price_per_million`, with a single missing price counted as `0.0`.
///
/// Entries with no pricing object, or with both prices missing, are skipped so they
/// count as "no data" rather than ranking as free.
///
/// Never fails: a transport error, a non-success HTTP status, or an unparsable body is
/// logged as a warning and yields an empty map.
async fn fetch_do_pricing(
    client: &reqwest::Client,
    aliases: &HashMap<String, String>,
) -> HashMap<String, f64> {
    // Treat 4xx/5xx as a fetch failure instead of feeding an error payload to the parser.
    let resp = match client
        .get(DO_PRICING_URL)
        .send()
        .await
        .and_then(reqwest::Response::error_for_status)
    {
        Ok(resp) => resp,
        Err(err) => {
            warn!(error = %err, url = DO_PRICING_URL, "failed to fetch digitalocean pricing");
            return HashMap::new();
        }
    };

    match resp.json::<DoModelList>().await {
        Ok(list) => list
            .data
            .into_iter()
            .filter_map(|m| {
                let pricing = m.pricing?;
                let cost = match (
                    pricing.input_price_per_million,
                    pricing.output_price_per_million,
                ) {
                    // No price at all: not the same as a price of zero.
                    (None, None) => return None,
                    (input, output) => input.unwrap_or(0.0) + output.unwrap_or(0.0),
                };
                let raw_key = m.model_id;
                let key = aliases.get(&raw_key).cloned().unwrap_or(raw_key);
                Some((key, cost))
            })
            .collect(),
        Err(err) => {
            warn!(error = %err, url = DO_PRICING_URL, "failed to parse digitalocean pricing response");
            HashMap::new()
        }
    }
}
/// Top-level body of the Prometheus query response parsed by `fetch_prometheus_metrics`.
#[derive(serde::Deserialize)]
struct PrometheusResponse {
data: PrometheusData,
}
/// `data` section of the Prometheus response; holds one result per returned series.
#[derive(serde::Deserialize)]
struct PrometheusData {
result: Vec<PrometheusResult>,
}
/// One series of the query result.
#[derive(serde::Deserialize)]
struct PrometheusResult {
// Series labels; `fetch_prometheus_metrics` reads the `model_name` label.
metric: HashMap<String, String>,
// The sample value arrives as a string and is parsed to `f64` by the fetcher.
value: (f64, String), // (timestamp, value_str)
}
/// Runs `query` against `{url}/api/v1/query` and returns one value per model.
///
/// Each result series must carry a `model_name` label; series without it, or whose
/// sample does not parse as a number, are skipped. Non-finite samples (`NaN`, `±Inf`)
/// are skipped as well, so such a series counts as "no data" instead of being stored
/// as a latency.
///
/// Never fails: a transport error, a non-success HTTP status, or an unparsable body is
/// logged as a warning and yields an empty map.
async fn fetch_prometheus_metrics(
    url: &str,
    query: &str,
    client: &reqwest::Client,
) -> HashMap<String, f64> {
    let query_url = format!("{}/api/v1/query", url.trim_end_matches('/'));

    // Treat 4xx/5xx as a fetch failure instead of feeding an error payload to the parser.
    let resp = match client
        .get(&query_url)
        .query(&[("query", query)])
        .send()
        .await
        .and_then(reqwest::Response::error_for_status)
    {
        Ok(resp) => resp,
        Err(err) => {
            warn!(error = %err, url = %query_url, "failed to fetch prometheus metrics");
            return HashMap::new();
        }
    };

    match resp.json::<PrometheusResponse>().await {
        Ok(prom) => prom
            .data
            .result
            .into_iter()
            .filter_map(|r| {
                let model_name = r.metric.get("model_name")?.clone();
                let value: f64 = r.value.1.parse().ok()?;
                // "NaN" and "Inf" parse successfully but are not usable latencies.
                value.is_finite().then_some((model_name, value))
            })
            .collect(),
        Err(err) => {
            warn!(error = %err, url = %query_url, "failed to parse prometheus response");
            HashMap::new()
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use common::configuration::SelectionPreference;

    /// Wraps a preference in a `SelectionPolicy`.
    fn make_policy(prefer: SelectionPreference) -> SelectionPolicy {
        SelectionPolicy { prefer }
    }

    /// Builds an owned model list from string literals.
    fn names(items: &[&str]) -> Vec<String> {
        items.iter().map(|&s| s.to_owned()).collect()
    }

    /// Builds a metric map from `(model, value)` pairs.
    fn metrics(entries: &[(&str, f64)]) -> HashMap<String, f64> {
        entries
            .iter()
            .map(|&(model, value)| (model.to_owned(), value))
            .collect()
    }

    /// Builds a service whose cost and latency maps are pre-populated.
    fn service(cost: &[(&str, f64)], latency: &[(&str, f64)]) -> ModelMetricsService {
        ModelMetricsService {
            cost: Arc::new(RwLock::new(metrics(cost))),
            latency: Arc::new(RwLock::new(metrics(latency))),
        }
    }

    #[test]
    fn test_rank_by_ascending_metric_picks_lowest_first() {
        let data = metrics(&[("a", 0.01), ("b", 0.005), ("c", 0.02)]);
        let ranked = rank_by_ascending_metric(&names(&["a", "b", "c"]), &data);
        assert_eq!(ranked, vec!["b", "a", "c"]);
    }

    #[test]
    fn test_rank_by_ascending_metric_no_data_preserves_order() {
        let data = HashMap::new();
        let ranked = rank_by_ascending_metric(&names(&["x", "y"]), &data);
        assert_eq!(ranked, vec!["x", "y"]);
    }

    #[test]
    fn test_rank_by_ascending_metric_partial_data() {
        let data = metrics(&[("b", 100.0)]);
        let ranked = rank_by_ascending_metric(&names(&["a", "b"]), &data);
        assert_eq!(ranked, vec!["b", "a"]);
    }

    #[tokio::test]
    async fn test_rank_models_cheapest() {
        let svc = service(&[("gpt-4o", 0.005), ("gpt-4o-mini", 0.0001)], &[]);
        let ranked = svc
            .rank_models(
                &names(&["gpt-4o", "gpt-4o-mini"]),
                &make_policy(SelectionPreference::Cheapest),
            )
            .await;
        assert_eq!(ranked, vec!["gpt-4o-mini", "gpt-4o"]);
    }

    #[tokio::test]
    async fn test_rank_models_fastest() {
        let svc = service(&[], &[("gpt-4o", 200.0), ("claude-sonnet", 120.0)]);
        let ranked = svc
            .rank_models(
                &names(&["gpt-4o", "claude-sonnet"]),
                &make_policy(SelectionPreference::Fastest),
            )
            .await;
        assert_eq!(ranked, vec!["claude-sonnet", "gpt-4o"]);
    }

    #[tokio::test]
    async fn test_rank_models_fallback_no_metrics() {
        let svc = service(&[], &[]);
        let ranked = svc
            .rank_models(
                &names(&["model-a", "model-b"]),
                &make_policy(SelectionPreference::Cheapest),
            )
            .await;
        assert_eq!(ranked, vec!["model-a", "model-b"]);
    }

    #[tokio::test]
    async fn test_rank_models_partial_data_appended_last() {
        let svc = service(&[("gpt-4o", 0.005)], &[]);
        let ranked = svc
            .rank_models(
                &names(&["gpt-4o-mini", "gpt-4o"]),
                &make_policy(SelectionPreference::Cheapest),
            )
            .await;
        assert_eq!(ranked, vec!["gpt-4o", "gpt-4o-mini"]);
    }

    #[tokio::test]
    async fn test_rank_models_none_preserves_order() {
        let svc = service(&[("gpt-4o-mini", 0.0001), ("gpt-4o", 0.005)], &[]);
        let ranked = svc
            .rank_models(
                &names(&["gpt-4o", "gpt-4o-mini"]),
                &make_policy(SelectionPreference::None),
            )
            .await;
        // none → original order, despite gpt-4o-mini being cheaper
        assert_eq!(ranked, vec!["gpt-4o", "gpt-4o-mini"]);
    }
}

View file

@ -1,5 +1,5 @@
use common::configuration::ModelUsagePreference;
use hermesllm::apis::openai::{ChatCompletionsRequest, Message}; use hermesllm::apis::openai::{ChatCompletionsRequest, Message};
use serde::{Deserialize, Serialize};
use thiserror::Error; use thiserror::Error;
#[derive(Debug, Error)] #[derive(Debug, Error)]
@ -10,6 +10,20 @@ pub enum RoutingModelError {
pub type Result<T> = std::result::Result<T, RoutingModelError>; pub type Result<T> = std::result::Result<T, RoutingModelError>;
/// Internal route descriptor passed to the router model to build its prompt.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RoutingPreference {
/// Route name.
pub name: String,
/// Free-text description of the route.
pub description: String,
}
/// Groups a model with its routing preferences (used internally by RouterModelV1).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelUsagePreference {
/// Model name associated with the routes below.
pub model: String,
/// Routes attached to `model`.
pub routing_preferences: Vec<RoutingPreference>,
}
pub trait RouterModel: Send + Sync { pub trait RouterModel: Send + Sync {
fn generate_request( fn generate_request(
&self, &self,

View file

@ -1,6 +1,6 @@
use std::collections::HashMap; use std::collections::HashMap;
use common::configuration::{ModelUsagePreference, RoutingPreference}; use super::router_model::{ModelUsagePreference, RoutingPreference};
use hermesllm::apis::openai::{ChatCompletionsRequest, Message, MessageContent, Role}; use hermesllm::apis::openai::{ChatCompletionsRequest, Message, MessageContent, Role};
use hermesllm::transforms::lib::ExtractText; use hermesllm::transforms::lib::ExtractText;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};

View file

@ -104,6 +104,57 @@ pub enum StateStorageType {
Postgres, Postgres,
} }
/// How the candidate models of a route are ordered.
///
/// Serialized in lowercase: `cheapest`, `fastest`, `none`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum SelectionPreference {
/// Order models by ascending cost metric.
Cheapest,
/// Order models by ascending latency metric.
Fastest,
/// Return models in the same order they were defined — no reordering.
None,
}
/// Selection policy attached to a routing preference (`selection_policy` in config).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SelectionPolicy {
/// Ordering applied to the route's candidate models.
pub prefer: SelectionPreference,
}
/// A top-level `routing_preferences` entry: one named route, its candidate models,
/// and the policy used to order them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TopLevelRoutingPreference {
/// Route name.
pub name: String,
/// Free-text description of the route.
pub description: String,
/// Candidate models for this route, in definition order.
pub models: Vec<String>,
/// How `models` is ordered before being returned.
pub selection_policy: SelectionPolicy,
}
/// Authentication settings for a `cost_metrics` source.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricsAuth {
// Serialized as `type` in the configuration.
#[serde(rename = "type")]
pub auth_type: String, // only "bearer" supported
// Credential sent with the request when `auth_type` is "bearer".
pub token: String,
}
/// A `model_metrics_sources` entry, discriminated by its `type` field
/// (`cost_metrics`, `prometheus_metrics`, or `digitalocean_pricing`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum MetricsSource {
/// Custom HTTP endpoint returning per-model cost data.
CostMetrics {
url: String,
// Refresh period in seconds; `None` means the source is fetched only once.
refresh_interval: Option<u64>,
auth: Option<MetricsAuth>,
},
/// Prometheus instance queried for per-model latency.
PrometheusMetrics {
url: String,
query: String,
// Refresh period in seconds; `None` means the source is fetched only once.
refresh_interval: Option<u64>,
},
/// DigitalOcean catalog pricing.
#[serde(rename = "digitalocean_pricing")]
DigitalOceanPricing {
// Refresh period in seconds; `None` means the source is fetched only once.
refresh_interval: Option<u64>,
/// Map DO catalog keys to Plano model names used in `routing_preferences`.
/// NOTE(review): this was documented as `lowercase(creator)/model_id` keys
/// (example: `openai/openai-gpt-oss-120b: openai/gpt-4o`), but the pricing fetcher
/// appears to look aliases up by the catalog entry's bare `model_id` — confirm
/// which key format is intended.
model_aliases: Option<HashMap<String, String>>,
},
}
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Configuration { pub struct Configuration {
pub version: String, pub version: String,
@ -122,6 +173,8 @@ pub struct Configuration {
pub filters: Option<Vec<Agent>>, pub filters: Option<Vec<Agent>>,
pub listeners: Vec<Listener>, pub listeners: Vec<Listener>,
pub state_storage: Option<StateStorageConfig>, pub state_storage: Option<StateStorageConfig>,
pub routing_preferences: Option<Vec<TopLevelRoutingPreference>>,
pub model_metrics_sources: Option<Vec<MetricsSource>>,
} }
#[derive(Debug, Clone, Serialize, Deserialize, Default)] #[derive(Debug, Clone, Serialize, Deserialize, Default)]
@ -237,6 +290,8 @@ pub enum TimeUnit {
Minute, Minute,
#[serde(rename = "hour")] #[serde(rename = "hour")]
Hour, Hour,
#[serde(rename = "day")]
Day,
} }
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
@ -317,18 +372,6 @@ impl LlmProviderType {
} }
} }
#[derive(Serialize, Deserialize, Debug)]
pub struct ModelUsagePreference {
pub model: String,
pub routing_preferences: Vec<RoutingPreference>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RoutingPreference {
pub name: String,
pub description: String,
}
#[derive(Serialize, Deserialize, Debug)] #[derive(Serialize, Deserialize, Debug)]
pub struct AgentUsagePreference { pub struct AgentUsagePreference {
pub model: String, pub model: String,
@ -378,7 +421,6 @@ pub struct LlmProvider {
pub port: Option<u16>, pub port: Option<u16>,
pub rate_limits: Option<LlmRatelimit>, pub rate_limits: Option<LlmRatelimit>,
pub usage: Option<String>, pub usage: Option<String>,
pub routing_preferences: Option<Vec<RoutingPreference>>,
pub cluster_name: Option<String>, pub cluster_name: Option<String>,
pub base_url_path_prefix: Option<String>, pub base_url_path_prefix: Option<String>,
pub internal: Option<bool>, pub internal: Option<bool>,
@ -422,7 +464,6 @@ impl Default for LlmProvider {
port: None, port: None,
rate_limits: None, rate_limits: None,
usage: None, usage: None,
routing_preferences: None,
cluster_name: None, cluster_name: None,
base_url_path_prefix: None, base_url_path_prefix: None,
internal: None, internal: None,

View file

@ -274,7 +274,6 @@ mod tests {
port: None, port: None,
rate_limits: None, rate_limits: None,
usage: None, usage: None,
routing_preferences: None,
internal: None, internal: None,
stream: None, stream: None,
passthrough_auth: None, passthrough_auth: None,

View file

@ -150,6 +150,10 @@ fn get_quota(limit: Limit) -> Quota {
TimeUnit::Second => Quota::per_second(tokens), TimeUnit::Second => Quota::per_second(tokens),
TimeUnit::Minute => Quota::per_minute(tokens), TimeUnit::Minute => Quota::per_minute(tokens),
TimeUnit::Hour => Quota::per_hour(tokens), TimeUnit::Hour => Quota::per_hour(tokens),
TimeUnit::Day => {
let per_hour = limit.tokens.saturating_div(24).max(1);
Quota::per_hour(NonZero::new(per_hour).expect("per_hour must be positive"))
}
} }
} }

View file

@ -13,42 +13,60 @@ Plano is an AI-native proxy and data plane for agentic apps — with built-in or
- **One endpoint, many models** — apps call Plano using standard OpenAI/Anthropic APIs; Plano handles provider selection, keys, and failover - **One endpoint, many models** — apps call Plano using standard OpenAI/Anthropic APIs; Plano handles provider selection, keys, and failover
- **Intelligent routing** — a lightweight 1.5B router model classifies user intent and picks the best model per request - **Intelligent routing** — a lightweight 1.5B router model classifies user intent and picks the best model per request
- **Cost & latency ranking** — models are ranked by live cost (DigitalOcean pricing API) or latency (Prometheus) before returning the fallback list
- **Platform governance** — centralize API keys, rate limits, guardrails, and observability without touching app code - **Platform governance** — centralize API keys, rate limits, guardrails, and observability without touching app code
- **Runs anywhere** — single binary; self-host the router for full data privacy - **Runs anywhere** — single binary; self-host the router for full data privacy
## How Routing Works ## How Routing Works
The entire routing configuration is plain YAML — no code: Routing is configured in top-level `routing_preferences` (requires `version: v0.4.0`):
```yaml ```yaml
model_providers: version: v0.4.0
- model: openai/gpt-4o-mini
default: true # fallback for unmatched requests
- model: openai/gpt-4o routing_preferences:
routing_preferences: - name: complex_reasoning
- name: complex_reasoning description: complex reasoning tasks, multi-step analysis, or detailed explanations
description: complex reasoning tasks, multi-step analysis models:
- openai/gpt-4o
- openai/gpt-4o-mini
selection_policy:
prefer: cheapest # rank by live cost data
- model: anthropic/claude-sonnet-4-20250514 - name: code_generation
routing_preferences: description: generating new code, writing functions, or creating boilerplate
- name: code_generation models:
description: generating new code, writing functions - anthropic/claude-sonnet-4-20250514
- openai/gpt-4o
selection_policy:
prefer: fastest # rank by Prometheus p95 latency
``` ```
When a request arrives, Plano sends the conversation and routing preferences to Arch-Router, which classifies the intent and returns the matching route: ### `selection_policy.prefer` values
| Value | Behavior |
|---|---|
| `cheapest` | Sort models by ascending cost. Requires `cost_metrics` or `digitalocean_pricing` in `model_metrics_sources`. |
| `fastest` | Sort models by ascending P95 latency. Requires `prometheus_metrics` in `model_metrics_sources`. |
| `random` | Not accepted in `v0.4.0`: the config schema and `SelectionPreference` only define `cheapest`, `fastest`, and `none`. |
| `none` | Return models in definition order — no reordering. |
When a request arrives, Plano:
1. Sends the conversation + route descriptions to Arch-Router for intent classification
2. Looks up the matched route and ranks its candidate models by cost or latency
3. Returns an ordered list — client uses `models[0]`, falls back to `models[1]` on 429/5xx
``` ```
1. Request arrives → "Write binary search in Python" 1. Request arrives → "Write binary search in Python"
2. Preferences serialized → [{"name":"code_generation", ...}, {"name":"complex_reasoning", ...}] 2. Arch-Router classifies → route: "code_generation"
3. Arch-Router classifies → {"route": "code_generation"} 3. Rank by latency → claude-sonnet (0.85s) < gpt-4o (1.2s)
4. Route → Model lookup → code_generation → anthropic/claude-sonnet-4-20250514 4. Response → models: ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o"]
5. Request forwarded → Claude generates the response
``` ```
No match? Arch-Router returns `other` → Plano falls back to the default model. No match? Arch-Router returns `null` route → client falls back to the model in the original request.
The `/routing/v1/*` endpoints return the routing decision **without** forwarding to the LLM — useful for testing and validating routing behavior before going to production. The `/routing/v1/*` endpoints return the routing decision **without** forwarding to the LLM — useful for testing routing behavior before going to production.
## Setup ## Setup
@ -59,12 +77,28 @@ export OPENAI_API_KEY=<your-key>
export ANTHROPIC_API_KEY=<your-key> export ANTHROPIC_API_KEY=<your-key>
``` ```
Start Plano: Start Prometheus and the mock latency metrics server:
```bash ```bash
cd demos/llm_routing/model_routing_service cd demos/llm_routing/model_routing_service
docker compose up -d
```
Then start Plano:
```bash
planoai up config.yaml planoai up config.yaml
``` ```
On startup you should see logs like:
```
fetched digitalocean pricing: N models
fetched prometheus latency metrics: 3 models
```
If a model in `routing_preferences` has no matching pricing or latency data, Plano logs a warning at startup — the model is still included but ranked last.
## Run the demo ## Run the demo
```bash ```bash
@ -95,13 +129,65 @@ curl http://localhost:12000/routing/v1/chat/completions \
Response: Response:
```json ```json
{ {
"model": "anthropic/claude-sonnet-4-20250514", "models": ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o"],
"route": "code_generation", "route": "code_generation",
"trace_id": "c16d1096c1af4a17abb48fb182918a88" "trace_id": "c16d1096c1af4a17abb48fb182918a88"
} }
``` ```
The response tells you which model would handle this request and which route was matched, without actually making the LLM call. The response contains the ranked model list — your client should try `models[0]` first and fall back to `models[1]` on 429 or 5xx errors.
## Metrics Sources
### DigitalOcean Pricing (`digitalocean_pricing`)
Fetches public model pricing from the DigitalOcean Gen-AI catalog (no auth required). Each catalog entry is keyed by its `model_id`; use `model_aliases` to map those keys to the model names used in `routing_preferences`. Cost scalar = `input_price_per_million + output_price_per_million`.
```yaml
model_metrics_sources:
- type: digitalocean_pricing
refresh_interval: 3600 # re-fetch every hour
```
### Prometheus Latency (`prometheus_metrics`)
Queries a Prometheus instance for P95 latency. The PromQL expression must return an instant vector with a `model_name` label matching the model names in `routing_preferences`.
```yaml
model_metrics_sources:
- type: prometheus_metrics
url: http://localhost:9090
query: model_latency_p95_seconds
refresh_interval: 60
```
The demo's `metrics_server.py` exposes mock latency data; `docker compose up -d` starts it alongside Prometheus.
### Custom Cost Endpoint (`cost_metrics`)
```yaml
model_metrics_sources:
- type: cost_metrics
url: https://my-internal-pricing-api/costs
auth:
type: bearer
token: $PRICING_TOKEN
refresh_interval: 300
```
Expected response format:
```json
{
"anthropic/claude-sonnet-4-20250514": {
"input_per_million": 3.0,
"output_per_million": 15.0
},
"openai/gpt-4o": {
"input_per_million": 5.0,
"output_per_million": 20.0
}
}
```
## Kubernetes Deployment (Self-hosted Arch-Router on GPU) ## Kubernetes Deployment (Self-hosted Arch-Router on GPU)
@ -119,7 +205,6 @@ GPU nodes commonly have a `nvidia.com/gpu:NoSchedule` taint — `vllm-deployment
**1. Deploy Arch-Router and Plano:** **1. Deploy Arch-Router and Plano:**
```bash ```bash
# arch-router deployment # arch-router deployment
kubectl apply -f vllm-deployment.yaml kubectl apply -f vllm-deployment.yaml
@ -165,39 +250,3 @@ kubectl create configmap plano-config \
--dry-run=client -o yaml | kubectl apply -f - --dry-run=client -o yaml | kubectl apply -f -
kubectl rollout restart deployment/plano kubectl rollout restart deployment/plano
``` ```
## Demo Output
```
=== Model Routing Service Demo ===
--- 1. Code generation query (OpenAI format) ---
{
"model": "anthropic/claude-sonnet-4-20250514",
"route": "code_generation",
"trace_id": "c16d1096c1af4a17abb48fb182918a88"
}
--- 2. Complex reasoning query (OpenAI format) ---
{
"model": "openai/gpt-4o",
"route": "complex_reasoning",
"trace_id": "30795e228aff4d7696f082ed01b75ad4"
}
--- 3. Simple query - no routing match (OpenAI format) ---
{
"model": "none",
"route": null,
"trace_id": "ae0b6c3b220d499fb5298ac63f4eac0e"
}
--- 4. Code generation query (Anthropic format) ---
{
"model": "anthropic/claude-sonnet-4-20250514",
"route": "code_generation",
"trace_id": "26be822bbdf14a3ba19fe198e55ea4a9"
}
=== Demo Complete ===
```

View file

@ -1,4 +1,4 @@
version: v0.3.0 version: v0.4.0
listeners: listeners:
- type: model - type: model
@ -6,22 +6,48 @@ listeners:
port: 12000 port: 12000
model_providers: model_providers:
- model: openai/gpt-4o-mini - model: openai/gpt-4o-mini
access_key: $OPENAI_API_KEY access_key: $OPENAI_API_KEY
default: true default: true
- model: openai/gpt-4o - model: openai/gpt-4o
access_key: $OPENAI_API_KEY access_key: $OPENAI_API_KEY
routing_preferences:
- name: complex_reasoning
description: complex reasoning tasks, multi-step analysis, or detailed explanations
- model: anthropic/claude-sonnet-4-20250514 - model: anthropic/claude-sonnet-4-20250514
access_key: $ANTHROPIC_API_KEY access_key: $ANTHROPIC_API_KEY
routing_preferences:
- name: code_generation
description: generating new code, writing functions, or creating boilerplate
tracing: routing_preferences:
random_sampling: 100 - name: complex_reasoning
description: complex reasoning tasks, multi-step analysis, or detailed explanations
models:
- openai/gpt-4o
- openai/gpt-4o-mini
selection_policy:
prefer: cheapest
- name: code_generation
description: generating new code, writing functions, or creating boilerplate
models:
- anthropic/claude-sonnet-4-20250514
- openai/gpt-4o
selection_policy:
prefer: fastest
model_metrics_sources:
- type: digitalocean_pricing
refresh_interval: 3600
model_aliases:
openai-gpt-4o: openai/gpt-4o
openai-gpt-4o-mini: openai/gpt-4o-mini
anthropic-claude-sonnet-4: anthropic/claude-sonnet-4-20250514
# Use cost_metrics instead of digitalocean_pricing to supply your own pricing data.
# The demo metrics_server.py exposes /costs with OpenAI and Anthropic pricing.
# - type: cost_metrics
# url: http://localhost:8080/costs
# refresh_interval: 300
- type: prometheus_metrics
url: http://localhost:9090
query: model_latency_p95_seconds
refresh_interval: 60

View file

@ -8,9 +8,12 @@ echo ""
echo "This demo shows how to use the /routing/v1/* endpoints to get" echo "This demo shows how to use the /routing/v1/* endpoints to get"
echo "routing decisions without actually proxying the request to an LLM." echo "routing decisions without actually proxying the request to an LLM."
echo "" echo ""
echo "The response includes a ranked 'models' list — use models[0] as the"
echo "primary and fall back to models[1] on 429/5xx errors."
echo ""
# --- Example 1: OpenAI Chat Completions format --- # --- Example 1: Code generation (ranked by fastest) ---
echo "--- 1. Code generation query (OpenAI format) ---" echo "--- 1. Code generation query (prefer: fastest) ---"
echo "" echo ""
curl -s "$PLANO_URL/routing/v1/chat/completions" \ curl -s "$PLANO_URL/routing/v1/chat/completions" \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
@ -22,8 +25,8 @@ curl -s "$PLANO_URL/routing/v1/chat/completions" \
}' | python3 -m json.tool }' | python3 -m json.tool
echo "" echo ""
# --- Example 2: Complex reasoning query --- # --- Example 2: Complex reasoning (ranked by cheapest) ---
echo "--- 2. Complex reasoning query (OpenAI format) ---" echo "--- 2. Complex reasoning query (prefer: cheapest) ---"
echo "" echo ""
curl -s "$PLANO_URL/routing/v1/chat/completions" \ curl -s "$PLANO_URL/routing/v1/chat/completions" \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
@ -36,7 +39,7 @@ curl -s "$PLANO_URL/routing/v1/chat/completions" \
echo "" echo ""
# --- Example 3: Simple query (no routing match) --- # --- Example 3: Simple query (no routing match) ---
echo "--- 3. Simple query - no routing match (OpenAI format) ---" echo "--- 3. Simple query - no routing match (falls back to request model) ---"
echo "" echo ""
curl -s "$PLANO_URL/routing/v1/chat/completions" \ curl -s "$PLANO_URL/routing/v1/chat/completions" \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
@ -62,8 +65,31 @@ curl -s "$PLANO_URL/routing/v1/messages" \
}' | python3 -m json.tool }' | python3 -m json.tool
echo "" echo ""
# --- Example 5: Inline routing policy in request body --- # --- Example 5: Inline routing_preferences with prefer:cheapest ---
echo "--- 5. Inline routing_policy (no config needed) ---" echo "--- 5. Inline routing_preferences (prefer: cheapest) ---"
echo " models[] will be sorted by ascending cost from DigitalOcean pricing"
echo ""
curl -s "$PLANO_URL/routing/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4o-mini",
"messages": [
{"role": "user", "content": "Summarize the key differences between TCP and UDP"}
],
"routing_preferences": [
{
"name": "general",
"description": "general questions, explanations, and summaries",
"models": ["openai/gpt-4o", "openai/gpt-4o-mini"],
"selection_policy": {"prefer": "cheapest"}
}
]
}' | python3 -m json.tool
echo ""
# --- Example 6: Inline routing_preferences with prefer:fastest ---
echo "--- 6. Inline routing_preferences (prefer: fastest) ---"
echo " models[] will be sorted by ascending P95 latency from Prometheus"
echo "" echo ""
curl -s "$PLANO_URL/routing/v1/chat/completions" \ curl -s "$PLANO_URL/routing/v1/chat/completions" \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
@ -72,46 +98,12 @@ curl -s "$PLANO_URL/routing/v1/chat/completions" \
"messages": [ "messages": [
{"role": "user", "content": "Write a quicksort implementation in Go"} {"role": "user", "content": "Write a quicksort implementation in Go"}
], ],
"routing_policy": [ "routing_preferences": [
{ {
"model": "openai/gpt-4o", "name": "coding",
"routing_preferences": [ "description": "code generation, writing functions, debugging",
{"name": "coding", "description": "code generation, writing functions, debugging"} "models": ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o", "openai/gpt-4o-mini"],
] "selection_policy": {"prefer": "fastest"}
},
{
"model": "openai/gpt-4o-mini",
"routing_preferences": [
{"name": "general", "description": "general questions, simple lookups, casual conversation"}
]
}
]
}' | python3 -m json.tool
echo ""
# --- Example 6: Inline routing policy with Anthropic format ---
echo "--- 6. Inline routing_policy (Anthropic format) ---"
echo ""
curl -s "$PLANO_URL/routing/v1/messages" \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4o-mini",
"max_tokens": 1024,
"messages": [
{"role": "user", "content": "What is the weather like today?"}
],
"routing_policy": [
{
"model": "openai/gpt-4o",
"routing_preferences": [
{"name": "coding", "description": "code generation, writing functions, debugging"}
]
},
{
"model": "openai/gpt-4o-mini",
"routing_preferences": [
{"name": "general", "description": "general questions, simple lookups, casual conversation"}
]
} }
] ]
}' | python3 -m json.tool }' | python3 -m json.tool

View file

@ -0,0 +1,17 @@
services:
prometheus:
image: prom/prometheus:latest
ports:
- "9090:9090"
volumes:
- ./prometheus.yaml:/etc/prometheus/prometheus.yml:ro
depends_on:
- model-metrics
model-metrics:
image: python:3.11-slim
ports:
- "8080:8080"
volumes:
- ./metrics_server.py:/metrics_server.py:ro
command: python /metrics_server.py

View file

@ -0,0 +1,51 @@
"""
Demo metrics server.
Exposes two endpoints:
GET /metrics Prometheus text format, P95 latency per model (scraped by Prometheus)
GET /costs JSON cost data per model, compatible with cost_metrics source
"""
import json
from http.server import HTTPServer, BaseHTTPRequestHandler
# Static Prometheus text-exposition payload: one P95-latency gauge sample per
# model, pre-encoded to bytes so the handler can write it out directly.
PROMETHEUS_METRICS = (
    "# HELP model_latency_p95_seconds P95 request latency in seconds per model\n"
    "# TYPE model_latency_p95_seconds gauge\n"
    'model_latency_p95_seconds{model_name="anthropic/claude-sonnet-4-20250514"} 0.85\n'
    'model_latency_p95_seconds{model_name="openai/gpt-4o"} 1.20\n'
    'model_latency_p95_seconds{model_name="openai/gpt-4o-mini"} 0.40\n'
).encode()

# Per-model pricing served as JSON on /costs, keyed by model name. Each entry
# carries the input and output price fields the cost_metrics source expects.
COST_DATA = {
    model_name: {
        "input_per_million": input_price,
        "output_per_million": output_price,
    }
    for model_name, input_price, output_price in (
        ("anthropic/claude-sonnet-4-20250514", 3.0, 15.0),
        ("openai/gpt-4o", 5.0, 20.0),
        ("openai/gpt-4o-mini", 0.15, 0.6),
    )
}
class MetricsHandler(BaseHTTPRequestHandler):
    """Serve the demo's static cost and latency data over HTTP GET.

    ``/costs`` returns ``COST_DATA`` as JSON; every other path returns
    ``PROMETHEUS_METRICS`` in Prometheus text exposition format.
    """

    def do_GET(self):
        """Answer a GET request with the payload selected by the request path."""
        # Match on the path component only, so "/costs?x=1" is still served as
        # JSON instead of silently falling through to the Prometheus branch.
        route = self.path.split("?", 1)[0]
        if route == "/costs":
            content_type = "application/json"
            body = json.dumps(COST_DATA).encode()
        else:
            # /metrics and everything else → Prometheus format
            content_type = "text/plain; version=0.0.4; charset=utf-8"
            body = PROMETHEUS_METRICS

        self.send_response(200)
        self.send_header("Content-Type", content_type)
        # Declare the body size so clients do not have to wait for the
        # connection to close to find the end of the response.
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, fmt, *args):
        """Discard access-log lines to keep the demo output quiet."""
        pass  # suppress access logs
if __name__ == "__main__":
    # An empty host binds every interface; 8080 is the port announced in the
    # startup message below.
    listen_address = ("", 8080)
    httpd = HTTPServer(listen_address, MetricsHandler)
    print("metrics server listening on :8080 (/metrics, /costs)", flush=True)
    # Blocks and serves requests until the process is stopped.
    httpd.serve_forever()

View file

@ -0,0 +1,8 @@
global:
scrape_interval: 15s
scrape_configs:
- job_name: model_latency
static_configs:
- targets:
- model-metrics:8080

245
docs/routing-api.md Normal file
View file

@ -0,0 +1,245 @@
# Plano Routing API — Request & Response Format
## Overview
Plano intercepts LLM requests and routes them to the best available model based on semantic intent and live cost/latency data. The developer sends a standard OpenAI-compatible request with an optional `routing_preferences` field. Plano returns an ordered list of candidate models; the client uses the first and falls back to the next on 429 or 5xx errors.
---
## Request Format
Standard OpenAI chat completion body. The only addition is the optional `routing_preferences` field, which is stripped before the request is forwarded upstream.
```json
POST /v1/chat/completions
{
"model": "openai/gpt-4o-mini",
"messages": [
{"role": "user", "content": "write a sorting algorithm in Python"}
],
"routing_preferences": [
{
"name": "code generation",
"description": "generating new code snippets",
"models": ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o", "openai/gpt-4o-mini"],
"selection_policy": {"prefer": "fastest"}
},
{
"name": "general questions",
"description": "casual conversation and simple queries",
"models": ["openai/gpt-4o-mini"],
"selection_policy": {"prefer": "cheapest"}
}
]
}
```
### `routing_preferences` fields
| Field | Type | Required | Description |
|---|---|---|---|
| `name` | string | yes | Route identifier. Must match the LLM router's route classification. |
| `description` | string | yes | Natural language description used by the router to match user intent. |
| `models` | string[] | yes | Ordered candidate pool. At least one entry required. Must be declared in `model_providers`. |
| `selection_policy.prefer` | enum | yes | How to rank models: `cheapest`, `fastest`, or `none`. |
### `selection_policy.prefer` values
| Value | Behavior |
|---|---|
| `cheapest` | Sort by ascending cost from the metrics endpoint. Models with no data are appended last. |
| `fastest` | Sort by ascending latency from the metrics endpoint. Models with no data are appended last. |
| `none` | Return models in the order they were defined — no reordering. |
### Notes
- `routing_preferences` is **optional**. If omitted, the config-defined preferences are used.
- If provided in the request body, it **overrides** the config for that single request only.
- `model` is still required and is used as the fallback if no route is matched.
---
## Response Format
```json
{
"models": [
"anthropic/claude-sonnet-4-20250514",
"openai/gpt-4o",
"openai/gpt-4o-mini"
],
"route": "code generation",
"trace_id": "4bf92f3577b34da6a3ce929d0e0e4736"
}
```
### Fields
| Field | Type | Description |
|---|---|---|
| `models` | string[] | Ranked model list. Use `models[0]` as primary; retry with `models[1]` on 429/5xx, and so on. |
| `route` | string \| null | Name of the matched route. `null` if no route matched — client should use the original request `model`. |
| `trace_id` | string | Trace ID for distributed tracing and observability. |
---
## Client Usage Pattern
```python
response = plano.routing_decision(request)
models = response["models"]
for model in models:
try:
result = call_llm(model, messages)
break # success — stop trying
except (RateLimitError, ServerError):
continue # try next model in the ranked list
```
---
## Configuration (set by platform/ops team)
Requires `version: v0.4.0` or above. Models listed under `routing_preferences` must be declared in `model_providers`.
```yaml
version: v0.4.0
model_providers:
- model: anthropic/claude-sonnet-4-20250514
access_key: $ANTHROPIC_API_KEY
- model: openai/gpt-4o
access_key: $OPENAI_API_KEY
- model: openai/gpt-4o-mini
access_key: $OPENAI_API_KEY
default: true
routing_preferences:
- name: code generation
description: generating new code snippets or boilerplate
models:
- anthropic/claude-sonnet-4-20250514
- openai/gpt-4o
selection_policy:
prefer: fastest
- name: general questions
description: casual conversation and simple queries
models:
- openai/gpt-4o-mini
- openai/gpt-4o
selection_policy:
prefer: cheapest
# Optional: live cost and latency data sources (max one per type)
model_metrics_sources:
# Option A: DigitalOcean public pricing (no auth required)
- type: digitalocean_pricing
refresh_interval: 3600
# Option B: custom cost endpoint (mutually exclusive with digitalocean_pricing)
# - type: cost_metrics
# url: https://internal-cost-api/models
# refresh_interval: 300 # seconds; omit for fetch-once on startup
# auth:
# type: bearer
# token: $COST_API_TOKEN
- type: prometheus_metrics
url: https://internal-prometheus/
query: histogram_quantile(0.95, sum by (model_name, le) (rate(model_latency_seconds_bucket[5m])))
refresh_interval: 60
```
### Startup validation
Plano validates metric source configuration at startup and exits with a clear error if:
| Condition | Error |
|---|---|
| `prefer: cheapest` with no cost source | `prefer: cheapest requires a cost data source — add cost_metrics or digitalocean_pricing` |
| `prefer: fastest` with no `prometheus_metrics` | `prefer: fastest requires a prometheus_metrics source` |
| Two `cost_metrics` entries | `only one cost_metrics source is allowed` |
| Two `prometheus_metrics` entries | `only one prometheus_metrics source is allowed` |
| Two `digitalocean_pricing` entries | `only one digitalocean_pricing source is allowed` |
| `cost_metrics` and `digitalocean_pricing` both present | `cannot both be configured — use one or the other` |
If a model listed in `routing_preferences` has no matching entry in the fetched pricing or latency data, Plano logs a `WARN` at startup — the model is still included but ranked last. The same warning is also emitted per routing request when a model has no data in cache at decision time (relevant for inline `routing_preferences` overrides that reference models not covered by the configured metrics sources).
### cost_metrics endpoint
Plano sends a GET request to `url` on startup (and again every `refresh_interval` seconds, if set). The expected response is a JSON object mapping each model name to an object with `input_per_million` and `output_per_million` fields:
```json
{
"anthropic/claude-sonnet-4-20250514": {
"input_per_million": 3.0,
"output_per_million": 15.0
},
"openai/gpt-4o": {
"input_per_million": 5.0,
"output_per_million": 20.0
},
"openai/gpt-4o-mini": {
"input_per_million": 0.15,
"output_per_million": 0.6
}
}
```
- `auth.type: bearer` adds `Authorization: Bearer <token>` to the request
- Plano combines the two fields as `input_per_million + output_per_million` to produce a single cost scalar used for ranking
- Only relative order matters — the ranking is correct as long as every model's prices use the same unit (e.g. USD per million tokens)
### digitalocean_pricing source
Fetches public model pricing from the DigitalOcean Gen-AI catalog. No authentication required.
```yaml
model_metrics_sources:
- type: digitalocean_pricing
refresh_interval: 3600 # re-fetch every hour; omit to fetch once on startup
model_aliases:
openai-gpt-4o: openai/gpt-4o
openai-gpt-4o-mini: openai/gpt-4o-mini
anthropic-claude-sonnet-4: anthropic/claude-sonnet-4-20250514
```
DO catalog entries are stored by their `model_id` field (e.g. `openai-gpt-4o`). The cost scalar is `input_price_per_million + output_price_per_million`.
**`model_aliases`** — optional. Maps DO `model_id` values to the model names used in `routing_preferences`. Without aliases, cost data is stored under the DO model_id (e.g. `openai-gpt-4o`), which won't match models configured as `openai/gpt-4o`. Aliases let you bridge the naming gap without changing your routing config.
**Constraints:**
- `cost_metrics` and `digitalocean_pricing` cannot both be configured — use one or the other.
- Only one `digitalocean_pricing` entry is allowed.
### prometheus_metrics endpoint
Plano queries `{url}/api/v1/query?query={query}` on startup and again every `refresh_interval` seconds. The PromQL expression must return an instant vector with a `model_name` label:
```json
{
"status": "success",
"data": {
"resultType": "vector",
"result": [
{"metric": {"model_name": "anthropic/claude-sonnet-4-20250514"}, "value": [1234567890, "120.5"]},
{"metric": {"model_name": "openai/gpt-4o"}, "value": [1234567890, "200.3"]}
]
}
}
```
- The PromQL query is responsible for computing the percentile (e.g. `histogram_quantile(0.95, ...)`)
- Latency units are arbitrary — only relative order matters
- Models missing from the result are appended at the end of the ranked list
---
## Version Requirements
| Version | Top-level `routing_preferences` |
|---|---|
| `< v0.4.0` | Not allowed — startup error if present |
| `v0.4.0+` | Supported (required for model routing) |

View file

@ -36,35 +36,20 @@ model_providers:
# can select the best model for each request based on intent. Requires the # can select the best model for each request based on intent. Requires the
# Arch-Router model (or equivalent) to be configured in overrides.llm_routing_model. # Arch-Router model (or equivalent) to be configured in overrides.llm_routing_model.
# Each preference has a name (short label) and a description (used for intent matching). # Each preference has a name (short label) and a description (used for intent matching).
- model: openai/gpt-4o - model: groq/llama-3.3-70b-versatile
name: gpt-4o-coding # Optional friendly name to distinguish multiple entries for same model access_key: $GROQ_API_KEY
access_key: $OPENAI_API_KEY
routing_preferences: routing_preferences:
- name: code generation - name: code generation
description: generating new code snippets, functions, or boilerplate based on user prompts or requirements description: generating new code snippets, functions, or boilerplate based on user prompts or requirements
- name: code review - name: code review
description: reviewing, analyzing, and suggesting improvements to existing code description: reviewing, analyzing, and suggesting improvements to existing code
- model: anthropic/claude-sonnet-4-0
name: claude-sonnet-reasoning
access_key: $ANTHROPIC_API_KEY
routing_preferences:
- name: reasoning
description: complex multi-step reasoning, math, logic puzzles, and analytical tasks
# passthrough_auth: forwards the client's Authorization header upstream instead of # passthrough_auth: forwards the client's Authorization header upstream instead of
# using the configured access_key. Useful for LiteLLM or similar proxy setups. # using the configured access_key. Useful for LiteLLM or similar proxy setups.
- model: openai/gpt-4o-litellm - model: openai/gpt-4o-litellm
base_url: https://litellm.example.com base_url: https://litellm.example.com
passthrough_auth: true passthrough_auth: true
# provider_interface: specifies the API format when the provider doesn't match
# the default inferred from the model name. Supported: openai, claude, gemini,
# mistral, groq, deepseek, plano
- model: groq/llama-3.3-70b-versatile
access_key: $GROQ_API_KEY
provider_interface: groq
# Custom/self-hosted endpoint with explicit http_host override # Custom/self-hosted endpoint with explicit http_host override
- model: openai/llama-3.3-70b - model: openai/llama-3.3-70b
base_url: https://api.custom-provider.com base_url: https://api.custom-provider.com
@ -179,7 +164,7 @@ overrides:
# Trim conversation history to fit within the model's context window # Trim conversation history to fit within the model's context window
optimize_context_window: true optimize_context_window: true
# Use Plano's agent orchestrator for multi-agent request routing # Use Plano's agent orchestrator for multi-agent request routing
use_agent_orchestrator: true use_agent_orchestrator: false
# Connect timeout for upstream provider clusters (e.g., "5s", "10s"). Default: "5s" # Connect timeout for upstream provider clusters (e.g., "5s", "10s"). Default: "5s"
upstream_connect_timeout: 10s upstream_connect_timeout: 10s
# Path to the trusted CA bundle for upstream TLS verification # Path to the trusted CA bundle for upstream TLS verification

View file

@ -8,6 +8,7 @@ endpoints:
connect_timeout: 0.005s connect_timeout: 0.005s
endpoint: 127.0.0.1 endpoint: 127.0.0.1
port: 80 port: 80
protocol: http
flight_agent: flight_agent:
endpoint: localhost endpoint: localhost
port: 10520 port: 10520
@ -19,6 +20,11 @@ endpoints:
mistral_local: mistral_local:
endpoint: 127.0.0.1 endpoint: 127.0.0.1
port: 8001 port: 8001
secure_service:
endpoint: api.example.com
http_host: api.example.com
port: 443
protocol: https
weather_agent: weather_agent:
endpoint: localhost endpoint: localhost
port: 10510 port: 10510
@ -38,6 +44,9 @@ listeners:
router: plano_orchestrator_v1 router: plano_orchestrator_v1
type: agent type: agent
- address: 0.0.0.0 - address: 0.0.0.0
input_filters:
- input_guards
max_retries: 3
model_providers: model_providers:
- access_key: $OPENAI_API_KEY - access_key: $OPENAI_API_KEY
default: true default: true
@ -56,6 +65,16 @@ listeners:
model: ministral-3b-latest model: ministral-3b-latest
name: mistral/ministral-3b-latest name: mistral/ministral-3b-latest
provider_interface: mistral provider_interface: mistral
- access_key: $GROQ_API_KEY
model: llama-3.3-70b-versatile
name: groq/llama-3.3-70b-versatile
provider_interface: groq
routing_preferences:
- description: generating new code snippets, functions, or boilerplate based on
user prompts or requirements
name: code generation
- description: reviewing, analyzing, and suggesting improvements to existing code
name: code review
- base_url: https://litellm.example.com - base_url: https://litellm.example.com
cluster_name: openai_litellm.example.com cluster_name: openai_litellm.example.com
endpoint: litellm.example.com endpoint: litellm.example.com
@ -65,8 +84,21 @@ listeners:
port: 443 port: 443
protocol: https protocol: https
provider_interface: openai provider_interface: openai
- access_key: $CUSTOM_API_KEY
base_url: https://api.custom-provider.com
cluster_name: openai_api.custom-provider.com
endpoint: api.custom-provider.com
http_host: api.custom-provider.com
model: llama-3.3-70b
name: openai/llama-3.3-70b
port: 443
protocol: https
provider_interface: openai
name: model_1 name: model_1
output_filters:
- input_guards
port: 12000 port: 12000
timeout: 30s
type: model type: model
- address: 0.0.0.0 - address: 0.0.0.0
name: prompt_function_listener name: prompt_function_listener
@ -95,6 +127,16 @@ model_providers:
model: ministral-3b-latest model: ministral-3b-latest
name: mistral/ministral-3b-latest name: mistral/ministral-3b-latest
provider_interface: mistral provider_interface: mistral
- access_key: $GROQ_API_KEY
model: llama-3.3-70b-versatile
name: groq/llama-3.3-70b-versatile
provider_interface: groq
routing_preferences:
- description: generating new code snippets, functions, or boilerplate based on
user prompts or requirements
name: code generation
- description: reviewing, analyzing, and suggesting improvements to existing code
name: code review
- base_url: https://litellm.example.com - base_url: https://litellm.example.com
cluster_name: openai_litellm.example.com cluster_name: openai_litellm.example.com
endpoint: litellm.example.com endpoint: litellm.example.com
@ -104,6 +146,20 @@ model_providers:
port: 443 port: 443
protocol: https protocol: https
provider_interface: openai provider_interface: openai
- access_key: $CUSTOM_API_KEY
base_url: https://api.custom-provider.com
cluster_name: openai_api.custom-provider.com
endpoint: api.custom-provider.com
http_host: api.custom-provider.com
model: llama-3.3-70b
name: openai/llama-3.3-70b
port: 443
protocol: https
provider_interface: openai
- internal: true
model: Arch-Router
name: arch-router
provider_interface: plano
- internal: true - internal: true
model: Arch-Function model: Arch-Function
name: arch-function name: arch-function
@ -112,8 +168,22 @@ model_providers:
model: Plano-Orchestrator model: Plano-Orchestrator
name: plano/orchestrator name: plano/orchestrator
provider_interface: plano provider_interface: plano
overrides:
agent_orchestration_model: Plano-Orchestrator
llm_routing_model: Arch-Router
optimize_context_window: true
prompt_target_intent_matching_threshold: 0.7
upstream_connect_timeout: 10s
upstream_tls_ca_path: /etc/ssl/certs/ca-certificates.crt
use_agent_orchestrator: false
prompt_guards:
input_guards:
jailbreak:
on_exception:
message: I'm sorry, I can't help with that request.
prompt_targets: prompt_targets:
- description: Get current weather at a location. - auto_llm_dispatch_on_response: true
description: Get current weather at a location.
endpoint: endpoint:
http_method: POST http_method: POST
name: app_server name: app_server
@ -129,7 +199,36 @@ prompt_targets:
name: days name: days
required: true required: true
type: int type: int
system_prompt: You are a weather expert. Provide accurate and concise weather information.
ratelimits:
- limit:
tokens: 100000
unit: hour
model: openai/gpt-4o
selector:
key: x-user-id
value: '*'
- limit:
tokens: 500000
unit: day
model: openai/gpt-4o-mini
selector:
key: x-org-id
value: acme-corp
state_storage:
type: memory
system_prompt: 'You are a helpful assistant. Always respond concisely and accurately.
'
tracing: tracing:
opentracing_grpc_endpoint: http://localhost:4317 opentracing_grpc_endpoint: http://localhost:4317
random_sampling: 100 random_sampling: 100
span_attributes:
header_prefixes:
- x-user-
- x-org-
static:
environment: production
service.team: platform
trace_arch_internal: false
version: v0.3.0 version: v0.3.0