diff --git a/README.md b/README.md index db398507..b7ff7efc 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Plano pulls rote plumbing out of your framework so you can stay focused on what **Jump to our [docs](https://docs.planoai.dev)** to learn how you can use Plano to improve the speed, safety and obervability of your agentic applications. > [!IMPORTANT] -> Plano and the Arch family of LLMs (like Plano-Orchestrator-4B, Arch-Router, etc) are hosted free of charge in the US-central region to give you a great first-run developer experience of Plano. To scale and run in production, you can either run these LLMs locally or contact us on [Discord](https://discord.gg/pGZf2gcwEc) for API keys. +> Plano and the Plano family of LLMs (like Plano-Orchestrator) are hosted free of charge in the US-central region to give you a great first-run developer experience of Plano. To scale and run in production, you can either run these LLMs locally or contact us on [Discord](https://discord.gg/pGZf2gcwEc) for API keys. 
--- diff --git a/cli/planoai/config_generator.py b/cli/planoai/config_generator.py index 3ffebe09..5a3d4f63 100644 --- a/cli/planoai/config_generator.py +++ b/cli/planoai/config_generator.py @@ -372,16 +372,15 @@ def validate_and_render_schema(): # Build lookup of model names (already prefix-stripped by config processing) model_name_set = {mp.get("model") for mp in updated_model_providers} - # Auto-add arch-router provider if routing preferences exist and no provider matches the router model - router_model = overrides_config.get("llm_routing_model", "Arch-Router") - # Strip provider prefix for comparison since config processing strips prefixes from model names + # Auto-add plano-orchestrator provider if routing preferences exist and no provider matches the routing model + router_model = overrides_config.get("llm_routing_model", "Plano-Orchestrator") router_model_id = ( router_model.split("/", 1)[1] if "/" in router_model else router_model ) if len(model_usage_name_keys) > 0 and router_model_id not in model_name_set: updated_model_providers.append( { - "name": "arch-router", + "name": "plano-orchestrator", "provider_interface": "plano", "model": router_model_id, "internal": True, diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml index 95a2e5cc..d3d6a643 100644 --- a/config/plano_config_schema.yaml +++ b/config/plano_config_schema.yaml @@ -284,10 +284,13 @@ properties: description: "Path to the trusted CA bundle for upstream TLS verification. Default is '/etc/ssl/certs/ca-certificates.crt'." llm_routing_model: type: string - description: "Model name for the LLM router (e.g., 'Arch-Router'). Must match a model in model_providers." + description: "Model name for the LLM router (e.g., 'Plano-Orchestrator'). Must match a model in model_providers." agent_orchestration_model: type: string description: "Model name for the agent orchestrator (e.g., 'Plano-Orchestrator'). Must match a model in model_providers." 
+ orchestrator_model_context_length: + type: integer + description: "Maximum token length for the orchestrator/routing model context window. Default is 8192." system_prompt: type: string prompt_targets: diff --git a/crates/brightstaff/src/app_state.rs b/crates/brightstaff/src/app_state.rs index 57707f6e..e585d2db 100644 --- a/crates/brightstaff/src/app_state.rs +++ b/crates/brightstaff/src/app_state.rs @@ -5,7 +5,6 @@ use common::configuration::{Agent, FilterPipeline, Listener, ModelAlias, SpanAtt use common::llm_providers::LlmProviders; use tokio::sync::RwLock; -use crate::router::llm::RouterService; use crate::router::orchestrator::OrchestratorService; use crate::state::StateStorage; @@ -14,7 +13,6 @@ use crate::state::StateStorage; /// Instead of cloning 8+ individual `Arc`s per connection, a single /// `Arc` is cloned once and passed to the request handler. pub struct AppState { - pub router_service: Arc, pub orchestrator_service: Arc, pub model_aliases: Option>, pub llm_providers: Arc>, diff --git a/crates/brightstaff/src/handlers/agents/selector.rs b/crates/brightstaff/src/handlers/agents/selector.rs index 8225a003..e0467163 100644 --- a/crates/brightstaff/src/handlers/agents/selector.rs +++ b/crates/brightstaff/src/handlers/agents/selector.rs @@ -177,6 +177,7 @@ mod tests { "http://localhost:8080".to_string(), "test-model".to_string(), "plano-orchestrator".to_string(), + crate::router::orchestrator_model_v1::MAX_TOKEN_LEN, )) } diff --git a/crates/brightstaff/src/handlers/integration_tests.rs b/crates/brightstaff/src/handlers/integration_tests.rs index 499fbfca..c826dc50 100644 --- a/crates/brightstaff/src/handlers/integration_tests.rs +++ b/crates/brightstaff/src/handlers/integration_tests.rs @@ -23,6 +23,7 @@ mod tests { "http://localhost:8080".to_string(), "test-model".to_string(), "plano-orchestrator".to_string(), + crate::router::orchestrator_model_v1::MAX_TOKEN_LEN, )) } @@ -147,8 +148,8 @@ mod tests { #[tokio::test] async fn test_error_handling_flow() 
{ - let router_service = create_test_orchestrator_service(); - let agent_selector = AgentSelector::new(router_service); + let orchestrator_service = create_test_orchestrator_service(); + let agent_selector = AgentSelector::new(orchestrator_service); // Test listener not found let result = agent_selector.find_listener(Some("nonexistent"), &[]); diff --git a/crates/brightstaff/src/handlers/llm/mod.rs b/crates/brightstaff/src/handlers/llm/mod.rs index 5e108c56..8f00e4b6 100644 --- a/crates/brightstaff/src/handlers/llm/mod.rs +++ b/crates/brightstaff/src/handlers/llm/mod.rs @@ -22,7 +22,6 @@ pub(crate) mod model_selection; use crate::app_state::AppState; use crate::handlers::agents::pipeline::PipelineProcessor; -use crate::handlers::extract_or_generate_traceparent; use crate::handlers::extract_request_id; use crate::handlers::full; use crate::state::response_state_processor::ResponsesStateProcessor; @@ -92,22 +91,20 @@ async fn llm_chat_inner( } }); - let traceparent = extract_or_generate_traceparent(&request_headers); - // Session pinning: extract session ID and check cache before routing let session_id: Option = request_headers .get(MODEL_AFFINITY_HEADER) .and_then(|h| h.to_str().ok()) .map(|s| s.to_string()); let tenant_id: Option = state - .router_service + .orchestrator_service .tenant_header() .and_then(|hdr| request_headers.get(hdr)) .and_then(|v| v.to_str().ok()) .map(|s| s.to_string()); let pinned_model: Option = if let Some(ref sid) = session_id { state - .router_service + .orchestrator_service .get_cached_route(sid, tenant_id.as_deref()) .await .map(|c| c.model_name) @@ -287,9 +284,8 @@ async fn llm_chat_inner( let routing_result = match async { set_service_name(operation_component::ROUTING); router_chat_get_upstream_model( - Arc::clone(&state.router_service), + Arc::clone(&state.orchestrator_service), client_request, - &traceparent, &request_path, &request_id, inline_routing_preferences, @@ -315,10 +311,9 @@ async fn llm_chat_inner( 
alias_resolved_model.clone() }; - // Cache the routing decision so subsequent requests with the same session ID are pinned if let Some(ref sid) = session_id { state - .router_service + .orchestrator_service .cache_route(sid.clone(), tenant_id.as_deref(), model.clone(), route_name) .await; } diff --git a/crates/brightstaff/src/handlers/llm/model_selection.rs b/crates/brightstaff/src/handlers/llm/model_selection.rs index 1f5aea71..1b4315e7 100644 --- a/crates/brightstaff/src/handlers/llm/model_selection.rs +++ b/crates/brightstaff/src/handlers/llm/model_selection.rs @@ -5,7 +5,7 @@ use hyper::StatusCode; use std::sync::Arc; use tracing::{debug, info, warn}; -use crate::router::llm::RouterService; +use crate::router::orchestrator::OrchestratorService; use crate::streaming::truncate_message; use crate::tracing::routing; @@ -37,9 +37,8 @@ impl RoutingError { /// * `Ok(RoutingResult)` - Contains the selected model name and span ID /// * `Err(RoutingError)` - Contains error details and optional span ID pub async fn router_chat_get_upstream_model( - router_service: Arc, + orchestrator_service: Arc, client_request: ProviderRequestType, - traceparent: &str, request_path: &str, request_id: &str, inline_routing_preferences: Option>, @@ -99,11 +98,9 @@ pub async fn router_chat_get_upstream_model( // Capture start time for routing span let routing_start_time = std::time::Instant::now(); - // Attempt to determine route using the router service - let routing_result = router_service + let routing_result = orchestrator_service .determine_route( &chat_request.messages, - traceparent, inline_routing_preferences, request_id, ) diff --git a/crates/brightstaff/src/handlers/routing_service.rs b/crates/brightstaff/src/handlers/routing_service.rs index 3365b6e9..5fc0d3b9 100644 --- a/crates/brightstaff/src/handlers/routing_service.rs +++ b/crates/brightstaff/src/handlers/routing_service.rs @@ -12,7 +12,7 @@ use tracing::{debug, info, info_span, warn, Instrument}; use 
super::extract_or_generate_traceparent; use crate::handlers::llm::model_selection::router_chat_get_upstream_model; -use crate::router::llm::RouterService; +use crate::router::orchestrator::OrchestratorService; use crate::tracing::{collect_custom_trace_attributes, operation_component, set_service_name}; /// Extracts `routing_preferences` from a JSON body, returning the cleaned body bytes @@ -60,7 +60,7 @@ struct RoutingDecisionResponse { pub async fn routing_decision( request: Request, - router_service: Arc, + orchestrator_service: Arc, request_path: String, span_attributes: &Option, ) -> Result>, hyper::Error> { @@ -76,7 +76,7 @@ pub async fn routing_decision( .and_then(|h| h.to_str().ok()) .map(|s| s.to_string()); - let tenant_id: Option = router_service + let tenant_id: Option = orchestrator_service .tenant_header() .and_then(|hdr| request_headers.get(hdr)) .and_then(|v| v.to_str().ok()) @@ -94,7 +94,7 @@ pub async fn routing_decision( routing_decision_inner( request, - router_service, + orchestrator_service, request_id, request_path, request_headers, @@ -109,7 +109,7 @@ pub async fn routing_decision( #[allow(clippy::too_many_arguments)] async fn routing_decision_inner( request: Request, - router_service: Arc, + orchestrator_service: Arc, request_id: String, request_path: String, request_headers: hyper::HeaderMap, @@ -133,9 +133,8 @@ async fn routing_decision_inner( .unwrap_or("unknown") .to_string(); - // Session pinning: check cache before doing any routing work if let Some(ref sid) = session_id { - if let Some(cached) = router_service + if let Some(cached) = orchestrator_service .get_cached_route(sid, tenant_id.as_deref()) .await { @@ -202,9 +201,8 @@ async fn routing_decision_inner( }; let routing_result = router_chat_get_upstream_model( - Arc::clone(&router_service), + Arc::clone(&orchestrator_service), client_request, - &traceparent, &request_path, &request_id, inline_routing_preferences, @@ -213,9 +211,8 @@ async fn routing_decision_inner( match 
routing_result { Ok(result) => { - // Cache the result if session_id is present if let Some(ref sid) = session_id { - router_service + orchestrator_service .cache_route( sid.clone(), tenant_id.as_deref(), diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs index 73102a97..40ac429d 100644 --- a/crates/brightstaff/src/main.rs +++ b/crates/brightstaff/src/main.rs @@ -5,7 +5,6 @@ use brightstaff::handlers::function_calling::function_calling_chat_handler; use brightstaff::handlers::llm::llm_chat; use brightstaff::handlers::models::list_models; use brightstaff::handlers::routing_service::routing_decision; -use brightstaff::router::llm::RouterService; use brightstaff::router::model_metrics::ModelMetricsService; use brightstaff::router::orchestrator::OrchestratorService; use brightstaff::session_cache::init_session_cache; @@ -37,8 +36,6 @@ use tokio::sync::RwLock; use tracing::{debug, info, warn}; const BIND_ADDRESS: &str = "0.0.0.0:9091"; -const DEFAULT_ROUTING_LLM_PROVIDER: &str = "arch-router"; -const DEFAULT_ROUTING_MODEL_NAME: &str = "Arch-Router"; const DEFAULT_ORCHESTRATOR_LLM_PROVIDER: &str = "plano-orchestrator"; const DEFAULT_ORCHESTRATOR_MODEL_NAME: &str = "Plano-Orchestrator"; @@ -161,20 +158,6 @@ async fn init_app_state( let overrides = config.overrides.clone().unwrap_or_default(); - let routing_model_name: String = overrides - .llm_routing_model - .as_deref() - .map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m)) - .unwrap_or(DEFAULT_ROUTING_MODEL_NAME) - .to_string(); - - let routing_llm_provider = config - .model_providers - .iter() - .find(|p| p.model.as_deref() == Some(routing_model_name.as_str())) - .map(|p| p.name.clone()) - .unwrap_or_else(|| DEFAULT_ROUTING_LLM_PROVIDER.to_string()); - let session_ttl_seconds = config.routing.as_ref().and_then(|r| r.session_ttl_seconds); let session_cache = init_session_cache(config).await?; @@ -304,20 +287,11 @@ async fn init_app_state( .and_then(|r| r.session_cache.as_ref()) 
.and_then(|c| c.tenant_header.clone()); - let router_service = Arc::new(RouterService::new( - config.routing_preferences.clone(), - metrics_service, - format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"), - routing_model_name, - routing_llm_provider, - session_ttl_seconds, - session_cache, - session_tenant_header, - )); - + // Resolve model name: prefer llm_routing_model override, then agent_orchestration_model, then default. let orchestrator_model_name: String = overrides - .agent_orchestration_model + .llm_routing_model .as_deref() + .or(overrides.agent_orchestration_model.as_deref()) .map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m)) .unwrap_or(DEFAULT_ORCHESTRATOR_MODEL_NAME) .to_string(); @@ -329,10 +303,20 @@ async fn init_app_state( .map(|p| p.name.clone()) .unwrap_or_else(|| DEFAULT_ORCHESTRATOR_LLM_PROVIDER.to_string()); - let orchestrator_service = Arc::new(OrchestratorService::new( + let orchestrator_max_tokens = overrides + .orchestrator_model_context_length + .unwrap_or(brightstaff::router::orchestrator_model_v1::MAX_TOKEN_LEN); + + let orchestrator_service = Arc::new(OrchestratorService::with_routing( format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"), orchestrator_model_name, orchestrator_llm_provider, + config.routing_preferences.clone(), + metrics_service, + session_ttl_seconds, + session_cache, + session_tenant_header, + orchestrator_max_tokens, )); let state_storage = init_state_storage(config).await?; @@ -343,7 +327,6 @@ async fn init_app_state( .and_then(|tracing| tracing.span_attributes.clone()); Ok(AppState { - router_service, orchestrator_service, model_aliases: config.model_aliases.clone(), llm_providers: Arc::new(RwLock::new(llm_providers)), @@ -430,7 +413,7 @@ async fn route( ) { return routing_decision( req, - Arc::clone(&state.router_service), + Arc::clone(&state.orchestrator_service), stripped, &state.span_attributes, ) diff --git a/crates/brightstaff/src/router/llm.rs b/crates/brightstaff/src/router/llm.rs deleted file mode 
100644 index b1a74641..00000000 --- a/crates/brightstaff/src/router/llm.rs +++ /dev/null @@ -1,371 +0,0 @@ -use std::{borrow::Cow, collections::HashMap, sync::Arc, time::Duration}; - -use common::{ - configuration::TopLevelRoutingPreference, - consts::{ARCH_PROVIDER_HINT_HEADER, REQUEST_ID_HEADER, TRACE_PARENT_HEADER}, -}; - -use super::router_model::{ModelUsagePreference, RoutingPreference}; -use hermesllm::apis::openai::Message; -use hyper::header; -use thiserror::Error; -use tracing::{debug, info}; - -use super::http::{self, post_and_extract_content}; -use super::model_metrics::ModelMetricsService; -use super::router_model::RouterModel; - -use crate::router::router_model_v1; -use crate::session_cache::SessionCache; - -pub use crate::session_cache::CachedRoute; - -const DEFAULT_SESSION_TTL_SECONDS: u64 = 600; - -pub struct RouterService { - router_url: String, - client: reqwest::Client, - router_model: Arc, - routing_provider_name: String, - top_level_preferences: HashMap, - metrics_service: Option>, - session_cache: Arc, - session_ttl: Duration, - tenant_header: Option, -} - -#[derive(Debug, Error)] -pub enum RoutingError { - #[error(transparent)] - Http(#[from] http::HttpError), - - #[error("Router model error: {0}")] - RouterModelError(#[from] super::router_model::RoutingModelError), -} - -pub type Result = std::result::Result; - -impl RouterService { - #[allow(clippy::too_many_arguments)] - pub fn new( - top_level_prefs: Option>, - metrics_service: Option>, - router_url: String, - routing_model_name: String, - routing_provider_name: String, - session_ttl_seconds: Option, - session_cache: Arc, - tenant_header: Option, - ) -> Self { - let top_level_preferences: HashMap = top_level_prefs - .map_or_else(HashMap::new, |prefs| { - prefs.into_iter().map(|p| (p.name.clone(), p)).collect() - }); - - // Build sentinel routes for RouterModelV1: route_name → first model. 
- // RouterModelV1 uses this to build its prompt; RouterService overrides - // the model selection via rank_models() after the route is determined. - let sentinel_routes: HashMap> = top_level_preferences - .iter() - .filter_map(|(name, pref)| { - pref.models.first().map(|first_model| { - ( - first_model.clone(), - vec![RoutingPreference { - name: name.clone(), - description: pref.description.clone(), - }], - ) - }) - }) - .collect(); - - let router_model = Arc::new(router_model_v1::RouterModelV1::new( - sentinel_routes, - routing_model_name, - router_model_v1::MAX_TOKEN_LEN, - )); - - let session_ttl = - Duration::from_secs(session_ttl_seconds.unwrap_or(DEFAULT_SESSION_TTL_SECONDS)); - - RouterService { - router_url, - client: reqwest::Client::new(), - router_model, - routing_provider_name, - top_level_preferences, - metrics_service, - session_cache, - session_ttl, - tenant_header, - } - } - - /// Name of the HTTP header used to scope cache keys by tenant, if configured. - #[must_use] - pub fn tenant_header(&self) -> Option<&str> { - self.tenant_header.as_deref() - } - - /// Build the cache key, optionally scoped by tenant: `{tenant_id}:{session_id}` or `{session_id}`. - /// Returns a borrowed key when no tenant prefix is needed, avoiding an allocation. - fn session_key<'a>(tenant_id: Option<&str>, session_id: &'a str) -> Cow<'a, str> { - match tenant_id { - Some(t) => Cow::Owned(format!("{t}:{session_id}")), - None => Cow::Borrowed(session_id), - } - } - - /// Look up a cached routing decision by session ID. - /// Returns None if not found or expired. - pub async fn get_cached_route( - &self, - session_id: &str, - tenant_id: Option<&str>, - ) -> Option { - self.session_cache - .get(&Self::session_key(tenant_id, session_id)) - .await - } - - /// Store a routing decision in the session cache. 
- pub async fn cache_route( - &self, - session_id: String, - tenant_id: Option<&str>, - model_name: String, - route_name: Option, - ) { - self.session_cache - .put( - &Self::session_key(tenant_id, &session_id), - CachedRoute { - model_name, - route_name, - }, - self.session_ttl, - ) - .await; - } - - pub async fn determine_route( - &self, - messages: &[Message], - traceparent: &str, - inline_routing_preferences: Option>, - request_id: &str, - ) -> Result)>> { - if messages.is_empty() { - return Ok(None); - } - - // Build inline top-level map from request if present (inline overrides config). - let inline_top_map: Option> = - inline_routing_preferences - .map(|prefs| prefs.into_iter().map(|p| (p.name.clone(), p)).collect()); - - // No routing defined — skip the router call entirely. - if inline_top_map.is_none() && self.top_level_preferences.is_empty() { - return Ok(None); - } - - // For inline overrides, build synthetic ModelUsagePreference list so RouterModelV1 - // generates the correct prompt (route name + description pairs). - // For config-level prefs the sentinel routes are already baked into RouterModelV1. 
- let effective_usage_preferences: Option> = - inline_top_map.as_ref().map(|inline_map| { - inline_map - .values() - .map(|p| ModelUsagePreference { - model: p.models.first().cloned().unwrap_or_default(), - routing_preferences: vec![RoutingPreference { - name: p.name.clone(), - description: p.description.clone(), - }], - }) - .collect() - }); - - let router_request = self - .router_model - .generate_request(messages, &effective_usage_preferences); - - debug!( - model = %self.router_model.get_model_name(), - endpoint = %self.router_url, - "sending request to arch-router" - ); - - let body = serde_json::to_string(&router_request) - .map_err(super::router_model::RoutingModelError::from)?; - debug!(body = %body, "arch router request"); - - let mut headers = header::HeaderMap::new(); - headers.insert( - header::CONTENT_TYPE, - header::HeaderValue::from_static("application/json"), - ); - if let Ok(val) = header::HeaderValue::from_str(&self.routing_provider_name) { - headers.insert( - header::HeaderName::from_static(ARCH_PROVIDER_HINT_HEADER), - val, - ); - } - if let Ok(val) = header::HeaderValue::from_str(traceparent) { - headers.insert(header::HeaderName::from_static(TRACE_PARENT_HEADER), val); - } - if let Ok(val) = header::HeaderValue::from_str(request_id) { - headers.insert(header::HeaderName::from_static(REQUEST_ID_HEADER), val); - } - headers.insert( - header::HeaderName::from_static("model"), - header::HeaderValue::from_static("arch-router"), - ); - - let Some((content, elapsed)) = - post_and_extract_content(&self.client, &self.router_url, headers, body).await? - else { - return Ok(None); - }; - - // Parse the route name from the router response. 
- let parsed = self - .router_model - .parse_response(&content, &effective_usage_preferences)?; - - let result = if let Some((route_name, _sentinel)) = parsed { - let top_pref = inline_top_map - .as_ref() - .and_then(|m| m.get(&route_name)) - .or_else(|| self.top_level_preferences.get(&route_name)); - - if let Some(pref) = top_pref { - let ranked = match &self.metrics_service { - Some(svc) => svc.rank_models(&pref.models, &pref.selection_policy).await, - None => pref.models.clone(), - }; - Some((route_name, ranked)) - } else { - None - } - } else { - None - }; - - info!( - content = %content.replace("\n", "\\n"), - selected_model = ?result, - response_time_ms = elapsed.as_millis(), - "arch-router determined route" - ); - - Ok(result) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::session_cache::memory::MemorySessionCache; - - fn make_router_service(ttl_seconds: u64, max_entries: usize) -> RouterService { - let session_cache = Arc::new(MemorySessionCache::new(max_entries)); - RouterService::new( - None, - None, - "http://localhost:12001/v1/chat/completions".to_string(), - "Arch-Router".to_string(), - "arch-router".to_string(), - Some(ttl_seconds), - session_cache, - None, - ) - } - - #[tokio::test] - async fn test_cache_miss_returns_none() { - let svc = make_router_service(600, 100); - assert!(svc - .get_cached_route("unknown-session", None) - .await - .is_none()); - } - - #[tokio::test] - async fn test_cache_hit_returns_cached_route() { - let svc = make_router_service(600, 100); - svc.cache_route( - "s1".to_string(), - None, - "gpt-4o".to_string(), - Some("code".to_string()), - ) - .await; - - let cached = svc.get_cached_route("s1", None).await.unwrap(); - assert_eq!(cached.model_name, "gpt-4o"); - assert_eq!(cached.route_name, Some("code".to_string())); - } - - #[tokio::test] - async fn test_cache_expired_entry_returns_none() { - let svc = make_router_service(0, 100); - svc.cache_route("s1".to_string(), None, "gpt-4o".to_string(), None) - .await; 
- assert!(svc.get_cached_route("s1", None).await.is_none()); - } - - #[tokio::test] - async fn test_expired_entries_not_returned() { - let svc = make_router_service(0, 100); - svc.cache_route("s1".to_string(), None, "gpt-4o".to_string(), None) - .await; - svc.cache_route("s2".to_string(), None, "claude".to_string(), None) - .await; - - // Entries with TTL=0 should be expired immediately - assert!(svc.get_cached_route("s1", None).await.is_none()); - assert!(svc.get_cached_route("s2", None).await.is_none()); - } - - #[tokio::test] - async fn test_cache_evicts_oldest_when_full() { - let svc = make_router_service(600, 2); - svc.cache_route("s1".to_string(), None, "model-a".to_string(), None) - .await; - tokio::time::sleep(Duration::from_millis(10)).await; - svc.cache_route("s2".to_string(), None, "model-b".to_string(), None) - .await; - - svc.cache_route("s3".to_string(), None, "model-c".to_string(), None) - .await; - - // s1 should be evicted (oldest); s2 and s3 should remain - assert!(svc.get_cached_route("s1", None).await.is_none()); - assert!(svc.get_cached_route("s2", None).await.is_some()); - assert!(svc.get_cached_route("s3", None).await.is_some()); - } - - #[tokio::test] - async fn test_cache_update_existing_session_does_not_evict() { - let svc = make_router_service(600, 2); - svc.cache_route("s1".to_string(), None, "model-a".to_string(), None) - .await; - svc.cache_route("s2".to_string(), None, "model-b".to_string(), None) - .await; - - svc.cache_route( - "s1".to_string(), - None, - "model-a-updated".to_string(), - Some("route".to_string()), - ) - .await; - - // Both sessions should still be present - let s1 = svc.get_cached_route("s1", None).await.unwrap(); - assert_eq!(s1.model_name, "model-a-updated"); - assert!(svc.get_cached_route("s2", None).await.is_some()); - } -} diff --git a/crates/brightstaff/src/router/mod.rs b/crates/brightstaff/src/router/mod.rs index 2d9d00a7..2ef0d11a 100644 --- a/crates/brightstaff/src/router/mod.rs +++ 
b/crates/brightstaff/src/router/mod.rs @@ -1,8 +1,5 @@ pub(crate) mod http; -pub mod llm; pub mod model_metrics; pub mod orchestrator; pub mod orchestrator_model; pub mod orchestrator_model_v1; -pub mod router_model; -pub mod router_model_v1; diff --git a/crates/brightstaff/src/router/orchestrator.rs b/crates/brightstaff/src/router/orchestrator.rs index c75aa64b..7aaf70a2 100644 --- a/crates/brightstaff/src/router/orchestrator.rs +++ b/crates/brightstaff/src/router/orchestrator.rs @@ -1,7 +1,7 @@ -use std::{collections::HashMap, sync::Arc}; +use std::{borrow::Cow, collections::HashMap, sync::Arc, time::Duration}; use common::{ - configuration::{AgentUsagePreference, OrchestrationPreference}, + configuration::{AgentUsagePreference, OrchestrationPreference, TopLevelRoutingPreference}, consts::{ARCH_PROVIDER_HINT_HEADER, REQUEST_ID_HEADER}, }; use hermesllm::apis::openai::Message; @@ -12,15 +12,26 @@ use thiserror::Error; use tracing::{debug, info}; use super::http::{self, post_and_extract_content}; +use super::model_metrics::ModelMetricsService; use super::orchestrator_model::OrchestratorModel; use crate::router::orchestrator_model_v1; +use crate::session_cache::SessionCache; + +pub use crate::session_cache::CachedRoute; + +const DEFAULT_SESSION_TTL_SECONDS: u64 = 600; pub struct OrchestratorService { orchestrator_url: String, client: reqwest::Client, orchestrator_model: Arc, orchestrator_provider_name: String, + top_level_preferences: HashMap, + metrics_service: Option>, + session_cache: Option>, + session_ttl: Duration, + tenant_header: Option, } #[derive(Debug, Error)] @@ -39,13 +50,12 @@ impl OrchestratorService { orchestrator_url: String, orchestration_model_name: String, orchestrator_provider_name: String, + max_token_length: usize, ) -> Self { - let agent_orchestrations: HashMap> = HashMap::new(); - let orchestrator_model = Arc::new(orchestrator_model_v1::OrchestratorModelV1::new( - agent_orchestrations, - orchestration_model_name.clone(), - 
orchestrator_model_v1::MAX_TOKEN_LEN, + HashMap::new(), + orchestration_model_name, + max_token_length, )); OrchestratorService { @@ -53,9 +63,182 @@ impl OrchestratorService { client: reqwest::Client::new(), orchestrator_model, orchestrator_provider_name, + top_level_preferences: HashMap::new(), + metrics_service: None, + session_cache: None, + session_ttl: Duration::from_secs(DEFAULT_SESSION_TTL_SECONDS), + tenant_header: None, } } + #[allow(clippy::too_many_arguments)] + pub fn with_routing( + orchestrator_url: String, + orchestration_model_name: String, + orchestrator_provider_name: String, + top_level_prefs: Option>, + metrics_service: Option>, + session_ttl_seconds: Option, + session_cache: Arc, + tenant_header: Option, + max_token_length: usize, + ) -> Self { + let top_level_preferences: HashMap = top_level_prefs + .map_or_else(HashMap::new, |prefs| { + prefs.into_iter().map(|p| (p.name.clone(), p)).collect() + }); + + let orchestrator_model = Arc::new(orchestrator_model_v1::OrchestratorModelV1::new( + HashMap::new(), + orchestration_model_name, + max_token_length, + )); + + let session_ttl = + Duration::from_secs(session_ttl_seconds.unwrap_or(DEFAULT_SESSION_TTL_SECONDS)); + + OrchestratorService { + orchestrator_url, + client: reqwest::Client::new(), + orchestrator_model, + orchestrator_provider_name, + top_level_preferences, + metrics_service, + session_cache: Some(session_cache), + session_ttl, + tenant_header, + } + } + + // ---- Session cache methods ---- + + #[must_use] + pub fn tenant_header(&self) -> Option<&str> { + self.tenant_header.as_deref() + } + + fn session_key<'a>(tenant_id: Option<&str>, session_id: &'a str) -> Cow<'a, str> { + match tenant_id { + Some(t) => Cow::Owned(format!("{t}:{session_id}")), + None => Cow::Borrowed(session_id), + } + } + + pub async fn get_cached_route( + &self, + session_id: &str, + tenant_id: Option<&str>, + ) -> Option { + let cache = self.session_cache.as_ref()?; + cache.get(&Self::session_key(tenant_id, 
session_id)).await + } + + pub async fn cache_route( + &self, + session_id: String, + tenant_id: Option<&str>, + model_name: String, + route_name: Option, + ) { + if let Some(ref cache) = self.session_cache { + cache + .put( + &Self::session_key(tenant_id, &session_id), + CachedRoute { + model_name, + route_name, + }, + self.session_ttl, + ) + .await; + } + } + + // ---- LLM routing ---- + + pub async fn determine_route( + &self, + messages: &[Message], + inline_routing_preferences: Option>, + request_id: &str, + ) -> Result)>> { + if messages.is_empty() { + return Ok(None); + } + + let inline_top_map: Option> = + inline_routing_preferences + .map(|prefs| prefs.into_iter().map(|p| (p.name.clone(), p)).collect()); + + if inline_top_map.is_none() && self.top_level_preferences.is_empty() { + return Ok(None); + } + + let effective_source = inline_top_map + .as_ref() + .unwrap_or(&self.top_level_preferences); + + let effective_prefs: Vec = effective_source + .values() + .map(|p| AgentUsagePreference { + model: p.models.first().cloned().unwrap_or_default(), + orchestration_preferences: vec![OrchestrationPreference { + name: p.name.clone(), + description: p.description.clone(), + }], + }) + .collect(); + + let orchestration_result = self + .determine_orchestration( + messages, + Some(effective_prefs), + Some(request_id.to_string()), + ) + .await?; + + let result = if let Some(ref routes) = orchestration_result { + if routes.len() > 1 { + let all_routes: Vec<&str> = routes.iter().map(|(name, _)| name.as_str()).collect(); + info!( + routes = ?all_routes, + using = %all_routes.first().unwrap_or(&"none"), + "plano-orchestrator detected multiple intents, using first" + ); + } + + if let Some((route_name, _)) = routes.first() { + let top_pref = inline_top_map + .as_ref() + .and_then(|m| m.get(route_name)) + .or_else(|| self.top_level_preferences.get(route_name)); + + if let Some(pref) = top_pref { + let ranked = match &self.metrics_service { + Some(svc) => 
svc.rank_models(&pref.models, &pref.selection_policy).await, + None => pref.models.clone(), + }; + Some((route_name.clone(), ranked)) + } else { + None + } + } else { + None + } + } else { + None + }; + + info!( + selected_model = ?result, + "plano-orchestrator determined route" + ); + + Ok(result) + } + + // ---- Agent orchestration (existing) ---- + pub async fn determine_orchestration( &self, messages: &[Message], @@ -80,12 +263,12 @@ impl OrchestratorService { debug!( model = %self.orchestrator_model.get_model_name(), endpoint = %self.orchestrator_url, - "sending request to arch-orchestrator" + "sending request to plano-orchestrator" ); let body = serde_json::to_string(&orchestrator_request) .map_err(super::orchestrator_model::OrchestratorModelError::from)?; - debug!(body = %body, "arch orchestrator request"); + debug!(body = %body, "plano-orchestrator request"); let mut headers = header::HeaderMap::new(); headers.insert( @@ -98,7 +281,6 @@ impl OrchestratorService { .unwrap_or_else(|_| header::HeaderValue::from_static("plano-orchestrator")), ); - // Inject OpenTelemetry trace context from current span global::get_text_map_propagator(|propagator| { let cx = tracing_opentelemetry::OpenTelemetrySpanExt::context(&tracing::Span::current()); @@ -130,9 +312,113 @@ impl OrchestratorService { content = %content.replace("\n", "\\n"), selected_routes = ?parsed, response_time_ms = elapsed.as_millis(), - "arch-orchestrator determined routes" + "plano-orchestrator determined routes" ); Ok(parsed) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::session_cache::memory::MemorySessionCache; + + fn make_orchestrator_service(ttl_seconds: u64, max_entries: usize) -> OrchestratorService { + let session_cache = Arc::new(MemorySessionCache::new(max_entries)); + OrchestratorService::with_routing( + "http://localhost:12001/v1/chat/completions".to_string(), + "Plano-Orchestrator".to_string(), + "plano-orchestrator".to_string(), + None, + None, + Some(ttl_seconds), + 
session_cache, + None, + orchestrator_model_v1::MAX_TOKEN_LEN, + ) + } + + #[tokio::test] + async fn test_cache_miss_returns_none() { + let svc = make_orchestrator_service(600, 100); + assert!(svc + .get_cached_route("unknown-session", None) + .await + .is_none()); + } + + #[tokio::test] + async fn test_cache_hit_returns_cached_route() { + let svc = make_orchestrator_service(600, 100); + svc.cache_route( + "s1".to_string(), + None, + "gpt-4o".to_string(), + Some("code".to_string()), + ) + .await; + + let cached = svc.get_cached_route("s1", None).await.unwrap(); + assert_eq!(cached.model_name, "gpt-4o"); + assert_eq!(cached.route_name, Some("code".to_string())); + } + + #[tokio::test] + async fn test_cache_expired_entry_returns_none() { + let svc = make_orchestrator_service(0, 100); + svc.cache_route("s1".to_string(), None, "gpt-4o".to_string(), None) + .await; + assert!(svc.get_cached_route("s1", None).await.is_none()); + } + + #[tokio::test] + async fn test_expired_entries_not_returned() { + let svc = make_orchestrator_service(0, 100); + svc.cache_route("s1".to_string(), None, "gpt-4o".to_string(), None) + .await; + svc.cache_route("s2".to_string(), None, "claude".to_string(), None) + .await; + + assert!(svc.get_cached_route("s1", None).await.is_none()); + assert!(svc.get_cached_route("s2", None).await.is_none()); + } + + #[tokio::test] + async fn test_cache_evicts_oldest_when_full() { + let svc = make_orchestrator_service(600, 2); + svc.cache_route("s1".to_string(), None, "model-a".to_string(), None) + .await; + tokio::time::sleep(Duration::from_millis(10)).await; + svc.cache_route("s2".to_string(), None, "model-b".to_string(), None) + .await; + + svc.cache_route("s3".to_string(), None, "model-c".to_string(), None) + .await; + + assert!(svc.get_cached_route("s1", None).await.is_none()); + assert!(svc.get_cached_route("s2", None).await.is_some()); + assert!(svc.get_cached_route("s3", None).await.is_some()); + } + + #[tokio::test] + async fn 
test_cache_update_existing_session_does_not_evict() { + let svc = make_orchestrator_service(600, 2); + svc.cache_route("s1".to_string(), None, "model-a".to_string(), None) + .await; + svc.cache_route("s2".to_string(), None, "model-b".to_string(), None) + .await; + + svc.cache_route( + "s1".to_string(), + None, + "model-a-updated".to_string(), + Some("route".to_string()), + ) + .await; + + let s1 = svc.get_cached_route("s1", None).await.unwrap(); + assert_eq!(s1.model_name, "model-a-updated"); + assert!(svc.get_cached_route("s2", None).await.is_some()); + } +} diff --git a/crates/brightstaff/src/router/orchestrator_model.rs b/crates/brightstaff/src/router/orchestrator_model.rs index 19c78ca3..a6b32b8e 100644 --- a/crates/brightstaff/src/router/orchestrator_model.rs +++ b/crates/brightstaff/src/router/orchestrator_model.rs @@ -11,8 +11,7 @@ pub enum OrchestratorModelError { pub type Result = std::result::Result; /// OrchestratorModel trait for handling orchestration requests. -/// Unlike RouterModel which returns a single route, OrchestratorModel -/// can return multiple routes as the model output format is: +/// Returns multiple routes as the model output format is: /// {"route": ["route_name_1", "route_name_2", ...]} pub trait OrchestratorModel: Send + Sync { fn generate_request( diff --git a/crates/brightstaff/src/router/orchestrator_model_v1.rs b/crates/brightstaff/src/router/orchestrator_model_v1.rs index ec4d2d12..75e5c586 100644 --- a/crates/brightstaff/src/router/orchestrator_model_v1.rs +++ b/crates/brightstaff/src/router/orchestrator_model_v1.rs @@ -8,7 +8,7 @@ use tracing::{debug, warn}; use super::orchestrator_model::{OrchestratorModel, OrchestratorModelError}; -pub const MAX_TOKEN_LEN: usize = 2048; // Default max token length for the orchestration model +pub const MAX_TOKEN_LEN: usize = 8192; // Default max token length for the orchestration model /// Custom JSON formatter that produces spaced JSON (space after colons and commas), same as JSON in python 
struct SpacedJsonFormatter; diff --git a/crates/brightstaff/src/router/router_model.rs b/crates/brightstaff/src/router/router_model.rs deleted file mode 100644 index 4fe023a3..00000000 --- a/crates/brightstaff/src/router/router_model.rs +++ /dev/null @@ -1,39 +0,0 @@ -use hermesllm::apis::openai::{ChatCompletionsRequest, Message}; -use serde::{Deserialize, Serialize}; -use thiserror::Error; - -#[derive(Debug, Error)] -pub enum RoutingModelError { - #[error("Failed to parse JSON: {0}")] - JsonError(#[from] serde_json::Error), -} - -pub type Result = std::result::Result; - -/// Internal route descriptor passed to the router model to build its prompt. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct RoutingPreference { - pub name: String, - pub description: String, -} - -/// Groups a model with its routing preferences (used internally by RouterModelV1). -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ModelUsagePreference { - pub model: String, - pub routing_preferences: Vec, -} - -pub trait RouterModel: Send + Sync { - fn generate_request( - &self, - messages: &[Message], - usage_preferences: &Option>, - ) -> ChatCompletionsRequest; - fn parse_response( - &self, - content: &str, - usage_preferences: &Option>, - ) -> Result>; - fn get_model_name(&self) -> String; -} diff --git a/crates/brightstaff/src/router/router_model_v1.rs b/crates/brightstaff/src/router/router_model_v1.rs deleted file mode 100644 index e1189c94..00000000 --- a/crates/brightstaff/src/router/router_model_v1.rs +++ /dev/null @@ -1,842 +0,0 @@ -use std::collections::HashMap; - -use super::router_model::{ModelUsagePreference, RoutingPreference}; -use hermesllm::apis::openai::{ChatCompletionsRequest, Message, MessageContent, Role}; -use hermesllm::transforms::lib::ExtractText; -use serde::{Deserialize, Serialize}; -use tracing::{debug, warn}; - -use super::router_model::{RouterModel, RoutingModelError}; - -pub const MAX_TOKEN_LEN: usize = 2048; // Default max token length for 
the routing model -pub const ARCH_ROUTER_V1_SYSTEM_PROMPT: &str = r#" -You are a helpful assistant designed to find the best suited route. -You are provided with route description within XML tags: - -{routes} - - - -{conversation} - - -Your task is to decide which route is best suit with user intent on the conversation in XML tags. Follow the instruction: -1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}. -2. You must analyze the route descriptions and find the best match route for user latest intent. -3. You only response the name of the route that best matches the user's request, use the exact name in the . - -Based on your analysis, provide your response in the following JSON formats if you decide to match any route: -{"route": "route_name"} -"#; - -pub type Result = std::result::Result; -pub struct RouterModelV1 { - llm_route_json_str: String, - llm_route_to_model_map: HashMap, - routing_model: String, - max_token_length: usize, -} -impl RouterModelV1 { - pub fn new( - llm_routes: HashMap>, - routing_model: String, - max_token_length: usize, - ) -> Self { - let llm_route_values: Vec = - llm_routes.values().flatten().cloned().collect(); - let llm_route_json_str = - serde_json::to_string(&llm_route_values).unwrap_or_else(|_| "[]".to_string()); - let llm_route_to_model_map: HashMap = llm_routes - .iter() - .flat_map(|(model, prefs)| prefs.iter().map(|pref| (pref.name.clone(), model.clone()))) - .collect(); - - RouterModelV1 { - routing_model, - max_token_length, - llm_route_json_str, - llm_route_to_model_map, - } - } -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -struct LlmRouterResponse { - pub route: Option, -} - -const TOKEN_LENGTH_DIVISOR: usize = 4; // Approximate token length divisor for UTF-8 characters - -impl RouterModel for RouterModelV1 { - fn generate_request( - &self, - messages: &[Message], - usage_preferences_from_request: &Option>, - ) -> ChatCompletionsRequest { - 
// remove system prompt, tool calls, tool call response and messages without content - // if content is empty its likely a tool call - // when role == tool its tool call response - let messages_vec = messages - .iter() - .filter(|m| { - m.role != Role::System - && m.role != Role::Developer - && m.role != Role::Tool - && !m.content.extract_text().is_empty() - }) - .collect::>(); - - // Following code is to ensure that the conversation does not exceed max token length - // Note: we use a simple heuristic to estimate token count based on character length to optimize for performance - let mut token_count = ARCH_ROUTER_V1_SYSTEM_PROMPT.len() / TOKEN_LENGTH_DIVISOR; - let mut selected_messages_list_reversed: Vec<&Message> = vec![]; - for (selected_messsage_count, message) in messages_vec.iter().rev().enumerate() { - let message_token_count = message.content.extract_text().len() / TOKEN_LENGTH_DIVISOR; - token_count += message_token_count; - if token_count > self.max_token_length { - debug!( - token_count = token_count, - max_tokens = self.max_token_length, - selected = selected_messsage_count, - total = messages_vec.len(), - "token count exceeds max, truncating conversation" - ); - if message.role == Role::User { - // If message that exceeds max token length is from user, we need to keep it - selected_messages_list_reversed.push(message); - } - break; - } - // If we are here, it means that the message is within the max token length - selected_messages_list_reversed.push(message); - } - - if selected_messages_list_reversed.is_empty() { - debug!("no messages selected, using last message"); - if let Some(last_message) = messages_vec.last() { - selected_messages_list_reversed.push(last_message); - } - } - - // ensure that first and last selected message is from user - if let Some(first_message) = selected_messages_list_reversed.first() { - if first_message.role != Role::User { - warn!("last message is not from user, may lead to incorrect routing"); - } - } - if let 
Some(last_message) = selected_messages_list_reversed.last() { - if last_message.role != Role::User { - warn!("first message is not from user, may lead to incorrect routing"); - } - } - - // Reverse the selected messages to maintain the conversation order - let selected_conversation_list = selected_messages_list_reversed - .iter() - .rev() - .map(|message| { - Message { - role: message.role.clone(), - // we can unwrap here because we have already filtered out messages without content - content: Some(MessageContent::Text( - message - .content - .as_ref() - .map_or(String::new(), |c| c.to_string()), - )), - name: None, - tool_calls: None, - tool_call_id: None, - } - }) - .collect::>(); - - // Generate the router request message based on the usage preferences. - // If preferences are passed in request then we use them otherwise we use the default routing model preferences. - let router_message = match convert_to_router_preferences(usage_preferences_from_request) { - Some(prefs) => generate_router_message(&prefs, &selected_conversation_list), - None => generate_router_message(&self.llm_route_json_str, &selected_conversation_list), - }; - - ChatCompletionsRequest { - model: self.routing_model.clone(), - messages: vec![Message { - content: Some(MessageContent::Text(router_message)), - role: Role::User, - name: None, - tool_calls: None, - tool_call_id: None, - }], - temperature: Some(0.01), - ..Default::default() - } - } - - fn parse_response( - &self, - content: &str, - usage_preferences: &Option>, - ) -> Result> { - if content.is_empty() { - return Ok(None); - } - let router_resp_fixed = fix_json_response(content); - let router_response: LlmRouterResponse = serde_json::from_str(router_resp_fixed.as_str())?; - - let selected_route = router_response.route.unwrap_or_default().to_string(); - - if selected_route.is_empty() || selected_route == "other" { - return Ok(None); - } - - if let Some(usage_preferences) = usage_preferences { - // If usage preferences are defined, we 
need to find the model that matches the selected route - let model_name: Option = usage_preferences - .iter() - .map(|pref| { - pref.routing_preferences - .iter() - .find(|routing_pref| routing_pref.name == selected_route) - .map(|_| pref.model.clone()) - }) - .find_map(|model| model); - - if let Some(model_name) = model_name { - return Ok(Some((selected_route, model_name))); - } else { - warn!( - route = %selected_route, - preferences = ?usage_preferences, - "no matching model found for route" - ); - return Ok(None); - } - } - - // If no usage preferences are passed in request then use the default routing model preferences - if let Some(model) = self.llm_route_to_model_map.get(&selected_route).cloned() { - return Ok(Some((selected_route, model))); - } - - warn!( - route = %selected_route, - preferences = ?self.llm_route_to_model_map, - "no model found for route" - ); - - Ok(None) - } - - fn get_model_name(&self) -> String { - self.routing_model.clone() - } -} - -fn generate_router_message(prefs: &str, selected_conversation_list: &Vec) -> String { - ARCH_ROUTER_V1_SYSTEM_PROMPT - .replace("{routes}", prefs) - .replace( - "{conversation}", - &serde_json::to_string(&selected_conversation_list).unwrap_or_default(), - ) -} - -fn convert_to_router_preferences( - prefs_from_request: &Option>, -) -> Option { - if let Some(usage_preferences) = prefs_from_request { - let routing_preferences = usage_preferences - .iter() - .flat_map(|pref| { - pref.routing_preferences - .iter() - .map(|routing_pref| RoutingPreference { - name: routing_pref.name.clone(), - description: routing_pref.description.clone(), - }) - }) - .collect::>(); - - return Some(serde_json::to_string(&routing_preferences).unwrap_or_default()); - } - - None -} - -fn fix_json_response(body: &str) -> String { - let mut updated_body = body.to_string(); - - updated_body = updated_body.replace("'", "\""); - - if updated_body.contains("\\n") { - updated_body = updated_body.replace("\\n", ""); - } - - if 
updated_body.starts_with("```json") { - updated_body = updated_body - .strip_prefix("```json") - .unwrap_or(&updated_body) - .to_string(); - } - - if updated_body.ends_with("```") { - updated_body = updated_body - .strip_suffix("```") - .unwrap_or(&updated_body) - .to_string(); - } - - updated_body -} - -impl std::fmt::Debug for dyn RouterModel { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "RouterModel") - } -} - -#[cfg(test)] -mod tests { - use super::*; - use pretty_assertions::assert_eq; - - #[test] - fn test_system_prompt_format() { - let expected_prompt = r#" -You are a helpful assistant designed to find the best suited route. -You are provided with route description within XML tags: - -[{"name":"Image generation","description":"generating image"}] - - - -[{"role":"user","content":"hi"},{"role":"assistant","content":"Hello! How can I assist you today?"},{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}] - - -Your task is to decide which route is best suit with user intent on the conversation in XML tags. Follow the instruction: -1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}. -2. You must analyze the route descriptions and find the best match route for user latest intent. -3. You only response the name of the route that best matches the user's request, use the exact name in the . 
- -Based on your analysis, provide your response in the following JSON formats if you decide to match any route: -{"route": "route_name"} -"#; - let routes_str = r#" - { - "gpt-4o": [ - {"name": "Image generation", "description": "generating image"} - ] - } - "#; - let llm_routes = - serde_json::from_str::>>(routes_str).unwrap(); - let routing_model = "test-model".to_string(); - let router = RouterModelV1::new(llm_routes, routing_model, usize::MAX); - - let conversation_str = r#" - [ - { - "role": "user", - "content": "hi" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson" - } - ] - "#; - let conversation: Vec = serde_json::from_str(conversation_str).unwrap(); - - let req = router.generate_request(&conversation, &None); - - let prompt = req.messages[0].content.extract_text(); - - assert_eq!(expected_prompt, prompt); - } - - #[test] - fn test_system_prompt_format_usage_preferences() { - let expected_prompt = r#" -You are a helpful assistant designed to find the best suited route. -You are provided with route description within XML tags: - -[{"name":"code-generation","description":"generating new code snippets, functions, or boilerplate based on user prompts or requirements"}] - - - -[{"role":"user","content":"hi"},{"role":"assistant","content":"Hello! How can I assist you today?"},{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}] - - -Your task is to decide which route is best suit with user intent on the conversation in XML tags. Follow the instruction: -1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}. -2. You must analyze the route descriptions and find the best match route for user latest intent. -3. 
You only response the name of the route that best matches the user's request, use the exact name in the . - -Based on your analysis, provide your response in the following JSON formats if you decide to match any route: -{"route": "route_name"} -"#; - let routes_str = r#" - { - "gpt-4o": [ - {"name": "Image generation", "description": "generating image"} - ] - } - "#; - let llm_routes = - serde_json::from_str::>>(routes_str).unwrap(); - let routing_model = "test-model".to_string(); - let router = RouterModelV1::new(llm_routes, routing_model, usize::MAX); - - let conversation_str = r#" - [ - { - "role": "user", - "content": "hi" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson" - } - ] - "#; - let conversation: Vec = serde_json::from_str(conversation_str).unwrap(); - - let usage_preferences = Some(vec![ModelUsagePreference { - model: "claude/claude-3-7-sonnet".to_string(), - routing_preferences: vec![RoutingPreference { - name: "code-generation".to_string(), - description: "generating new code snippets, functions, or boilerplate based on user prompts or requirements".to_string(), - }], - }]); - let req = router.generate_request(&conversation, &usage_preferences); - - let prompt = req.messages[0].content.extract_text(); - - assert_eq!(expected_prompt, prompt); - } - - #[test] - fn test_conversation_exceed_token_count() { - let expected_prompt = r#" -You are a helpful assistant designed to find the best suited route. -You are provided with route description within XML tags: - -[{"name":"Image generation","description":"generating image"}] - - - -[{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}] - - -Your task is to decide which route is best suit with user intent on the conversation in XML tags. Follow the instruction: -1. 
If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}. -2. You must analyze the route descriptions and find the best match route for user latest intent. -3. You only response the name of the route that best matches the user's request, use the exact name in the . - -Based on your analysis, provide your response in the following JSON formats if you decide to match any route: -{"route": "route_name"} -"#; - - let routes_str = r#" - { - "gpt-4o": [ - {"name": "Image generation", "description": "generating image"} - ] - } - "#; - let llm_routes = - serde_json::from_str::>>(routes_str).unwrap(); - let routing_model = "test-model".to_string(); - let router = RouterModelV1::new(llm_routes, routing_model, 235); - - let conversation_str = r#" - [ - { - "role": "user", - "content": "hi" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson" - } - ] - "#; - - let conversation: Vec = serde_json::from_str(conversation_str).unwrap(); - - let req = router.generate_request(&conversation, &None); - - let prompt = req.messages[0].content.extract_text(); - - assert_eq!(expected_prompt, prompt); - } - - #[test] - fn test_conversation_exceed_token_count_large_single_message() { - let expected_prompt = r#" -You are a helpful assistant designed to find the best suited route. 
-You are provided with route description within XML tags: - -[{"name":"Image generation","description":"generating image"}] - - - -[{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson and this is a very long message that exceeds the max token length of the routing model, so it should be truncated and only the last user message should be included in the conversation for routing."}] - - -Your task is to decide which route is best suit with user intent on the conversation in XML tags. Follow the instruction: -1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}. -2. You must analyze the route descriptions and find the best match route for user latest intent. -3. You only response the name of the route that best matches the user's request, use the exact name in the . - -Based on your analysis, provide your response in the following JSON formats if you decide to match any route: -{"route": "route_name"} -"#; - - let routes_str = r#" - { - "gpt-4o": [ - {"name": "Image generation", "description": "generating image"} - ] - } - "#; - let llm_routes = - serde_json::from_str::>>(routes_str).unwrap(); - - let routing_model = "test-model".to_string(); - let router = RouterModelV1::new(llm_routes, routing_model, 200); - - let conversation_str = r#" - [ - { - "role": "user", - "content": "hi" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson and this is a very long message that exceeds the max token length of the routing model, so it should be truncated and only the last user message should be included in the conversation for routing." 
- } - ] - "#; - - let conversation: Vec = serde_json::from_str(conversation_str).unwrap(); - - let req = router.generate_request(&conversation, &None); - - let prompt = req.messages[0].content.extract_text(); - - assert_eq!(expected_prompt, prompt); - } - - #[test] - fn test_conversation_trim_upto_user_message() { - let expected_prompt = r#" -You are a helpful assistant designed to find the best suited route. -You are provided with route description within XML tags: - -[{"name":"Image generation","description":"generating image"}] - - - -[{"role":"user","content":"given the image In style of Andy Warhol"},{"role":"assistant","content":"ok here is the image"},{"role":"user","content":"pls give me another image about Bart and Lisa"}] - - -Your task is to decide which route is best suit with user intent on the conversation in XML tags. Follow the instruction: -1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}. -2. You must analyze the route descriptions and find the best match route for user latest intent. -3. You only response the name of the route that best matches the user's request, use the exact name in the . - -Based on your analysis, provide your response in the following JSON formats if you decide to match any route: -{"route": "route_name"} -"#; - - let routes_str = r#" - { - "gpt-4o": [ - {"name": "Image generation", "description": "generating image"} - ] - } - "#; - let llm_routes = - serde_json::from_str::>>(routes_str).unwrap(); - let routing_model = "test-model".to_string(); - let router = RouterModelV1::new(llm_routes, routing_model, 230); - - let conversation_str = r#" - [ - { - "role": "user", - "content": "hi" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" 
- }, - { - "role": "user", - "content": "given the image In style of Andy Warhol" - }, - { - "role": "assistant", - "content": "ok here is the image" - }, - { - "role": "user", - "content": "pls give me another image about Bart and Lisa" - } - ] - "#; - - let conversation: Vec = serde_json::from_str(conversation_str).unwrap(); - - let req = router.generate_request(&conversation, &None); - - let prompt = req.messages[0].content.extract_text(); - - assert_eq!(expected_prompt, prompt); - } - - #[test] - fn test_non_text_input() { - let expected_prompt = r#" -You are a helpful assistant designed to find the best suited route. -You are provided with route description within XML tags: - -[{"name":"Image generation","description":"generating image"}] - - - -[{"role":"user","content":"hi"},{"role":"assistant","content":"Hello! How can I assist you today?"},{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}] - - -Your task is to decide which route is best suit with user intent on the conversation in XML tags. Follow the instruction: -1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}. -2. You must analyze the route descriptions and find the best match route for user latest intent. -3. You only response the name of the route that best matches the user's request, use the exact name in the . 
- -Based on your analysis, provide your response in the following JSON formats if you decide to match any route: -{"route": "route_name"} -"#; - let routes_str = r#" - { - "gpt-4o": [ - {"name": "Image generation", "description": "generating image"} - ] - } - "#; - let llm_routes = - serde_json::from_str::>>(routes_str).unwrap(); - let routing_model = "test-model".to_string(); - let router = RouterModelV1::new(llm_routes, routing_model, usize::MAX); - - let conversation_str = r#" - [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "hi" - }, - { - "type": "image_url", - "image_url": { - "url": "https://example.com/image.png" - } - } - ] - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson" - } - ] - "#; - let conversation: Vec = serde_json::from_str(conversation_str).unwrap(); - - let req = router.generate_request(&conversation, &None); - - let prompt = req.messages[0].content.extract_text(); - - assert_eq!(expected_prompt, prompt); - } - - #[test] - fn test_skip_tool_call() { - let expected_prompt = r#" -You are a helpful assistant designed to find the best suited route. -You are provided with route description within XML tags: - -[{"name":"Image generation","description":"generating image"}] - - - -[{"role":"user","content":"What's the weather like in Tokyo?"},{"role":"assistant","content":"The current weather in Tokyo is 22°C and sunny."},{"role":"user","content":"What about in New York?"}] - - -Your task is to decide which route is best suit with user intent on the conversation in XML tags. Follow the instruction: -1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}. -2. You must analyze the route descriptions and find the best match route for user latest intent. -3. 
You only response the name of the route that best matches the user's request, use the exact name in the . - -Based on your analysis, provide your response in the following JSON formats if you decide to match any route: -{"route": "route_name"} -"#; - let routes_str = r#" - { - "gpt-4o": [ - {"name": "Image generation", "description": "generating image"} - ] - } - "#; - let llm_routes = - serde_json::from_str::>>(routes_str).unwrap(); - let routing_model = "test-model".to_string(); - let router = RouterModelV1::new(llm_routes, routing_model, usize::MAX); - - let conversation_str = r#" - [ - { - "role": "user", - "content": "What's the weather like in Tokyo?" - }, - { - "role": "assistant", - "content": "", - "tool_calls": [ - { - "id": "toolcall-abc123", - "type": "function", - "function": { - "name": "get_weather", - "arguments": "{ \"location\": \"Tokyo\" }" - } - } - ] - }, - { - "role": "tool", - "tool_call_id": "toolcall-abc123", - "content": "{ \"temperature\": \"22°C\", \"condition\": \"Sunny\" }" - }, - { - "role": "assistant", - "content": "The current weather in Tokyo is 22°C and sunny." - }, - { - "role": "user", - "content": "What about in New York?" - } - ] - "#; - - // expects conversation to look like this - - // [ - // { - // "role": "user", - // "content": "What's the weather like in Tokyo?" - // }, - // { - // "role": "assistant", - // "content": "The current weather in Tokyo is 22°C and sunny." - // }, - // { - // "role": "user", - // "content": "What about in New York?" 
- // } - // ] - - let conversation: Vec = serde_json::from_str(conversation_str).unwrap(); - - let req: ChatCompletionsRequest = router.generate_request(&conversation, &None); - - let prompt = req.messages[0].content.extract_text(); - - assert_eq!(expected_prompt, prompt); - } - - #[test] - fn test_parse_response() { - let routes_str = r#" - { - "gpt-4o": [ - {"name": "Image generation", "description": "generating image"} - ] - } - "#; - let llm_routes = - serde_json::from_str::>>(routes_str).unwrap(); - - let router = RouterModelV1::new(llm_routes, "test-model".to_string(), 2000); - - // Case 1: Valid JSON with non-empty route - let input = r#"{"route": "Image generation"}"#; - let result = router.parse_response(input, &None).unwrap(); - assert_eq!( - result, - Some(("Image generation".to_string(), "gpt-4o".to_string())) - ); - - // Case 2: Valid JSON with empty route - let input = r#"{"route": ""}"#; - let result = router.parse_response(input, &None).unwrap(); - assert_eq!(result, None); - - // Case 3: Valid JSON with null route - let input = r#"{"route": null}"#; - let result = router.parse_response(input, &None).unwrap(); - assert_eq!(result, None); - - // Case 4: JSON missing route field - let input = r#"{}"#; - let result = router.parse_response(input, &None).unwrap(); - assert_eq!(result, None); - - // Case 4.1: empty string - let input = r#""#; - let result = router.parse_response(input, &None).unwrap(); - assert_eq!(result, None); - - // Case 5: Malformed JSON - let input = r#"{"route": "route1""#; // missing closing } - let result = router.parse_response(input, &None); - assert!(result.is_err()); - - // Case 6: Single quotes and \n in JSON - let input = "{'route': 'Image generation'}\\n"; - let result = router.parse_response(input, &None).unwrap(); - assert_eq!( - result, - Some(("Image generation".to_string(), "gpt-4o".to_string())) - ); - - // Case 7: Code block marker - let input = "```json\n{\"route\": \"Image generation\"}\n```"; - let result = 
router.parse_response(input, &None).unwrap(); - assert_eq!( - result, - Some(("Image generation".to_string(), "gpt-4o".to_string())) - ); - } -} diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs index 10114274..125a986d 100644 --- a/crates/common/src/configuration.rs +++ b/crates/common/src/configuration.rs @@ -233,6 +233,7 @@ pub struct Overrides { pub use_agent_orchestrator: Option, pub llm_routing_model: Option, pub agent_orchestration_model: Option, + pub orchestrator_model_context_length: Option, } #[derive(Debug, Clone, Serialize, Deserialize, Default)] @@ -729,13 +730,6 @@ mod test { internal: None, ..Default::default() }, - LlmProvider { - name: "arch-router".to_string(), - provider_interface: LlmProviderType::Plano, - model: Some("Arch-Router".to_string()), - internal: Some(true), - ..Default::default() - }, LlmProvider { name: "plano-orchestrator".to_string(), provider_interface: LlmProviderType::Plano, @@ -747,13 +741,10 @@ mod test { let models = providers.into_models(); - // Should only have 1 model: openai-gpt4 assert_eq!(models.data.len(), 1); - // Verify internal models are excluded from /v1/models let model_ids: Vec = models.data.iter().map(|m| m.id.clone()).collect(); assert!(model_ids.contains(&"openai-gpt4".to_string())); - assert!(!model_ids.contains(&"arch-router".to_string())); assert!(!model_ids.contains(&"plano-orchestrator".to_string())); } } diff --git a/demos/llm_routing/claude_code_router/pretty_model_resolution.sh b/demos/llm_routing/claude_code_router/pretty_model_resolution.sh index b6187e65..3902a63e 100644 --- a/demos/llm_routing/claude_code_router/pretty_model_resolution.sh +++ b/demos/llm_routing/claude_code_router/pretty_model_resolution.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # Pretty-print Plano MODEL_RESOLUTION lines from docker logs -# - hides Arch-Router +# - hides Plano-Orchestrator # - prints timestamp # - colors MODEL_RESOLUTION red # - colors req_model cyan @@ -9,7 +9,7 @@ docker logs 
-f plano 2>&1 \ | awk ' -/MODEL_RESOLUTION:/ && $0 !~ /Arch-Router/ { +/MODEL_RESOLUTION:/ && $0 !~ /Plano-Orchestrator/ { # extract timestamp between first [ and ] ts="" if (match($0, /\[[0-9-]+ [0-9:.]+\]/)) { diff --git a/demos/llm_routing/codex_router/pretty_model_resolution.sh b/demos/llm_routing/codex_router/pretty_model_resolution.sh index b6187e65..3902a63e 100644 --- a/demos/llm_routing/codex_router/pretty_model_resolution.sh +++ b/demos/llm_routing/codex_router/pretty_model_resolution.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # Pretty-print Plano MODEL_RESOLUTION lines from docker logs -# - hides Arch-Router +# - hides Plano-Orchestrator # - prints timestamp # - colors MODEL_RESOLUTION red # - colors req_model cyan @@ -9,7 +9,7 @@ docker logs -f plano 2>&1 \ | awk ' -/MODEL_RESOLUTION:/ && $0 !~ /Arch-Router/ { +/MODEL_RESOLUTION:/ && $0 !~ /Plano-Orchestrator/ { # extract timestamp between first [ and ] ts="" if (match($0, /\[[0-9-]+ [0-9:.]+\]/)) { diff --git a/demos/llm_routing/model_routing_service/README.md b/demos/llm_routing/model_routing_service/README.md index 4687b47c..eaec32c7 100644 --- a/demos/llm_routing/model_routing_service/README.md +++ b/demos/llm_routing/model_routing_service/README.md @@ -6,7 +6,7 @@ Plano is an AI-native proxy and data plane for agentic apps — with built-in or ┌───────────┐ ┌─────────────────────────────────┐ ┌──────────────┐ │ Client │ ───► │ Plano │ ───► │ OpenAI │ │ (any │ │ │ │ Anthropic │ -│ language)│ │ Arch-Router (1.5B model) │ │ Any Provider│ +│ language)│ │ Plano-Orchestrator │ │ Any Provider│ └───────────┘ │ analyzes intent → picks model │ └──────────────┘ └─────────────────────────────────┘ ``` @@ -39,17 +39,17 @@ routing_preferences: When a request arrives, Plano: -1. Sends the conversation + route descriptions to Arch-Router for intent classification +1. Sends the conversation + route descriptions to Plano-Orchestrator for intent classification 2. 
Looks up the matched route and returns its candidate models 3. Returns an ordered list — client uses `models[0]`, falls back to `models[1]` on 429/5xx ``` 1. Request arrives → "Write binary search in Python" -2. Arch-Router classifies → route: "code_generation" +2. Plano-Orchestrator classifies → route: "code_generation" 3. Response → models: ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o"] ``` -No match? Arch-Router returns `null` route → client falls back to the model in the original request. +No match? Plano-Orchestrator returns an empty route → client falls back to the model in the original request. The `/routing/v1/*` endpoints return the routing decision **without** forwarding to the LLM — useful for testing routing behavior before going to production. @@ -163,9 +163,9 @@ routing: Without the `X-Model-Affinity` header, routing runs fresh every time (no breaking change). -## Kubernetes Deployment (Self-hosted Arch-Router on GPU) +## Kubernetes Deployment (Self-hosted Plano-Orchestrator on GPU) -To run Arch-Router in-cluster using vLLM instead of the default hosted endpoint: +To run Plano-Orchestrator in-cluster using vLLM instead of the default hosted endpoint: **0. Check your GPU node labels and taints** @@ -176,10 +176,10 @@ kubectl get node -o jsonpath='{.spec.taints}' GPU nodes commonly have a `nvidia.com/gpu:NoSchedule` taint — `vllm-deployment.yaml` includes a matching toleration. If you have multiple GPU node pools and need to pin to a specific one, uncomment and set the `nodeSelector` in `vllm-deployment.yaml` using the label for your cloud provider. -**1. Deploy Arch-Router and Plano:** +**1. Deploy Plano-Orchestrator and Plano:** ```bash -# arch-router deployment +# plano-orchestrator deployment kubectl apply -f vllm-deployment.yaml # plano deployment @@ -197,8 +197,8 @@ kubectl apply -f plano-deployment.yaml **3. 
Wait for both pods to be ready:** ```bash -# Arch-Router downloads the model (~1 min) then vLLM loads it (~2 min) -kubectl get pods -l app=arch-router -w +# Plano-Orchestrator downloads the model (~1 min) then vLLM loads it (~2 min) +kubectl get pods -l app=plano-orchestrator -w kubectl rollout status deployment/plano ``` @@ -209,10 +209,10 @@ kubectl port-forward svc/plano 12000:12000 ./demo.sh ``` -To confirm requests are hitting your in-cluster Arch-Router (not just health checks): +To confirm requests are hitting your in-cluster Plano-Orchestrator (not just health checks): ```bash -kubectl logs -l app=arch-router -f --tail=0 +kubectl logs -l app=plano-orchestrator -f --tail=0 # Look for POST /v1/chat/completions entries ``` diff --git a/demos/llm_routing/model_routing_service/config_k8s.yaml b/demos/llm_routing/model_routing_service/config_k8s.yaml index bdf98bfa..49f452a9 100644 --- a/demos/llm_routing/model_routing_service/config_k8s.yaml +++ b/demos/llm_routing/model_routing_service/config_k8s.yaml @@ -1,7 +1,7 @@ version: v0.3.0 overrides: - llm_routing_model: plano/Arch-Router + llm_routing_model: plano/Plano-Orchestrator listeners: - type: model @@ -10,8 +10,8 @@ listeners: model_providers: - - model: plano/Arch-Router - base_url: http://arch-router:10000 + - model: plano/Plano-Orchestrator + base_url: http://plano-orchestrator:10000 - model: openai/gpt-4o-mini access_key: $OPENAI_API_KEY diff --git a/demos/llm_routing/model_routing_service/vllm-deployment.yaml b/demos/llm_routing/model_routing_service/vllm-deployment.yaml index 1debe15e..b384b1c4 100644 --- a/demos/llm_routing/model_routing_service/vllm-deployment.yaml +++ b/demos/llm_routing/model_routing_service/vllm-deployment.yaml @@ -1,18 +1,18 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: arch-router + name: plano-orchestrator labels: - app: arch-router + app: plano-orchestrator spec: replicas: 1 selector: matchLabels: - app: arch-router + app: plano-orchestrator template: metadata: 
labels: - app: arch-router + app: plano-orchestrator spec: tolerations: - key: nvidia.com/gpu @@ -53,7 +53,7 @@ spec: - "--tokenizer" - "katanemo/Arch-Router-1.5B" - "--served-model-name" - - "Arch-Router" + - "Plano-Orchestrator" - "--gpu-memory-utilization" - "0.3" - "--tensor-parallel-size" @@ -94,10 +94,10 @@ spec: apiVersion: v1 kind: Service metadata: - name: arch-router + name: plano-orchestrator spec: selector: - app: arch-router + app: plano-orchestrator ports: - name: http port: 10000 diff --git a/demos/llm_routing/openclaw_routing/config.yaml b/demos/llm_routing/openclaw_routing/config.yaml index 9690e747..aed0a2c1 100644 --- a/demos/llm_routing/openclaw_routing/config.yaml +++ b/demos/llm_routing/openclaw_routing/config.yaml @@ -1,7 +1,7 @@ version: v0.1.0 overrides: - llm_routing_model: Arch-Router + llm_routing_model: Plano-Orchestrator listeners: egress_traffic: diff --git a/demos/llm_routing/preference_based_routing/README.md b/demos/llm_routing/preference_based_routing/README.md index 7b8d3b25..533e4906 100644 --- a/demos/llm_routing/preference_based_routing/README.md +++ b/demos/llm_routing/preference_based_routing/README.md @@ -32,9 +32,9 @@ planoai up config.yaml 3. Test with curl or open AnythingLLM http://localhost:3001/ -## Running with local Arch-Router (via Ollama) +## Running with local routing model (via Ollama) -By default, Plano uses a hosted Arch-Router endpoint. To self-host Arch-Router locally using Ollama: +By default, Plano uses a hosted Plano-Orchestrator endpoint. To self-host a routing model locally using Ollama: 1. 
Install [Ollama](https://ollama.ai) and pull the model: ```bash diff --git a/demos/llm_routing/preference_based_routing/test_router_endpoint.rest b/demos/llm_routing/preference_based_routing/test_router_endpoint.rest index 72686a70..13a3f924 100644 --- a/demos/llm_routing/preference_based_routing/test_router_endpoint.rest +++ b/demos/llm_routing/preference_based_routing/test_router_endpoint.rest @@ -22,11 +22,11 @@ Content-Type: application/json ### get model list from arch-function GET https://archfc.katanemo.dev/v1/models HTTP/1.1 -model: Arch-Router +model: Plano-Orchestrator -### get model list from Arch-Router (notice model header) +### get model list from Plano-Orchestrator (notice model header) GET https://archfc.katanemo.dev/v1/models HTTP/1.1 -model: Arch-Router +model: Plano-Orchestrator ### test try code generating diff --git a/docs/source/guides/llm_router.rst b/docs/source/guides/llm_router.rst index 25b78db5..5539dddc 100644 --- a/docs/source/guides/llm_router.rst +++ b/docs/source/guides/llm_router.rst @@ -133,16 +133,16 @@ Clients use semantic names: .. _preference_aligned_routing: -Preference-aligned routing (Arch-Router) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Preference-aligned routing (Plano-Orchestrator) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Preference-aligned routing uses the `Arch-Router `_ model to pick the best LLM based on domain, action, and your configured preferences instead of hard-coding a model. +Preference-aligned routing uses the `Plano-Orchestrator `_ model to pick the best LLM based on domain, action, and your configured preferences instead of hard-coding a model. - **Domain**: High-level topic of the request (e.g., legal, healthcare, programming). - **Action**: What the user wants to do (e.g., summarize, generate code, translate). - **Routing preferences**: Your mapping from (domain, action) to preferred models. 
-Arch-Router analyzes each prompt to infer domain and action, then applies your preferences to select a model. This decouples **routing policy** (how to choose) from **model assignment** (what to run), making routing transparent, controllable, and easy to extend as you add or swap models. +Plano-Orchestrator analyzes each prompt to infer domain and action, then applies your preferences to select a model. This decouples **routing policy** (how to choose) from **model assignment** (what to run), making routing transparent, controllable, and easy to extend as you add or swap models. Configuration ^^^^^^^^^^^^^ @@ -187,21 +187,21 @@ Clients can let the router decide or still specify aliases: .. code-block:: python - # Let Arch-Router choose based on content + # Let Plano-Orchestrator choose based on content response = client.chat.completions.create( messages=[{"role": "user", "content": "Write a creative story about space exploration"}] # No model specified - router will analyze and choose claude-sonnet-4-5 ) -Arch-Router ------------ -The `Arch-Router `_ is a state-of-the-art **preference-based routing model** specifically designed to address the limitations of traditional LLM routing. This compact 1.5B model delivers production-ready performance with low latency and high accuracy while solving key routing challenges. +Plano-Orchestrator +------------------- +Plano-Orchestrator is a **preference-based routing model** specifically designed to address the limitations of traditional LLM routing. It delivers production-ready performance with low latency and high accuracy while solving key routing challenges. **Addressing Traditional Routing Limitations:** **Human Preference Alignment** -Unlike benchmark-driven approaches, Arch-Router learns to match queries with human preferences by using domain-action mappings that capture subjective evaluation criteria, ensuring routing decisions align with real-world user needs. 
+Unlike benchmark-driven approaches, Plano-Orchestrator learns to match queries with human preferences by using domain-action mappings that capture subjective evaluation criteria, ensuring routing decisions align with real-world user needs. **Flexible Model Integration** The system supports seamlessly adding new models for routing without requiring retraining or architectural modifications, enabling dynamic adaptation to evolving model landscapes. @@ -209,15 +209,15 @@ The system supports seamlessly adding new models for routing without requiring r **Preference-Encoded Routing** Provides a practical mechanism to encode user preferences through domain-action mappings, offering transparent and controllable routing decisions that can be customized for specific use cases. -To support effective routing, Arch-Router introduces two key concepts: +To support effective routing, Plano-Orchestrator introduces two key concepts: - **Domain** – the high-level thematic category or subject matter of a request (e.g., legal, healthcare, programming). - **Action** – the specific type of operation the user wants performed (e.g., summarization, code generation, booking appointment, translation). -Both domain and action configs are associated with preferred models or model variants. At inference time, Arch-Router analyzes the incoming prompt to infer its domain and action using semantic similarity, task indicators, and contextual cues. It then applies the user-defined routing preferences to select the model best suited to handle the request. +Both domain and action configs are associated with preferred models or model variants. At inference time, Plano-Orchestrator analyzes the incoming prompt to infer its domain and action using semantic similarity, task indicators, and contextual cues. It then applies the user-defined routing preferences to select the model best suited to handle the request. 
-In summary, Arch-Router demonstrates: +In summary, Plano-Orchestrator demonstrates: - **Structured Preference Routing**: Aligns prompt request with model strengths using explicit domain–action mappings. @@ -228,10 +228,10 @@ In summary, Arch-Router demonstrates: - **Production-Ready Performance**: Optimized for low-latency, high-throughput applications in multi-model environments. -Self-hosting Arch-Router ------------------------- +Self-hosting Plano-Orchestrator +------------------------------- -By default, Plano uses a hosted Arch-Router endpoint. To run Arch-Router locally, you can serve the model yourself using either **Ollama** or **vLLM**. +By default, Plano uses a hosted Plano-Orchestrator endpoint. To run Plano-Orchestrator locally, you can serve the model yourself using either **Ollama** or **vLLM**. Using Ollama (recommended for local development) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -240,7 +240,7 @@ Using Ollama (recommended for local development) Download and install from `ollama.ai `_. -2. **Pull and serve Arch-Router** +2. **Pull and serve the routing model** .. code-block:: bash @@ -249,7 +249,7 @@ Using Ollama (recommended for local development) This downloads the quantized GGUF model from HuggingFace and starts serving on ``http://localhost:11434``. -3. **Configure Plano to use local Arch-Router** +3. **Configure Plano to use local routing model** .. code-block:: yaml @@ -313,7 +313,7 @@ vLLM provides higher throughput and GPU optimizations suitable for production de --load-format gguf \ --chat-template ${SNAPSHOT_DIR}template.jinja \ --tokenizer katanemo/Arch-Router-1.5B \ - --served-model-name Arch-Router \ + --served-model-name Plano-Orchestrator \ --gpu-memory-utilization 0.3 \ --tensor-parallel-size 1 \ --enable-prefix-caching @@ -323,10 +323,10 @@ vLLM provides higher throughput and GPU optimizations suitable for production de .. 
code-block:: yaml overrides: - llm_routing_model: plano/Arch-Router + llm_routing_model: plano/Plano-Orchestrator model_providers: - - model: plano/Arch-Router + - model: plano/Plano-Orchestrator base_url: http://<your-server-ip>:10000 - model: openai/gpt-5.2 @@ -350,14 +350,14 @@ vLLM provides higher throughput and GPU optimizations suitable for production de Using vLLM on Kubernetes (GPU nodes) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For teams running Kubernetes, Arch-Router and Plano can be deployed as in-cluster services. +For teams running Kubernetes, Plano-Orchestrator and Plano can be deployed as in-cluster services. The ``demos/llm_routing/model_routing_service/`` directory includes ready-to-use manifests: -- ``vllm-deployment.yaml`` — Arch-Router served by vLLM, with an init container to download +- ``vllm-deployment.yaml`` — Plano-Orchestrator served by vLLM, with an init container to download the model from HuggingFace -- ``plano-deployment.yaml`` — Plano proxy configured to use the in-cluster Arch-Router +- ``plano-deployment.yaml`` — Plano proxy configured to use the in-cluster Plano-Orchestrator - ``config_k8s.yaml`` — Plano config with ``llm_routing_model`` pointing at - ``http://arch-router:10000`` instead of the default hosted endpoint + ``http://plano-orchestrator:10000`` instead of the default hosted endpoint Key things to know before deploying: @@ -504,7 +504,7 @@ This configuration allows clients to: Example Use Cases ----------------- -Here are common scenarios where Arch-Router excels: +Here are common scenarios where Plano-Orchestrator excels: - **Coding Tasks**: Distinguish between code generation requests ("write a Python function"), debugging needs ("fix this error"), and code optimization ("make this faster"), routing each to appropriately specialized models. 
@@ -545,10 +545,10 @@ Best practices Unsupported Features -------------------- -The following features are **not supported** by the Arch-Router model: +The following features are **not supported** by the Plano-Orchestrator routing model: - **Multi-modality**: The model is not trained to process raw image or audio inputs. It can handle textual queries *about* these modalities (e.g., "generate an image of a cat"), but cannot interpret encoded multimedia data directly. -- **Function calling**: Arch-Router is designed for **semantic preference matching**, not exact intent classification or tool execution. For structured function invocation, use models in the Plano Function Calling collection instead. +- **Function calling**: Plano-Orchestrator is designed for **semantic preference matching**, not exact intent classification or tool execution. For structured function invocation, use models in the Plano Function Calling collection instead. -- **System prompt dependency**: Arch-Router routes based solely on the user’s conversation history. It does not use or rely on system prompts for routing decisions. +- **System prompt dependency**: Plano-Orchestrator routes based solely on the user’s conversation history. It does not use or rely on system prompts for routing decisions. diff --git a/docs/source/resources/includes/plano_config_full_reference.yaml b/docs/source/resources/includes/plano_config_full_reference.yaml index e9c89175..1d544727 100644 --- a/docs/source/resources/includes/plano_config_full_reference.yaml +++ b/docs/source/resources/includes/plano_config_full_reference.yaml @@ -34,7 +34,7 @@ model_providers: # routing_preferences: tags a model with named capabilities so Plano's LLM router # can select the best model for each request based on intent. Requires the - # Arch-Router model (or equivalent) to be configured in overrides.llm_routing_model. + # Plano-Orchestrator model (or equivalent) to be configured in overrides.llm_routing_model. 
# Each preference has a name (short label) and a description (used for intent matching). - model: groq/llama-3.3-70b-versatile access_key: $GROQ_API_KEY @@ -170,7 +170,7 @@ overrides: # Path to the trusted CA bundle for upstream TLS verification upstream_tls_ca_path: /etc/ssl/certs/ca-certificates.crt # Model used for intent-based LLM routing (must be listed in model_providers) - llm_routing_model: Arch-Router + llm_routing_model: Plano-Orchestrator # Model used for agent orchestration (must be listed in model_providers) agent_orchestration_model: Plano-Orchestrator diff --git a/docs/source/resources/includes/plano_config_full_reference_rendered.yaml b/docs/source/resources/includes/plano_config_full_reference_rendered.yaml index 8b1fb26b..4992ce3b 100644 --- a/docs/source/resources/includes/plano_config_full_reference_rendered.yaml +++ b/docs/source/resources/includes/plano_config_full_reference_rendered.yaml @@ -157,8 +157,8 @@ model_providers: protocol: https provider_interface: openai - internal: true - model: Arch-Router - name: arch-router + model: Plano-Orchestrator + name: plano-orchestrator provider_interface: plano - internal: true model: Arch-Function @@ -170,7 +170,7 @@ model_providers: provider_interface: plano overrides: agent_orchestration_model: Plano-Orchestrator - llm_routing_model: Arch-Router + llm_routing_model: Plano-Orchestrator optimize_context_window: true prompt_target_intent_matching_threshold: 0.7 upstream_connect_timeout: 10s