use plano-orchestrator for LLM routing, remove arch-router

Replace RouterService/RouterModelV1 (arch-router prompt) with
OrchestratorService/OrchestratorModelV1 (plano-orchestrator prompt)
for LLM routing. This ensures the correct system prompt is used when
llm_routing_model points at a Plano-Orchestrator model.

- Extend OrchestratorService with session caching, ModelMetricsService,
  top-level routing preferences, and determine_route() for LLM routing
- Delete RouterService, RouterModel trait, RouterModelV1, and
  ARCH_ROUTER_V1_SYSTEM_PROMPT
- Unify defaults to Plano-Orchestrator / plano-orchestrator
- Update CLI config generator, demos, docs, and config schema

Made-with: Cursor
This commit is contained in:
Adil Hafeez 2026-04-15 13:11:17 -07:00
parent 980faef6be
commit af724fcc1e
27 changed files with 380 additions and 1412 deletions

View file

@ -37,7 +37,7 @@ Plano pulls rote plumbing out of your framework so you can stay focused on what
**Jump to our [docs](https://docs.planoai.dev)** to learn how you can use Plano to improve the speed, safety and observability of your agentic applications.
> [!IMPORTANT]
> Plano and the Arch family of LLMs (like Plano-Orchestrator-4B, Arch-Router, etc) are hosted free of charge in the US-central region to give you a great first-run developer experience of Plano. To scale and run in production, you can either run these LLMs locally or contact us on [Discord](https://discord.gg/pGZf2gcwEc) for API keys.
> Plano and the Plano family of LLMs (like Plano-Orchestrator) are hosted free of charge in the US-central region to give you a great first-run developer experience of Plano. To scale and run in production, you can either run these LLMs locally or contact us on [Discord](https://discord.gg/pGZf2gcwEc) for API keys.
---

View file

@ -372,16 +372,15 @@ def validate_and_render_schema():
# Build lookup of model names (already prefix-stripped by config processing)
model_name_set = {mp.get("model") for mp in updated_model_providers}
# Auto-add arch-router provider if routing preferences exist and no provider matches the router model
router_model = overrides_config.get("llm_routing_model", "Arch-Router")
# Strip provider prefix for comparison since config processing strips prefixes from model names
# Auto-add plano-orchestrator provider if routing preferences exist and no provider matches the routing model
router_model = overrides_config.get("llm_routing_model", "Plano-Orchestrator")
router_model_id = (
router_model.split("/", 1)[1] if "/" in router_model else router_model
)
if len(model_usage_name_keys) > 0 and router_model_id not in model_name_set:
updated_model_providers.append(
{
"name": "arch-router",
"name": "plano-orchestrator",
"provider_interface": "plano",
"model": router_model_id,
"internal": True,

View file

@ -284,7 +284,7 @@ properties:
description: "Path to the trusted CA bundle for upstream TLS verification. Default is '/etc/ssl/certs/ca-certificates.crt'."
llm_routing_model:
type: string
description: "Model name for the LLM router (e.g., 'Arch-Router'). Must match a model in model_providers."
description: "Model name for the LLM router (e.g., 'Plano-Orchestrator'). Must match a model in model_providers."
agent_orchestration_model:
type: string
description: "Model name for the agent orchestrator (e.g., 'Plano-Orchestrator'). Must match a model in model_providers."

View file

@ -5,7 +5,6 @@ use common::configuration::{Agent, FilterPipeline, Listener, ModelAlias, SpanAtt
use common::llm_providers::LlmProviders;
use tokio::sync::RwLock;
use crate::router::llm::RouterService;
use crate::router::orchestrator::OrchestratorService;
use crate::state::StateStorage;
@ -14,7 +13,6 @@ use crate::state::StateStorage;
/// Instead of cloning 8+ individual `Arc`s per connection, a single
/// `Arc<AppState>` is cloned once and passed to the request handler.
pub struct AppState {
pub router_service: Arc<RouterService>,
pub orchestrator_service: Arc<OrchestratorService>,
pub model_aliases: Option<HashMap<String, ModelAlias>>,
pub llm_providers: Arc<RwLock<LlmProviders>>,

View file

@ -147,8 +147,8 @@ mod tests {
#[tokio::test]
async fn test_error_handling_flow() {
let router_service = create_test_orchestrator_service();
let agent_selector = AgentSelector::new(router_service);
let orchestrator_service = create_test_orchestrator_service();
let agent_selector = AgentSelector::new(orchestrator_service);
// Test listener not found
let result = agent_selector.find_listener(Some("nonexistent"), &[]);

View file

@ -22,7 +22,6 @@ pub(crate) mod model_selection;
use crate::app_state::AppState;
use crate::handlers::agents::pipeline::PipelineProcessor;
use crate::handlers::extract_or_generate_traceparent;
use crate::handlers::extract_request_id;
use crate::handlers::full;
use crate::state::response_state_processor::ResponsesStateProcessor;
@ -92,22 +91,20 @@ async fn llm_chat_inner(
}
});
let traceparent = extract_or_generate_traceparent(&request_headers);
// Session pinning: extract session ID and check cache before routing
let session_id: Option<String> = request_headers
.get(MODEL_AFFINITY_HEADER)
.and_then(|h| h.to_str().ok())
.map(|s| s.to_string());
let tenant_id: Option<String> = state
.router_service
.orchestrator_service
.tenant_header()
.and_then(|hdr| request_headers.get(hdr))
.and_then(|v| v.to_str().ok())
.map(|s| s.to_string());
let pinned_model: Option<String> = if let Some(ref sid) = session_id {
state
.router_service
.orchestrator_service
.get_cached_route(sid, tenant_id.as_deref())
.await
.map(|c| c.model_name)
@ -287,9 +284,8 @@ async fn llm_chat_inner(
let routing_result = match async {
set_service_name(operation_component::ROUTING);
router_chat_get_upstream_model(
Arc::clone(&state.router_service),
Arc::clone(&state.orchestrator_service),
client_request,
&traceparent,
&request_path,
&request_id,
inline_routing_preferences,
@ -315,10 +311,9 @@ async fn llm_chat_inner(
alias_resolved_model.clone()
};
// Cache the routing decision so subsequent requests with the same session ID are pinned
if let Some(ref sid) = session_id {
state
.router_service
.orchestrator_service
.cache_route(sid.clone(), tenant_id.as_deref(), model.clone(), route_name)
.await;
}

View file

@ -5,7 +5,7 @@ use hyper::StatusCode;
use std::sync::Arc;
use tracing::{debug, info, warn};
use crate::router::llm::RouterService;
use crate::router::orchestrator::OrchestratorService;
use crate::streaming::truncate_message;
use crate::tracing::routing;
@ -37,9 +37,8 @@ impl RoutingError {
/// * `Ok(RoutingResult)` - Contains the selected model name and span ID
/// * `Err(RoutingError)` - Contains error details and optional span ID
pub async fn router_chat_get_upstream_model(
router_service: Arc<RouterService>,
orchestrator_service: Arc<OrchestratorService>,
client_request: ProviderRequestType,
traceparent: &str,
request_path: &str,
request_id: &str,
inline_routing_preferences: Option<Vec<TopLevelRoutingPreference>>,
@ -99,11 +98,9 @@ pub async fn router_chat_get_upstream_model(
// Capture start time for routing span
let routing_start_time = std::time::Instant::now();
// Attempt to determine route using the router service
let routing_result = router_service
let routing_result = orchestrator_service
.determine_route(
&chat_request.messages,
traceparent,
inline_routing_preferences,
request_id,
)

View file

@ -12,7 +12,7 @@ use tracing::{debug, info, info_span, warn, Instrument};
use super::extract_or_generate_traceparent;
use crate::handlers::llm::model_selection::router_chat_get_upstream_model;
use crate::router::llm::RouterService;
use crate::router::orchestrator::OrchestratorService;
use crate::tracing::{collect_custom_trace_attributes, operation_component, set_service_name};
/// Extracts `routing_preferences` from a JSON body, returning the cleaned body bytes
@ -60,7 +60,7 @@ struct RoutingDecisionResponse {
pub async fn routing_decision(
request: Request<hyper::body::Incoming>,
router_service: Arc<RouterService>,
orchestrator_service: Arc<OrchestratorService>,
request_path: String,
span_attributes: &Option<SpanAttributes>,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
@ -76,7 +76,7 @@ pub async fn routing_decision(
.and_then(|h| h.to_str().ok())
.map(|s| s.to_string());
let tenant_id: Option<String> = router_service
let tenant_id: Option<String> = orchestrator_service
.tenant_header()
.and_then(|hdr| request_headers.get(hdr))
.and_then(|v| v.to_str().ok())
@ -94,7 +94,7 @@ pub async fn routing_decision(
routing_decision_inner(
request,
router_service,
orchestrator_service,
request_id,
request_path,
request_headers,
@ -109,7 +109,7 @@ pub async fn routing_decision(
#[allow(clippy::too_many_arguments)]
async fn routing_decision_inner(
request: Request<hyper::body::Incoming>,
router_service: Arc<RouterService>,
orchestrator_service: Arc<OrchestratorService>,
request_id: String,
request_path: String,
request_headers: hyper::HeaderMap,
@ -133,9 +133,8 @@ async fn routing_decision_inner(
.unwrap_or("unknown")
.to_string();
// Session pinning: check cache before doing any routing work
if let Some(ref sid) = session_id {
if let Some(cached) = router_service
if let Some(cached) = orchestrator_service
.get_cached_route(sid, tenant_id.as_deref())
.await
{
@ -202,9 +201,8 @@ async fn routing_decision_inner(
};
let routing_result = router_chat_get_upstream_model(
Arc::clone(&router_service),
Arc::clone(&orchestrator_service),
client_request,
&traceparent,
&request_path,
&request_id,
inline_routing_preferences,
@ -213,9 +211,8 @@ async fn routing_decision_inner(
match routing_result {
Ok(result) => {
// Cache the result if session_id is present
if let Some(ref sid) = session_id {
router_service
orchestrator_service
.cache_route(
sid.clone(),
tenant_id.as_deref(),

View file

@ -5,7 +5,6 @@ use brightstaff::handlers::function_calling::function_calling_chat_handler;
use brightstaff::handlers::llm::llm_chat;
use brightstaff::handlers::models::list_models;
use brightstaff::handlers::routing_service::routing_decision;
use brightstaff::router::llm::RouterService;
use brightstaff::router::model_metrics::ModelMetricsService;
use brightstaff::router::orchestrator::OrchestratorService;
use brightstaff::session_cache::init_session_cache;
@ -37,8 +36,6 @@ use tokio::sync::RwLock;
use tracing::{debug, info, warn};
const BIND_ADDRESS: &str = "0.0.0.0:9091";
const DEFAULT_ROUTING_LLM_PROVIDER: &str = "arch-router";
const DEFAULT_ROUTING_MODEL_NAME: &str = "Arch-Router";
const DEFAULT_ORCHESTRATOR_LLM_PROVIDER: &str = "plano-orchestrator";
const DEFAULT_ORCHESTRATOR_MODEL_NAME: &str = "Plano-Orchestrator";
@ -161,20 +158,6 @@ async fn init_app_state(
let overrides = config.overrides.clone().unwrap_or_default();
let routing_model_name: String = overrides
.llm_routing_model
.as_deref()
.map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m))
.unwrap_or(DEFAULT_ROUTING_MODEL_NAME)
.to_string();
let routing_llm_provider = config
.model_providers
.iter()
.find(|p| p.model.as_deref() == Some(routing_model_name.as_str()))
.map(|p| p.name.clone())
.unwrap_or_else(|| DEFAULT_ROUTING_LLM_PROVIDER.to_string());
let session_ttl_seconds = config.routing.as_ref().and_then(|r| r.session_ttl_seconds);
let session_cache = init_session_cache(config).await?;
@ -304,20 +287,11 @@ async fn init_app_state(
.and_then(|r| r.session_cache.as_ref())
.and_then(|c| c.tenant_header.clone());
let router_service = Arc::new(RouterService::new(
config.routing_preferences.clone(),
metrics_service,
format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"),
routing_model_name,
routing_llm_provider,
session_ttl_seconds,
session_cache,
session_tenant_header,
));
// Resolve model name: prefer llm_routing_model override, then agent_orchestration_model, then default.
let orchestrator_model_name: String = overrides
.agent_orchestration_model
.llm_routing_model
.as_deref()
.or(overrides.agent_orchestration_model.as_deref())
.map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m))
.unwrap_or(DEFAULT_ORCHESTRATOR_MODEL_NAME)
.to_string();
@ -329,10 +303,15 @@ async fn init_app_state(
.map(|p| p.name.clone())
.unwrap_or_else(|| DEFAULT_ORCHESTRATOR_LLM_PROVIDER.to_string());
let orchestrator_service = Arc::new(OrchestratorService::new(
let orchestrator_service = Arc::new(OrchestratorService::with_routing(
format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"),
orchestrator_model_name,
orchestrator_llm_provider,
config.routing_preferences.clone(),
metrics_service,
session_ttl_seconds,
session_cache,
session_tenant_header,
));
let state_storage = init_state_storage(config).await?;
@ -343,7 +322,6 @@ async fn init_app_state(
.and_then(|tracing| tracing.span_attributes.clone());
Ok(AppState {
router_service,
orchestrator_service,
model_aliases: config.model_aliases.clone(),
llm_providers: Arc::new(RwLock::new(llm_providers)),
@ -430,7 +408,7 @@ async fn route(
) {
return routing_decision(
req,
Arc::clone(&state.router_service),
Arc::clone(&state.orchestrator_service),
stripped,
&state.span_attributes,
)

View file

@ -1,371 +0,0 @@
use std::{borrow::Cow, collections::HashMap, sync::Arc, time::Duration};
use common::{
configuration::TopLevelRoutingPreference,
consts::{ARCH_PROVIDER_HINT_HEADER, REQUEST_ID_HEADER, TRACE_PARENT_HEADER},
};
use super::router_model::{ModelUsagePreference, RoutingPreference};
use hermesllm::apis::openai::Message;
use hyper::header;
use thiserror::Error;
use tracing::{debug, info};
use super::http::{self, post_and_extract_content};
use super::model_metrics::ModelMetricsService;
use super::router_model::RouterModel;
use crate::router::router_model_v1;
use crate::session_cache::SessionCache;
pub use crate::session_cache::CachedRoute;
const DEFAULT_SESSION_TTL_SECONDS: u64 = 600;
pub struct RouterService {
router_url: String,
client: reqwest::Client,
router_model: Arc<dyn RouterModel>,
routing_provider_name: String,
top_level_preferences: HashMap<String, TopLevelRoutingPreference>,
metrics_service: Option<Arc<ModelMetricsService>>,
session_cache: Arc<dyn SessionCache>,
session_ttl: Duration,
tenant_header: Option<String>,
}
#[derive(Debug, Error)]
pub enum RoutingError {
#[error(transparent)]
Http(#[from] http::HttpError),
#[error("Router model error: {0}")]
RouterModelError(#[from] super::router_model::RoutingModelError),
}
pub type Result<T> = std::result::Result<T, RoutingError>;
impl RouterService {
#[allow(clippy::too_many_arguments)]
pub fn new(
top_level_prefs: Option<Vec<TopLevelRoutingPreference>>,
metrics_service: Option<Arc<ModelMetricsService>>,
router_url: String,
routing_model_name: String,
routing_provider_name: String,
session_ttl_seconds: Option<u64>,
session_cache: Arc<dyn SessionCache>,
tenant_header: Option<String>,
) -> Self {
let top_level_preferences: HashMap<String, TopLevelRoutingPreference> = top_level_prefs
.map_or_else(HashMap::new, |prefs| {
prefs.into_iter().map(|p| (p.name.clone(), p)).collect()
});
// Build sentinel routes for RouterModelV1: route_name → first model.
// RouterModelV1 uses this to build its prompt; RouterService overrides
// the model selection via rank_models() after the route is determined.
let sentinel_routes: HashMap<String, Vec<RoutingPreference>> = top_level_preferences
.iter()
.filter_map(|(name, pref)| {
pref.models.first().map(|first_model| {
(
first_model.clone(),
vec![RoutingPreference {
name: name.clone(),
description: pref.description.clone(),
}],
)
})
})
.collect();
let router_model = Arc::new(router_model_v1::RouterModelV1::new(
sentinel_routes,
routing_model_name,
router_model_v1::MAX_TOKEN_LEN,
));
let session_ttl =
Duration::from_secs(session_ttl_seconds.unwrap_or(DEFAULT_SESSION_TTL_SECONDS));
RouterService {
router_url,
client: reqwest::Client::new(),
router_model,
routing_provider_name,
top_level_preferences,
metrics_service,
session_cache,
session_ttl,
tenant_header,
}
}
/// Name of the HTTP header used to scope cache keys by tenant, if configured.
#[must_use]
pub fn tenant_header(&self) -> Option<&str> {
self.tenant_header.as_deref()
}
/// Build the cache key, optionally scoped by tenant: `{tenant_id}:{session_id}` or `{session_id}`.
/// Returns a borrowed key when no tenant prefix is needed, avoiding an allocation.
fn session_key<'a>(tenant_id: Option<&str>, session_id: &'a str) -> Cow<'a, str> {
match tenant_id {
Some(t) => Cow::Owned(format!("{t}:{session_id}")),
None => Cow::Borrowed(session_id),
}
}
/// Look up a cached routing decision by session ID.
/// Returns None if not found or expired.
pub async fn get_cached_route(
&self,
session_id: &str,
tenant_id: Option<&str>,
) -> Option<CachedRoute> {
self.session_cache
.get(&Self::session_key(tenant_id, session_id))
.await
}
/// Store a routing decision in the session cache.
pub async fn cache_route(
&self,
session_id: String,
tenant_id: Option<&str>,
model_name: String,
route_name: Option<String>,
) {
self.session_cache
.put(
&Self::session_key(tenant_id, &session_id),
CachedRoute {
model_name,
route_name,
},
self.session_ttl,
)
.await;
}
pub async fn determine_route(
&self,
messages: &[Message],
traceparent: &str,
inline_routing_preferences: Option<Vec<TopLevelRoutingPreference>>,
request_id: &str,
) -> Result<Option<(String, Vec<String>)>> {
if messages.is_empty() {
return Ok(None);
}
// Build inline top-level map from request if present (inline overrides config).
let inline_top_map: Option<HashMap<String, TopLevelRoutingPreference>> =
inline_routing_preferences
.map(|prefs| prefs.into_iter().map(|p| (p.name.clone(), p)).collect());
// No routing defined — skip the router call entirely.
if inline_top_map.is_none() && self.top_level_preferences.is_empty() {
return Ok(None);
}
// For inline overrides, build synthetic ModelUsagePreference list so RouterModelV1
// generates the correct prompt (route name + description pairs).
// For config-level prefs the sentinel routes are already baked into RouterModelV1.
let effective_usage_preferences: Option<Vec<ModelUsagePreference>> =
inline_top_map.as_ref().map(|inline_map| {
inline_map
.values()
.map(|p| ModelUsagePreference {
model: p.models.first().cloned().unwrap_or_default(),
routing_preferences: vec![RoutingPreference {
name: p.name.clone(),
description: p.description.clone(),
}],
})
.collect()
});
let router_request = self
.router_model
.generate_request(messages, &effective_usage_preferences);
debug!(
model = %self.router_model.get_model_name(),
endpoint = %self.router_url,
"sending request to arch-router"
);
let body = serde_json::to_string(&router_request)
.map_err(super::router_model::RoutingModelError::from)?;
debug!(body = %body, "arch router request");
let mut headers = header::HeaderMap::new();
headers.insert(
header::CONTENT_TYPE,
header::HeaderValue::from_static("application/json"),
);
if let Ok(val) = header::HeaderValue::from_str(&self.routing_provider_name) {
headers.insert(
header::HeaderName::from_static(ARCH_PROVIDER_HINT_HEADER),
val,
);
}
if let Ok(val) = header::HeaderValue::from_str(traceparent) {
headers.insert(header::HeaderName::from_static(TRACE_PARENT_HEADER), val);
}
if let Ok(val) = header::HeaderValue::from_str(request_id) {
headers.insert(header::HeaderName::from_static(REQUEST_ID_HEADER), val);
}
headers.insert(
header::HeaderName::from_static("model"),
header::HeaderValue::from_static("arch-router"),
);
let Some((content, elapsed)) =
post_and_extract_content(&self.client, &self.router_url, headers, body).await?
else {
return Ok(None);
};
// Parse the route name from the router response.
let parsed = self
.router_model
.parse_response(&content, &effective_usage_preferences)?;
let result = if let Some((route_name, _sentinel)) = parsed {
let top_pref = inline_top_map
.as_ref()
.and_then(|m| m.get(&route_name))
.or_else(|| self.top_level_preferences.get(&route_name));
if let Some(pref) = top_pref {
let ranked = match &self.metrics_service {
Some(svc) => svc.rank_models(&pref.models, &pref.selection_policy).await,
None => pref.models.clone(),
};
Some((route_name, ranked))
} else {
None
}
} else {
None
};
info!(
content = %content.replace("\n", "\\n"),
selected_model = ?result,
response_time_ms = elapsed.as_millis(),
"arch-router determined route"
);
Ok(result)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::session_cache::memory::MemorySessionCache;
fn make_router_service(ttl_seconds: u64, max_entries: usize) -> RouterService {
let session_cache = Arc::new(MemorySessionCache::new(max_entries));
RouterService::new(
None,
None,
"http://localhost:12001/v1/chat/completions".to_string(),
"Arch-Router".to_string(),
"arch-router".to_string(),
Some(ttl_seconds),
session_cache,
None,
)
}
#[tokio::test]
async fn test_cache_miss_returns_none() {
let svc = make_router_service(600, 100);
assert!(svc
.get_cached_route("unknown-session", None)
.await
.is_none());
}
#[tokio::test]
async fn test_cache_hit_returns_cached_route() {
let svc = make_router_service(600, 100);
svc.cache_route(
"s1".to_string(),
None,
"gpt-4o".to_string(),
Some("code".to_string()),
)
.await;
let cached = svc.get_cached_route("s1", None).await.unwrap();
assert_eq!(cached.model_name, "gpt-4o");
assert_eq!(cached.route_name, Some("code".to_string()));
}
#[tokio::test]
async fn test_cache_expired_entry_returns_none() {
let svc = make_router_service(0, 100);
svc.cache_route("s1".to_string(), None, "gpt-4o".to_string(), None)
.await;
assert!(svc.get_cached_route("s1", None).await.is_none());
}
#[tokio::test]
async fn test_expired_entries_not_returned() {
let svc = make_router_service(0, 100);
svc.cache_route("s1".to_string(), None, "gpt-4o".to_string(), None)
.await;
svc.cache_route("s2".to_string(), None, "claude".to_string(), None)
.await;
// Entries with TTL=0 should be expired immediately
assert!(svc.get_cached_route("s1", None).await.is_none());
assert!(svc.get_cached_route("s2", None).await.is_none());
}
#[tokio::test]
async fn test_cache_evicts_oldest_when_full() {
let svc = make_router_service(600, 2);
svc.cache_route("s1".to_string(), None, "model-a".to_string(), None)
.await;
tokio::time::sleep(Duration::from_millis(10)).await;
svc.cache_route("s2".to_string(), None, "model-b".to_string(), None)
.await;
svc.cache_route("s3".to_string(), None, "model-c".to_string(), None)
.await;
// s1 should be evicted (oldest); s2 and s3 should remain
assert!(svc.get_cached_route("s1", None).await.is_none());
assert!(svc.get_cached_route("s2", None).await.is_some());
assert!(svc.get_cached_route("s3", None).await.is_some());
}
#[tokio::test]
async fn test_cache_update_existing_session_does_not_evict() {
let svc = make_router_service(600, 2);
svc.cache_route("s1".to_string(), None, "model-a".to_string(), None)
.await;
svc.cache_route("s2".to_string(), None, "model-b".to_string(), None)
.await;
svc.cache_route(
"s1".to_string(),
None,
"model-a-updated".to_string(),
Some("route".to_string()),
)
.await;
// Both sessions should still be present
let s1 = svc.get_cached_route("s1", None).await.unwrap();
assert_eq!(s1.model_name, "model-a-updated");
assert!(svc.get_cached_route("s2", None).await.is_some());
}
}

View file

@ -1,8 +1,5 @@
pub(crate) mod http;
pub mod llm;
pub mod model_metrics;
pub mod orchestrator;
pub mod orchestrator_model;
pub mod orchestrator_model_v1;
pub mod router_model;
pub mod router_model_v1;

View file

@ -1,7 +1,7 @@
use std::{collections::HashMap, sync::Arc};
use std::{borrow::Cow, collections::HashMap, sync::Arc, time::Duration};
use common::{
configuration::{AgentUsagePreference, OrchestrationPreference},
configuration::{AgentUsagePreference, OrchestrationPreference, TopLevelRoutingPreference},
consts::{ARCH_PROVIDER_HINT_HEADER, REQUEST_ID_HEADER},
};
use hermesllm::apis::openai::Message;
@ -12,15 +12,26 @@ use thiserror::Error;
use tracing::{debug, info};
use super::http::{self, post_and_extract_content};
use super::model_metrics::ModelMetricsService;
use super::orchestrator_model::OrchestratorModel;
use crate::router::orchestrator_model_v1;
use crate::session_cache::SessionCache;
pub use crate::session_cache::CachedRoute;
const DEFAULT_SESSION_TTL_SECONDS: u64 = 600;
pub struct OrchestratorService {
orchestrator_url: String,
client: reqwest::Client,
orchestrator_model: Arc<dyn OrchestratorModel>,
orchestrator_provider_name: String,
top_level_preferences: HashMap<String, TopLevelRoutingPreference>,
metrics_service: Option<Arc<ModelMetricsService>>,
session_cache: Option<Arc<dyn SessionCache>>,
session_ttl: Duration,
tenant_header: Option<String>,
}
#[derive(Debug, Error)]
@ -40,11 +51,9 @@ impl OrchestratorService {
orchestration_model_name: String,
orchestrator_provider_name: String,
) -> Self {
let agent_orchestrations: HashMap<String, Vec<OrchestrationPreference>> = HashMap::new();
let orchestrator_model = Arc::new(orchestrator_model_v1::OrchestratorModelV1::new(
agent_orchestrations,
orchestration_model_name.clone(),
HashMap::new(),
orchestration_model_name,
orchestrator_model_v1::MAX_TOKEN_LEN,
));
@ -53,9 +62,172 @@ impl OrchestratorService {
client: reqwest::Client::new(),
orchestrator_model,
orchestrator_provider_name,
top_level_preferences: HashMap::new(),
metrics_service: None,
session_cache: None,
session_ttl: Duration::from_secs(DEFAULT_SESSION_TTL_SECONDS),
tenant_header: None,
}
}
#[allow(clippy::too_many_arguments)]
pub fn with_routing(
orchestrator_url: String,
orchestration_model_name: String,
orchestrator_provider_name: String,
top_level_prefs: Option<Vec<TopLevelRoutingPreference>>,
metrics_service: Option<Arc<ModelMetricsService>>,
session_ttl_seconds: Option<u64>,
session_cache: Arc<dyn SessionCache>,
tenant_header: Option<String>,
) -> Self {
let top_level_preferences: HashMap<String, TopLevelRoutingPreference> = top_level_prefs
.map_or_else(HashMap::new, |prefs| {
prefs.into_iter().map(|p| (p.name.clone(), p)).collect()
});
let orchestrator_model = Arc::new(orchestrator_model_v1::OrchestratorModelV1::new(
HashMap::new(),
orchestration_model_name,
orchestrator_model_v1::MAX_TOKEN_LEN,
));
let session_ttl =
Duration::from_secs(session_ttl_seconds.unwrap_or(DEFAULT_SESSION_TTL_SECONDS));
OrchestratorService {
orchestrator_url,
client: reqwest::Client::new(),
orchestrator_model,
orchestrator_provider_name,
top_level_preferences,
metrics_service,
session_cache: Some(session_cache),
session_ttl,
tenant_header,
}
}
// ---- Session cache methods ----
#[must_use]
pub fn tenant_header(&self) -> Option<&str> {
self.tenant_header.as_deref()
}
fn session_key<'a>(tenant_id: Option<&str>, session_id: &'a str) -> Cow<'a, str> {
match tenant_id {
Some(t) => Cow::Owned(format!("{t}:{session_id}")),
None => Cow::Borrowed(session_id),
}
}
pub async fn get_cached_route(
&self,
session_id: &str,
tenant_id: Option<&str>,
) -> Option<CachedRoute> {
let cache = self.session_cache.as_ref()?;
cache.get(&Self::session_key(tenant_id, session_id)).await
}
pub async fn cache_route(
&self,
session_id: String,
tenant_id: Option<&str>,
model_name: String,
route_name: Option<String>,
) {
if let Some(ref cache) = self.session_cache {
cache
.put(
&Self::session_key(tenant_id, &session_id),
CachedRoute {
model_name,
route_name,
},
self.session_ttl,
)
.await;
}
}
// ---- LLM routing ----
pub async fn determine_route(
&self,
messages: &[Message],
inline_routing_preferences: Option<Vec<TopLevelRoutingPreference>>,
request_id: &str,
) -> Result<Option<(String, Vec<String>)>> {
if messages.is_empty() {
return Ok(None);
}
let inline_top_map: Option<HashMap<String, TopLevelRoutingPreference>> =
inline_routing_preferences
.map(|prefs| prefs.into_iter().map(|p| (p.name.clone(), p)).collect());
if inline_top_map.is_none() && self.top_level_preferences.is_empty() {
return Ok(None);
}
let effective_source = inline_top_map
.as_ref()
.unwrap_or(&self.top_level_preferences);
let effective_prefs: Vec<AgentUsagePreference> = effective_source
.values()
.map(|p| AgentUsagePreference {
model: p.models.first().cloned().unwrap_or_default(),
orchestration_preferences: vec![OrchestrationPreference {
name: p.name.clone(),
description: p.description.clone(),
}],
})
.collect();
let orchestration_result = self
.determine_orchestration(
messages,
Some(effective_prefs),
Some(request_id.to_string()),
)
.await?;
let result = if let Some(routes) = orchestration_result {
if let Some((route_name, _)) = routes.first() {
let top_pref = inline_top_map
.as_ref()
.and_then(|m| m.get(route_name))
.or_else(|| self.top_level_preferences.get(route_name));
if let Some(pref) = top_pref {
let ranked = match &self.metrics_service {
Some(svc) => svc.rank_models(&pref.models, &pref.selection_policy).await,
None => pref.models.clone(),
};
Some((route_name.clone(), ranked))
} else {
None
}
} else {
None
}
} else {
None
};
info!(
selected_model = ?result,
"plano-orchestrator determined route"
);
Ok(result)
}
// ---- Agent orchestration (existing) ----
pub async fn determine_orchestration(
&self,
messages: &[Message],
@ -80,12 +252,12 @@ impl OrchestratorService {
debug!(
model = %self.orchestrator_model.get_model_name(),
endpoint = %self.orchestrator_url,
"sending request to arch-orchestrator"
"sending request to plano-orchestrator"
);
let body = serde_json::to_string(&orchestrator_request)
.map_err(super::orchestrator_model::OrchestratorModelError::from)?;
debug!(body = %body, "arch orchestrator request");
debug!(body = %body, "plano-orchestrator request");
let mut headers = header::HeaderMap::new();
headers.insert(
@ -98,7 +270,6 @@ impl OrchestratorService {
.unwrap_or_else(|_| header::HeaderValue::from_static("plano-orchestrator")),
);
// Inject OpenTelemetry trace context from current span
global::get_text_map_propagator(|propagator| {
let cx =
tracing_opentelemetry::OpenTelemetrySpanExt::context(&tracing::Span::current());
@ -130,9 +301,112 @@ impl OrchestratorService {
content = %content.replace("\n", "\\n"),
selected_routes = ?parsed,
response_time_ms = elapsed.as_millis(),
"arch-orchestrator determined routes"
"plano-orchestrator determined routes"
);
Ok(parsed)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::session_cache::memory::MemorySessionCache;
fn make_orchestrator_service(ttl_seconds: u64, max_entries: usize) -> OrchestratorService {
let session_cache = Arc::new(MemorySessionCache::new(max_entries));
OrchestratorService::with_routing(
"http://localhost:12001/v1/chat/completions".to_string(),
"Plano-Orchestrator".to_string(),
"plano-orchestrator".to_string(),
None,
None,
Some(ttl_seconds),
session_cache,
None,
)
}
#[tokio::test]
async fn test_cache_miss_returns_none() {
let svc = make_orchestrator_service(600, 100);
assert!(svc
.get_cached_route("unknown-session", None)
.await
.is_none());
}
#[tokio::test]
async fn test_cache_hit_returns_cached_route() {
let svc = make_orchestrator_service(600, 100);
svc.cache_route(
"s1".to_string(),
None,
"gpt-4o".to_string(),
Some("code".to_string()),
)
.await;
let cached = svc.get_cached_route("s1", None).await.unwrap();
assert_eq!(cached.model_name, "gpt-4o");
assert_eq!(cached.route_name, Some("code".to_string()));
}
#[tokio::test]
async fn test_cache_expired_entry_returns_none() {
let svc = make_orchestrator_service(0, 100);
svc.cache_route("s1".to_string(), None, "gpt-4o".to_string(), None)
.await;
assert!(svc.get_cached_route("s1", None).await.is_none());
}
#[tokio::test]
async fn test_expired_entries_not_returned() {
let svc = make_orchestrator_service(0, 100);
svc.cache_route("s1".to_string(), None, "gpt-4o".to_string(), None)
.await;
svc.cache_route("s2".to_string(), None, "claude".to_string(), None)
.await;
assert!(svc.get_cached_route("s1", None).await.is_none());
assert!(svc.get_cached_route("s2", None).await.is_none());
}
#[tokio::test]
async fn test_cache_evicts_oldest_when_full() {
let svc = make_orchestrator_service(600, 2);
svc.cache_route("s1".to_string(), None, "model-a".to_string(), None)
.await;
tokio::time::sleep(Duration::from_millis(10)).await;
svc.cache_route("s2".to_string(), None, "model-b".to_string(), None)
.await;
svc.cache_route("s3".to_string(), None, "model-c".to_string(), None)
.await;
assert!(svc.get_cached_route("s1", None).await.is_none());
assert!(svc.get_cached_route("s2", None).await.is_some());
assert!(svc.get_cached_route("s3", None).await.is_some());
}
#[tokio::test]
async fn test_cache_update_existing_session_does_not_evict() {
    // Re-caching an existing session overwrites it in place; with capacity 2
    // the other resident session must survive.
    let service = make_orchestrator_service(600, 2);
    service
        .cache_route("s1".to_string(), None, "model-a".to_string(), None)
        .await;
    service
        .cache_route("s2".to_string(), None, "model-b".to_string(), None)
        .await;
    service
        .cache_route(
            "s1".to_string(),
            None,
            "model-a-updated".to_string(),
            Some("route".to_string()),
        )
        .await;
    let updated = service
        .get_cached_route("s1", None)
        .await
        .expect("s1 should still be cached");
    assert_eq!(updated.model_name, "model-a-updated");
    assert!(service.get_cached_route("s2", None).await.is_some());
}
}

View file

@ -11,8 +11,7 @@ pub enum OrchestratorModelError {
pub type Result<T> = std::result::Result<T, OrchestratorModelError>;
/// OrchestratorModel trait for handling orchestration requests.
/// Unlike RouterModel which returns a single route, OrchestratorModel
/// can return multiple routes as the model output format is:
/// Returns multiple routes as the model output format is:
/// {"route": ["route_name_1", "route_name_2", ...]}
pub trait OrchestratorModel: Send + Sync {
fn generate_request(

View file

@ -1,39 +0,0 @@
use hermesllm::apis::openai::{ChatCompletionsRequest, Message};
use serde::{Deserialize, Serialize};
use thiserror::Error;
/// Errors produced while generating or parsing router-model requests.
#[derive(Debug, Error)]
pub enum RoutingModelError {
    // Raised when the model's reply cannot be deserialized as JSON.
    #[error("Failed to parse JSON: {0}")]
    JsonError(#[from] serde_json::Error),
}
/// Convenience alias used throughout the routing-model module.
pub type Result<T> = std::result::Result<T, RoutingModelError>;
/// Internal route descriptor passed to the router model to build its prompt.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RoutingPreference {
    // Route identifier the model is expected to echo back verbatim.
    pub name: String,
    // Natural-language description used by the model for intent matching.
    pub description: String,
}
/// Groups a model with its routing preferences (used internally by RouterModelV1).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelUsagePreference {
    // Target model to use when one of its routing preferences matches.
    pub model: String,
    // Routes this model is the preferred handler for.
    pub routing_preferences: Vec<RoutingPreference>,
}
/// Contract implemented by routing models: build the classification request
/// sent to the routing LLM and interpret its reply.
pub trait RouterModel: Send + Sync {
    /// Builds the chat-completions request that asks the model to pick a route
    /// for the given conversation. Preferences supplied in the request take
    /// precedence over the implementation's configured routes.
    fn generate_request(
        &self,
        messages: &[Message],
        usage_preferences: &Option<Vec<ModelUsagePreference>>,
    ) -> ChatCompletionsRequest;
    /// Parses the model's reply into `Some((route_name, model_name))`, or
    /// `None` when no route matched.
    fn parse_response(
        &self,
        content: &str,
        usage_preferences: &Option<Vec<ModelUsagePreference>>,
    ) -> Result<Option<(String, String)>>;
    /// Name of the underlying routing model.
    fn get_model_name(&self) -> String;
}

View file

@ -1,842 +0,0 @@
use std::collections::HashMap;
use super::router_model::{ModelUsagePreference, RoutingPreference};
use hermesllm::apis::openai::{ChatCompletionsRequest, Message, MessageContent, Role};
use hermesllm::transforms::lib::ExtractText;
use serde::{Deserialize, Serialize};
use tracing::{debug, warn};
use super::router_model::{RouterModel, RoutingModelError};
pub const MAX_TOKEN_LEN: usize = 2048; // Default max token length for the routing model
/// Prompt template for the v1 router; `{routes}` and `{conversation}` are
/// substituted by `generate_router_message` before the request is sent.
pub const ARCH_ROUTER_V1_SYSTEM_PROMPT: &str = r#"
You are a helpful assistant designed to find the best suited route.
You are provided with route description within <routes></routes> XML tags:
<routes>
{routes}
</routes>
<conversation>
{conversation}
</conversation>
Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags. Follow the instruction:
1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
2. You must analyze the route descriptions and find the best match route for user latest intent.
3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.
Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
{"route": "route_name"}
"#;
/// Convenience alias for routing-model results in this module.
pub type Result<T> = std::result::Result<T, RoutingModelError>;
/// v1 routing model: renders the configured routes and the conversation into
/// the `ARCH_ROUTER_V1_SYSTEM_PROMPT` template and maps replies back to models.
pub struct RouterModelV1 {
    // JSON array of all route descriptors, pre-serialized for the prompt.
    llm_route_json_str: String,
    // Maps each route name back to the model that declared it.
    llm_route_to_model_map: HashMap<String, String>,
    // Name of the LLM used to perform routing.
    routing_model: String,
    // Approximate token budget for the conversation included in the prompt.
    max_token_length: usize,
}
impl RouterModelV1 {
    /// Builds a v1 router from a map of model name -> routing preferences.
    pub fn new(
        llm_routes: HashMap<String, Vec<RoutingPreference>>,
        routing_model: String,
        max_token_length: usize,
    ) -> Self {
        // Flatten every preference into one list and serialize it once for the prompt.
        let all_prefs: Vec<RoutingPreference> = llm_routes
            .values()
            .flat_map(|prefs| prefs.iter().cloned())
            .collect();
        let llm_route_json_str =
            serde_json::to_string(&all_prefs).unwrap_or_else(|_| "[]".to_string());
        // Invert the mapping: route name -> owning model.
        let mut llm_route_to_model_map: HashMap<String, String> = HashMap::new();
        for (model, prefs) in &llm_routes {
            for pref in prefs {
                llm_route_to_model_map.insert(pref.name.clone(), model.clone());
            }
        }
        RouterModelV1 {
            routing_model,
            max_token_length,
            llm_route_json_str,
            llm_route_to_model_map,
        }
    }
}
/// Shape of the router model's JSON reply, e.g. `{"route": "name"}`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct LlmRouterResponse {
    // `None`, "", and "other" are all treated as "no route matched".
    pub route: Option<String>,
}
const TOKEN_LENGTH_DIVISOR: usize = 4; // Approximate token length divisor for UTF-8 characters
impl RouterModel for RouterModelV1 {
    /// Builds the chat-completions request that asks the routing model to
    /// classify the conversation into one of the configured routes.
    fn generate_request(
        &self,
        messages: &[Message],
        usage_preferences_from_request: &Option<Vec<ModelUsagePreference>>,
    ) -> ChatCompletionsRequest {
        // remove system prompt, tool calls, tool call response and messages without content
        // if content is empty its likely a tool call
        // when role == tool its tool call response
        let messages_vec = messages
            .iter()
            .filter(|m| {
                m.role != Role::System
                    && m.role != Role::Developer
                    && m.role != Role::Tool
                    && !m.content.extract_text().is_empty()
            })
            .collect::<Vec<&Message>>();
        // Following code is to ensure that the conversation does not exceed max token length
        // Note: we use a simple heuristic to estimate token count based on character length to optimize for performance
        let mut token_count = ARCH_ROUTER_V1_SYSTEM_PROMPT.len() / TOKEN_LENGTH_DIVISOR;
        let mut selected_messages_list_reversed: Vec<&Message> = vec![];
        // Walk the conversation newest-first so truncation drops the oldest turns.
        for (selected_messsage_count, message) in messages_vec.iter().rev().enumerate() {
            let message_token_count = message.content.extract_text().len() / TOKEN_LENGTH_DIVISOR;
            token_count += message_token_count;
            if token_count > self.max_token_length {
                debug!(
                    token_count = token_count,
                    max_tokens = self.max_token_length,
                    selected = selected_messsage_count,
                    total = messages_vec.len(),
                    "token count exceeds max, truncating conversation"
                );
                if message.role == Role::User {
                    // If message that exceeds max token length is from user, we need to keep it
                    selected_messages_list_reversed.push(message);
                }
                break;
            }
            // If we are here, it means that the message is within the max token length
            selected_messages_list_reversed.push(message);
        }
        if selected_messages_list_reversed.is_empty() {
            debug!("no messages selected, using last message");
            if let Some(last_message) = messages_vec.last() {
                selected_messages_list_reversed.push(last_message);
            }
        }
        // ensure that first and last selected message is from user
        // NOTE: the list is in reverse chronological order, so `first()` is the
        // newest message of the conversation and `last()` is the oldest — the
        // warning texts below are therefore chronologically correct.
        if let Some(first_message) = selected_messages_list_reversed.first() {
            if first_message.role != Role::User {
                warn!("last message is not from user, may lead to incorrect routing");
            }
        }
        if let Some(last_message) = selected_messages_list_reversed.last() {
            if last_message.role != Role::User {
                warn!("first message is not from user, may lead to incorrect routing");
            }
        }
        // Reverse the selected messages to maintain the conversation order
        let selected_conversation_list = selected_messages_list_reversed
            .iter()
            .rev()
            .map(|message| {
                Message {
                    role: message.role.clone(),
                    // we can unwrap here because we have already filtered out messages without content
                    content: Some(MessageContent::Text(
                        message
                            .content
                            .as_ref()
                            .map_or(String::new(), |c| c.to_string()),
                    )),
                    name: None,
                    tool_calls: None,
                    tool_call_id: None,
                }
            })
            .collect::<Vec<Message>>();
        // Generate the router request message based on the usage preferences.
        // If preferences are passed in request then we use them otherwise we use the default routing model preferences.
        let router_message = match convert_to_router_preferences(usage_preferences_from_request) {
            Some(prefs) => generate_router_message(&prefs, &selected_conversation_list),
            None => generate_router_message(&self.llm_route_json_str, &selected_conversation_list),
        };
        ChatCompletionsRequest {
            model: self.routing_model.clone(),
            messages: vec![Message {
                content: Some(MessageContent::Text(router_message)),
                role: Role::User,
                name: None,
                tool_calls: None,
                tool_call_id: None,
            }],
            // Near-zero temperature keeps route selection deterministic.
            temperature: Some(0.01),
            ..Default::default()
        }
    }
    /// Parses the routing model's reply into `Some((route_name, model_name))`,
    /// or `None` when the reply is empty, names the "other" route, or no
    /// configured model owns the selected route.
    fn parse_response(
        &self,
        content: &str,
        usage_preferences: &Option<Vec<ModelUsagePreference>>,
    ) -> Result<Option<(String, String)>> {
        if content.is_empty() {
            return Ok(None);
        }
        // Model output is often not strictly valid JSON; normalize first.
        let router_resp_fixed = fix_json_response(content);
        let router_response: LlmRouterResponse = serde_json::from_str(router_resp_fixed.as_str())?;
        let selected_route = router_response.route.unwrap_or_default().to_string();
        if selected_route.is_empty() || selected_route == "other" {
            return Ok(None);
        }
        if let Some(usage_preferences) = usage_preferences {
            // If usage preferences are defined, we need to find the model that matches the selected route
            let model_name: Option<String> = usage_preferences
                .iter()
                .map(|pref| {
                    pref.routing_preferences
                        .iter()
                        .find(|routing_pref| routing_pref.name == selected_route)
                        .map(|_| pref.model.clone())
                })
                .find_map(|model| model);
            if let Some(model_name) = model_name {
                return Ok(Some((selected_route, model_name)));
            } else {
                warn!(
                    route = %selected_route,
                    preferences = ?usage_preferences,
                    "no matching model found for route"
                );
                return Ok(None);
            }
        }
        // If no usage preferences are passed in request then use the default routing model preferences
        if let Some(model) = self.llm_route_to_model_map.get(&selected_route).cloned() {
            return Ok(Some((selected_route, model)));
        }
        warn!(
            route = %selected_route,
            preferences = ?self.llm_route_to_model_map,
            "no model found for route"
        );
        Ok(None)
    }
    /// Name of the LLM used to perform routing.
    fn get_model_name(&self) -> String {
        self.routing_model.clone()
    }
}
/// Substitutes the route list and the serialized conversation into the v1
/// prompt template.
fn generate_router_message(prefs: &str, selected_conversation_list: &Vec<Message>) -> String {
    let conversation_json =
        serde_json::to_string(&selected_conversation_list).unwrap_or_default();
    ARCH_ROUTER_V1_SYSTEM_PROMPT
        .replace("{routes}", prefs)
        .replace("{conversation}", &conversation_json)
}
/// Serializes request-supplied usage preferences into the route-descriptor
/// JSON used by the prompt; `None` when the request carried no preferences.
fn convert_to_router_preferences(
    prefs_from_request: &Option<Vec<ModelUsagePreference>>,
) -> Option<String> {
    let usage_preferences = prefs_from_request.as_ref()?;
    // Strip the model association: the prompt only needs name + description.
    let routing_preferences: Vec<RoutingPreference> = usage_preferences
        .iter()
        .flat_map(|pref| pref.routing_preferences.iter())
        .map(|routing_pref| RoutingPreference {
            name: routing_pref.name.clone(),
            description: routing_pref.description.clone(),
        })
        .collect();
    Some(serde_json::to_string(&routing_preferences).unwrap_or_default())
}
/// Best-effort cleanup of a model reply so it parses as JSON: normalizes
/// single quotes, drops literal "\n" escape sequences, and removes ```json
/// markdown fences.
fn fix_json_response(body: &str) -> String {
    // Models sometimes emit single-quoted pseudo-JSON; swap the quote style.
    let mut cleaned = body.replace('\'', "\"");
    // Drop literal backslash-n sequences (not real newline characters).
    if cleaned.contains("\\n") {
        cleaned = cleaned.replace("\\n", "");
    }
    // Peel off a markdown code fence if present.
    if let Some(rest) = cleaned.strip_prefix("```json") {
        cleaned = rest.to_string();
    }
    if let Some(rest) = cleaned.strip_suffix("```") {
        cleaned = rest.to_string();
    }
    cleaned
}
/// Trait objects get a fixed debug label: implementations carry no uniformly
/// printable state.
impl std::fmt::Debug for dyn RouterModel {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("RouterModel")
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    // Prompt rendering for a simple multi-turn conversation using the
    // router's configured (default) route preferences.
    #[test]
    fn test_system_prompt_format() {
        let expected_prompt = r#"
You are a helpful assistant designed to find the best suited route.
You are provided with route description within <routes></routes> XML tags:
<routes>
[{"name":"Image generation","description":"generating image"}]
</routes>
<conversation>
[{"role":"user","content":"hi"},{"role":"assistant","content":"Hello! How can I assist you today?"},{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}]
</conversation>
Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags. Follow the instruction:
1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
2. You must analyze the route descriptions and find the best match route for user latest intent.
3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.
Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
{"route": "route_name"}
"#;
        let routes_str = r#"
{
"gpt-4o": [
{"name": "Image generation", "description": "generating image"}
]
}
"#;
        let llm_routes =
            serde_json::from_str::<HashMap<String, Vec<RoutingPreference>>>(routes_str).unwrap();
        let routing_model = "test-model".to_string();
        // usize::MAX disables conversation truncation for this test.
        let router = RouterModelV1::new(llm_routes, routing_model, usize::MAX);
        let conversation_str = r#"
[
{
"role": "user",
"content": "hi"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"
}
]
"#;
        let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();
        let req = router.generate_request(&conversation, &None);
        let prompt = req.messages[0].content.extract_text();
        assert_eq!(expected_prompt, prompt);
    }

    // Request-supplied usage preferences must replace the configured routes
    // in the rendered prompt.
    #[test]
    fn test_system_prompt_format_usage_preferences() {
        let expected_prompt = r#"
You are a helpful assistant designed to find the best suited route.
You are provided with route description within <routes></routes> XML tags:
<routes>
[{"name":"code-generation","description":"generating new code snippets, functions, or boilerplate based on user prompts or requirements"}]
</routes>
<conversation>
[{"role":"user","content":"hi"},{"role":"assistant","content":"Hello! How can I assist you today?"},{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}]
</conversation>
Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags. Follow the instruction:
1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
2. You must analyze the route descriptions and find the best match route for user latest intent.
3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.
Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
{"route": "route_name"}
"#;
        let routes_str = r#"
{
"gpt-4o": [
{"name": "Image generation", "description": "generating image"}
]
}
"#;
        let llm_routes =
            serde_json::from_str::<HashMap<String, Vec<RoutingPreference>>>(routes_str).unwrap();
        let routing_model = "test-model".to_string();
        let router = RouterModelV1::new(llm_routes, routing_model, usize::MAX);
        let conversation_str = r#"
[
{
"role": "user",
"content": "hi"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"
}
]
"#;
        let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();
        let usage_preferences = Some(vec![ModelUsagePreference {
            model: "claude/claude-3-7-sonnet".to_string(),
            routing_preferences: vec![RoutingPreference {
                name: "code-generation".to_string(),
                description: "generating new code snippets, functions, or boilerplate based on user prompts or requirements".to_string(),
            }],
        }]);
        let req = router.generate_request(&conversation, &usage_preferences);
        let prompt = req.messages[0].content.extract_text();
        assert_eq!(expected_prompt, prompt);
    }

    // When the token budget is exceeded, older turns are dropped and only the
    // newest user message survives.
    #[test]
    fn test_conversation_exceed_token_count() {
        let expected_prompt = r#"
You are a helpful assistant designed to find the best suited route.
You are provided with route description within <routes></routes> XML tags:
<routes>
[{"name":"Image generation","description":"generating image"}]
</routes>
<conversation>
[{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}]
</conversation>
Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags. Follow the instruction:
1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
2. You must analyze the route descriptions and find the best match route for user latest intent.
3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.
Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
{"route": "route_name"}
"#;
        let routes_str = r#"
{
"gpt-4o": [
{"name": "Image generation", "description": "generating image"}
]
}
"#;
        let llm_routes =
            serde_json::from_str::<HashMap<String, Vec<RoutingPreference>>>(routes_str).unwrap();
        let routing_model = "test-model".to_string();
        // Budget of 235 approximate tokens forces truncation of earlier turns.
        let router = RouterModelV1::new(llm_routes, routing_model, 235);
        let conversation_str = r#"
[
{
"role": "user",
"content": "hi"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"
}
]
"#;
        let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();
        let req = router.generate_request(&conversation, &None);
        let prompt = req.messages[0].content.extract_text();
        assert_eq!(expected_prompt, prompt);
    }

    // A single user message larger than the whole budget is still kept, since
    // routing without the latest user intent would be meaningless.
    #[test]
    fn test_conversation_exceed_token_count_large_single_message() {
        let expected_prompt = r#"
You are a helpful assistant designed to find the best suited route.
You are provided with route description within <routes></routes> XML tags:
<routes>
[{"name":"Image generation","description":"generating image"}]
</routes>
<conversation>
[{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson and this is a very long message that exceeds the max token length of the routing model, so it should be truncated and only the last user message should be included in the conversation for routing."}]
</conversation>
Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags. Follow the instruction:
1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
2. You must analyze the route descriptions and find the best match route for user latest intent.
3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.
Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
{"route": "route_name"}
"#;
        let routes_str = r#"
{
"gpt-4o": [
{"name": "Image generation", "description": "generating image"}
]
}
"#;
        let llm_routes =
            serde_json::from_str::<HashMap<String, Vec<RoutingPreference>>>(routes_str).unwrap();
        let routing_model = "test-model".to_string();
        let router = RouterModelV1::new(llm_routes, routing_model, 200);
        let conversation_str = r#"
[
{
"role": "user",
"content": "hi"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson and this is a very long message that exceeds the max token length of the routing model, so it should be truncated and only the last user message should be included in the conversation for routing."
}
]
"#;
        let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();
        let req = router.generate_request(&conversation, &None);
        let prompt = req.messages[0].content.extract_text();
        assert_eq!(expected_prompt, prompt);
    }

    // Truncation stops at a message boundary such that the kept window starts
    // at a user message.
    #[test]
    fn test_conversation_trim_upto_user_message() {
        let expected_prompt = r#"
You are a helpful assistant designed to find the best suited route.
You are provided with route description within <routes></routes> XML tags:
<routes>
[{"name":"Image generation","description":"generating image"}]
</routes>
<conversation>
[{"role":"user","content":"given the image In style of Andy Warhol"},{"role":"assistant","content":"ok here is the image"},{"role":"user","content":"pls give me another image about Bart and Lisa"}]
</conversation>
Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags. Follow the instruction:
1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
2. You must analyze the route descriptions and find the best match route for user latest intent.
3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.
Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
{"route": "route_name"}
"#;
        let routes_str = r#"
{
"gpt-4o": [
{"name": "Image generation", "description": "generating image"}
]
}
"#;
        let llm_routes =
            serde_json::from_str::<HashMap<String, Vec<RoutingPreference>>>(routes_str).unwrap();
        let routing_model = "test-model".to_string();
        let router = RouterModelV1::new(llm_routes, routing_model, 230);
        let conversation_str = r#"
[
{
"role": "user",
"content": "hi"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "given the image In style of Andy Warhol"
},
{
"role": "assistant",
"content": "ok here is the image"
},
{
"role": "user",
"content": "pls give me another image about Bart and Lisa"
}
]
"#;
        let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();
        let req = router.generate_request(&conversation, &None);
        let prompt = req.messages[0].content.extract_text();
        assert_eq!(expected_prompt, prompt);
    }

    // Multimodal content: only the text parts of a message are extracted into
    // the routing prompt.
    #[test]
    fn test_non_text_input() {
        let expected_prompt = r#"
You are a helpful assistant designed to find the best suited route.
You are provided with route description within <routes></routes> XML tags:
<routes>
[{"name":"Image generation","description":"generating image"}]
</routes>
<conversation>
[{"role":"user","content":"hi"},{"role":"assistant","content":"Hello! How can I assist you today?"},{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}]
</conversation>
Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags. Follow the instruction:
1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
2. You must analyze the route descriptions and find the best match route for user latest intent.
3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.
Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
{"route": "route_name"}
"#;
        let routes_str = r#"
{
"gpt-4o": [
{"name": "Image generation", "description": "generating image"}
]
}
"#;
        let llm_routes =
            serde_json::from_str::<HashMap<String, Vec<RoutingPreference>>>(routes_str).unwrap();
        let routing_model = "test-model".to_string();
        let router = RouterModelV1::new(llm_routes, routing_model, usize::MAX);
        let conversation_str = r#"
[
{
"role": "user",
"content": [
{
"type": "text",
"text": "hi"
},
{
"type": "image_url",
"image_url": {
"url": "https://example.com/image.png"
}
}
]
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"
}
]
"#;
        let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();
        let req = router.generate_request(&conversation, &None);
        let prompt = req.messages[0].content.extract_text();
        assert_eq!(expected_prompt, prompt);
    }

    // Tool-call requests (empty content) and tool responses (role == tool)
    // must be filtered out of the routing conversation.
    #[test]
    fn test_skip_tool_call() {
        let expected_prompt = r#"
You are a helpful assistant designed to find the best suited route.
You are provided with route description within <routes></routes> XML tags:
<routes>
[{"name":"Image generation","description":"generating image"}]
</routes>
<conversation>
[{"role":"user","content":"What's the weather like in Tokyo?"},{"role":"assistant","content":"The current weather in Tokyo is 22°C and sunny."},{"role":"user","content":"What about in New York?"}]
</conversation>
Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags. Follow the instruction:
1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
2. You must analyze the route descriptions and find the best match route for user latest intent.
3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.
Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
{"route": "route_name"}
"#;
        let routes_str = r#"
{
"gpt-4o": [
{"name": "Image generation", "description": "generating image"}
]
}
"#;
        let llm_routes =
            serde_json::from_str::<HashMap<String, Vec<RoutingPreference>>>(routes_str).unwrap();
        let routing_model = "test-model".to_string();
        let router = RouterModelV1::new(llm_routes, routing_model, usize::MAX);
        let conversation_str = r#"
[
{
"role": "user",
"content": "What's the weather like in Tokyo?"
},
{
"role": "assistant",
"content": "",
"tool_calls": [
{
"id": "toolcall-abc123",
"type": "function",
"function": {
"name": "get_weather",
"arguments": "{ \"location\": \"Tokyo\" }"
}
}
]
},
{
"role": "tool",
"tool_call_id": "toolcall-abc123",
"content": "{ \"temperature\": \"22°C\", \"condition\": \"Sunny\" }"
},
{
"role": "assistant",
"content": "The current weather in Tokyo is 22°C and sunny."
},
{
"role": "user",
"content": "What about in New York?"
}
]
"#;
        // expects conversation to look like this
        // [
        // {
        // "role": "user",
        // "content": "What's the weather like in Tokyo?"
        // },
        // {
        // "role": "assistant",
        // "content": "The current weather in Tokyo is 22°C and sunny."
        // },
        // {
        // "role": "user",
        // "content": "What about in New York?"
        // }
        // ]
        let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();
        let req: ChatCompletionsRequest = router.generate_request(&conversation, &None);
        let prompt = req.messages[0].content.extract_text();
        assert_eq!(expected_prompt, prompt);
    }

    // Covers the full matrix of reply shapes parse_response must handle:
    // valid, empty, null, missing, malformed, single-quoted, and fenced JSON.
    #[test]
    fn test_parse_response() {
        let routes_str = r#"
{
"gpt-4o": [
{"name": "Image generation", "description": "generating image"}
]
}
"#;
        let llm_routes =
            serde_json::from_str::<HashMap<String, Vec<RoutingPreference>>>(routes_str).unwrap();
        let router = RouterModelV1::new(llm_routes, "test-model".to_string(), 2000);
        // Case 1: Valid JSON with non-empty route
        let input = r#"{"route": "Image generation"}"#;
        let result = router.parse_response(input, &None).unwrap();
        assert_eq!(
            result,
            Some(("Image generation".to_string(), "gpt-4o".to_string()))
        );
        // Case 2: Valid JSON with empty route
        let input = r#"{"route": ""}"#;
        let result = router.parse_response(input, &None).unwrap();
        assert_eq!(result, None);
        // Case 3: Valid JSON with null route
        let input = r#"{"route": null}"#;
        let result = router.parse_response(input, &None).unwrap();
        assert_eq!(result, None);
        // Case 4: JSON missing route field
        let input = r#"{}"#;
        let result = router.parse_response(input, &None).unwrap();
        assert_eq!(result, None);
        // Case 4.1: empty string
        let input = r#""#;
        let result = router.parse_response(input, &None).unwrap();
        assert_eq!(result, None);
        // Case 5: Malformed JSON
        let input = r#"{"route": "route1""#; // missing closing }
        let result = router.parse_response(input, &None);
        assert!(result.is_err());
        // Case 6: Single quotes and \n in JSON
        let input = "{'route': 'Image generation'}\\n";
        let result = router.parse_response(input, &None).unwrap();
        assert_eq!(
            result,
            Some(("Image generation".to_string(), "gpt-4o".to_string()))
        );
        // Case 7: Code block marker
        let input = "```json\n{\"route\": \"Image generation\"}\n```";
        let result = router.parse_response(input, &None).unwrap();
        assert_eq!(
            result,
            Some(("Image generation".to_string(), "gpt-4o".to_string()))
        );
    }
}

View file

@ -729,13 +729,6 @@ mod test {
internal: None,
..Default::default()
},
LlmProvider {
name: "arch-router".to_string(),
provider_interface: LlmProviderType::Plano,
model: Some("Arch-Router".to_string()),
internal: Some(true),
..Default::default()
},
LlmProvider {
name: "plano-orchestrator".to_string(),
provider_interface: LlmProviderType::Plano,
@ -747,13 +740,10 @@ mod test {
let models = providers.into_models();
// Should only have 1 model: openai-gpt4
assert_eq!(models.data.len(), 1);
// Verify internal models are excluded from /v1/models
let model_ids: Vec<String> = models.data.iter().map(|m| m.id.clone()).collect();
assert!(model_ids.contains(&"openai-gpt4".to_string()));
assert!(!model_ids.contains(&"arch-router".to_string()));
assert!(!model_ids.contains(&"plano-orchestrator".to_string()));
}
}

View file

@ -1,6 +1,6 @@
#!/usr/bin/env bash
# Pretty-print Plano MODEL_RESOLUTION lines from docker logs
# - hides Arch-Router
# - hides Plano-Orchestrator
# - prints timestamp
# - colors MODEL_RESOLUTION red
# - colors req_model cyan
@ -9,7 +9,7 @@
docker logs -f plano 2>&1 \
| awk '
/MODEL_RESOLUTION:/ && $0 !~ /Arch-Router/ {
/MODEL_RESOLUTION:/ && $0 !~ /Plano-Orchestrator/ {
# extract timestamp between first [ and ]
ts=""
if (match($0, /\[[0-9-]+ [0-9:.]+\]/)) {

View file

@ -1,6 +1,6 @@
#!/usr/bin/env bash
# Pretty-print Plano MODEL_RESOLUTION lines from docker logs
# - hides Arch-Router
# - hides Plano-Orchestrator
# - prints timestamp
# - colors MODEL_RESOLUTION red
# - colors req_model cyan
@ -9,7 +9,7 @@
docker logs -f plano 2>&1 \
| awk '
/MODEL_RESOLUTION:/ && $0 !~ /Arch-Router/ {
/MODEL_RESOLUTION:/ && $0 !~ /Plano-Orchestrator/ {
# extract timestamp between first [ and ]
ts=""
if (match($0, /\[[0-9-]+ [0-9:.]+\]/)) {

View file

@ -6,7 +6,7 @@ Plano is an AI-native proxy and data plane for agentic apps — with built-in or
┌───────────┐ ┌─────────────────────────────────┐ ┌──────────────┐
│ Client │ ───► │ Plano │ ───► │ OpenAI │
│ (any │ │ │ │ Anthropic │
│ language)│ │ Arch-Router (1.5B model) │ │ Any Provider│
│ language)│ │ Plano-Orchestrator │ │ Any Provider│
└───────────┘ │ analyzes intent → picks model │ └──────────────┘
└─────────────────────────────────┘
```
@ -39,17 +39,17 @@ routing_preferences:
When a request arrives, Plano:
1. Sends the conversation + route descriptions to Arch-Router for intent classification
1. Sends the conversation + route descriptions to Plano-Orchestrator for intent classification
2. Looks up the matched route and returns its candidate models
3. Returns an ordered list — client uses `models[0]`, falls back to `models[1]` on 429/5xx
```
1. Request arrives → "Write binary search in Python"
2. Arch-Router classifies → route: "code_generation"
2. Plano-Orchestrator classifies → route: "code_generation"
3. Response → models: ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o"]
```
No match? Arch-Router returns `null` route → client falls back to the model in the original request.
No match? Plano-Orchestrator returns an empty route → client falls back to the model in the original request.
The `/routing/v1/*` endpoints return the routing decision **without** forwarding to the LLM — useful for testing routing behavior before going to production.
@ -163,9 +163,9 @@ routing:
Without the `X-Model-Affinity` header, routing runs fresh every time (no breaking change).
## Kubernetes Deployment (Self-hosted Arch-Router on GPU)
## Kubernetes Deployment (Self-hosted Plano-Orchestrator on GPU)
To run Arch-Router in-cluster using vLLM instead of the default hosted endpoint:
To run Plano-Orchestrator in-cluster using vLLM instead of the default hosted endpoint:
**0. Check your GPU node labels and taints**
@ -176,10 +176,10 @@ kubectl get node <gpu-node-name> -o jsonpath='{.spec.taints}'
GPU nodes commonly have a `nvidia.com/gpu:NoSchedule` taint — `vllm-deployment.yaml` includes a matching toleration. If you have multiple GPU node pools and need to pin to a specific one, uncomment and set the `nodeSelector` in `vllm-deployment.yaml` using the label for your cloud provider.
**1. Deploy Arch-Router and Plano:**
**1. Deploy Plano-Orchestrator and Plano:**
```bash
# arch-router deployment
# plano-orchestrator deployment
kubectl apply -f vllm-deployment.yaml
# plano deployment
@ -197,8 +197,8 @@ kubectl apply -f plano-deployment.yaml
**3. Wait for both pods to be ready:**
```bash
# Arch-Router downloads the model (~1 min) then vLLM loads it (~2 min)
kubectl get pods -l app=arch-router -w
# Plano-Orchestrator downloads the model (~1 min) then vLLM loads it (~2 min)
kubectl get pods -l app=plano-orchestrator -w
kubectl rollout status deployment/plano
```
@ -209,10 +209,10 @@ kubectl port-forward svc/plano 12000:12000
./demo.sh
```
To confirm requests are hitting your in-cluster Arch-Router (not just health checks):
To confirm requests are hitting your in-cluster Plano-Orchestrator (not just health checks):
```bash
kubectl logs -l app=arch-router -f --tail=0
kubectl logs -l app=plano-orchestrator -f --tail=0
# Look for POST /v1/chat/completions entries
```

View file

@ -1,7 +1,7 @@
version: v0.3.0
overrides:
llm_routing_model: plano/Arch-Router
llm_routing_model: plano/Plano-Orchestrator
listeners:
- type: model
@ -10,8 +10,8 @@ listeners:
model_providers:
- model: plano/Arch-Router
base_url: http://arch-router:10000
- model: plano/Plano-Orchestrator
base_url: http://plano-orchestrator:10000
- model: openai/gpt-4o-mini
access_key: $OPENAI_API_KEY

View file

@ -1,18 +1,18 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: arch-router
name: plano-orchestrator
labels:
app: arch-router
app: plano-orchestrator
spec:
replicas: 1
selector:
matchLabels:
app: arch-router
app: plano-orchestrator
template:
metadata:
labels:
app: arch-router
app: plano-orchestrator
spec:
tolerations:
- key: nvidia.com/gpu
@ -53,7 +53,7 @@ spec:
- "--tokenizer"
- "katanemo/Arch-Router-1.5B"
- "--served-model-name"
- "Arch-Router"
- "Plano-Orchestrator"
- "--gpu-memory-utilization"
- "0.3"
- "--tensor-parallel-size"
@ -94,10 +94,10 @@ spec:
apiVersion: v1
kind: Service
metadata:
name: arch-router
name: plano-orchestrator
spec:
selector:
app: arch-router
app: plano-orchestrator
ports:
- name: http
port: 10000

View file

@ -1,7 +1,7 @@
version: v0.1.0
overrides:
llm_routing_model: Arch-Router
llm_routing_model: Plano-Orchestrator
listeners:
egress_traffic:

View file

@ -32,9 +32,9 @@ planoai up config.yaml
3. Test with curl or open AnythingLLM http://localhost:3001/
## Running with local Arch-Router (via Ollama)
## Running with local routing model (via Ollama)
By default, Plano uses a hosted Arch-Router endpoint. To self-host Arch-Router locally using Ollama:
By default, Plano uses a hosted Plano-Orchestrator endpoint. To self-host a routing model locally using Ollama:
1. Install [Ollama](https://ollama.ai) and pull the model:
```bash

View file

@ -22,11 +22,11 @@ Content-Type: application/json
### get model list from arch-function
GET https://archfc.katanemo.dev/v1/models HTTP/1.1
model: Arch-Router
model: Plano-Orchestrator
### get model list from Arch-Router (notice model header)
### get model list from Plano-Orchestrator (notice model header)
GET https://archfc.katanemo.dev/v1/models HTTP/1.1
model: Arch-Router
model: Plano-Orchestrator
### test try code generating

View file

@ -133,16 +133,16 @@ Clients use semantic names:
.. _preference_aligned_routing:
Preference-aligned routing (Arch-Router)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Preference-aligned routing (Plano-Orchestrator)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Preference-aligned routing uses the `Arch-Router <https://huggingface.co/katanemo/Arch-Router-1.5B>`_ model to pick the best LLM based on domain, action, and your configured preferences instead of hard-coding a model.
Preference-aligned routing uses the Plano-Orchestrator model to pick the best LLM based on domain, action, and your configured preferences instead of hard-coding a model.
- **Domain**: High-level topic of the request (e.g., legal, healthcare, programming).
- **Action**: What the user wants to do (e.g., summarize, generate code, translate).
- **Routing preferences**: Your mapping from (domain, action) to preferred models.
Arch-Router analyzes each prompt to infer domain and action, then applies your preferences to select a model. This decouples **routing policy** (how to choose) from **model assignment** (what to run), making routing transparent, controllable, and easy to extend as you add or swap models.
Plano-Orchestrator analyzes each prompt to infer domain and action, then applies your preferences to select a model. This decouples **routing policy** (how to choose) from **model assignment** (what to run), making routing transparent, controllable, and easy to extend as you add or swap models.
Configuration
^^^^^^^^^^^^^
@ -187,21 +187,21 @@ Clients can let the router decide or still specify aliases:
.. code-block:: python
# Let Arch-Router choose based on content
# Let Plano-Orchestrator choose based on content
response = client.chat.completions.create(
messages=[{"role": "user", "content": "Write a creative story about space exploration"}]
# No model specified - router will analyze and choose claude-sonnet-4-5
)
Arch-Router
-----------
The `Arch-Router <https://huggingface.co/katanemo/Arch-Router-1.5B>`_ is a state-of-the-art **preference-based routing model** specifically designed to address the limitations of traditional LLM routing. This compact 1.5B model delivers production-ready performance with low latency and high accuracy while solving key routing challenges.
Plano-Orchestrator
-------------------
Plano-Orchestrator is a **preference-based routing model** specifically designed to address the limitations of traditional LLM routing. It delivers production-ready performance with low latency and high accuracy while solving key routing challenges.
**Addressing Traditional Routing Limitations:**
**Human Preference Alignment**
Unlike benchmark-driven approaches, Arch-Router learns to match queries with human preferences by using domain-action mappings that capture subjective evaluation criteria, ensuring routing decisions align with real-world user needs.
Unlike benchmark-driven approaches, Plano-Orchestrator learns to match queries with human preferences by using domain-action mappings that capture subjective evaluation criteria, ensuring routing decisions align with real-world user needs.
**Flexible Model Integration**
The system supports seamlessly adding new models for routing without requiring retraining or architectural modifications, enabling dynamic adaptation to evolving model landscapes.
@ -209,15 +209,15 @@ The system supports seamlessly adding new models for routing without requiring r
**Preference-Encoded Routing**
Provides a practical mechanism to encode user preferences through domain-action mappings, offering transparent and controllable routing decisions that can be customized for specific use cases.
To support effective routing, Arch-Router introduces two key concepts:
To support effective routing, Plano-Orchestrator introduces two key concepts:
- **Domain** — the high-level thematic category or subject matter of a request (e.g., legal, healthcare, programming).
- **Action** — the specific type of operation the user wants performed (e.g., summarization, code generation, booking appointment, translation).
Both domain and action configs are associated with preferred models or model variants. At inference time, Arch-Router analyzes the incoming prompt to infer its domain and action using semantic similarity, task indicators, and contextual cues. It then applies the user-defined routing preferences to select the model best suited to handle the request.
Both domain and action configs are associated with preferred models or model variants. At inference time, Plano-Orchestrator analyzes the incoming prompt to infer its domain and action using semantic similarity, task indicators, and contextual cues. It then applies the user-defined routing preferences to select the model best suited to handle the request.
In summary, Arch-Router demonstrates:
In summary, Plano-Orchestrator demonstrates:
- **Structured Preference Routing**: Aligns prompt requests with model strengths using explicit domain-action mappings.
@ -228,10 +228,10 @@ In summary, Arch-Router demonstrates:
- **Production-Ready Performance**: Optimized for low-latency, high-throughput applications in multi-model environments.
Self-hosting Arch-Router
------------------------
Self-hosting Plano-Orchestrator
-------------------------------
By default, Plano uses a hosted Arch-Router endpoint. To run Arch-Router locally, you can serve the model yourself using either **Ollama** or **vLLM**.
By default, Plano uses a hosted Plano-Orchestrator endpoint. To run Plano-Orchestrator locally, you can serve the model yourself using either **Ollama** or **vLLM**.
Using Ollama (recommended for local development)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -240,7 +240,7 @@ Using Ollama (recommended for local development)
Download and install from `ollama.ai <https://ollama.ai>`_.
2. **Pull and serve Arch-Router**
2. **Pull and serve the routing model**
.. code-block:: bash
@ -249,7 +249,7 @@ Using Ollama (recommended for local development)
This downloads the quantized GGUF model from HuggingFace and starts serving on ``http://localhost:11434``.
3. **Configure Plano to use local Arch-Router**
3. **Configure Plano to use local routing model**
.. code-block:: yaml
@ -313,7 +313,7 @@ vLLM provides higher throughput and GPU optimizations suitable for production de
--load-format gguf \
--chat-template ${SNAPSHOT_DIR}template.jinja \
--tokenizer katanemo/Arch-Router-1.5B \
--served-model-name Arch-Router \
--served-model-name Plano-Orchestrator \
--gpu-memory-utilization 0.3 \
--tensor-parallel-size 1 \
--enable-prefix-caching
@ -323,10 +323,10 @@ vLLM provides higher throughput and GPU optimizations suitable for production de
.. code-block:: yaml
overrides:
llm_routing_model: plano/Arch-Router
llm_routing_model: plano/Plano-Orchestrator
model_providers:
- model: plano/Arch-Router
- model: plano/Plano-Orchestrator
base_url: http://<your-server-ip>:10000
- model: openai/gpt-5.2
@ -350,14 +350,14 @@ vLLM provides higher throughput and GPU optimizations suitable for production de
Using vLLM on Kubernetes (GPU nodes)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
For teams running Kubernetes, Arch-Router and Plano can be deployed as in-cluster services.
For teams running Kubernetes, Plano-Orchestrator and Plano can be deployed as in-cluster services.
The ``demos/llm_routing/model_routing_service/`` directory includes ready-to-use manifests:
- ``vllm-deployment.yaml``Arch-Router served by vLLM, with an init container to download
- ``vllm-deployment.yaml`` — Plano-Orchestrator served by vLLM, with an init container to download
the model from HuggingFace
- ``plano-deployment.yaml`` — Plano proxy configured to use the in-cluster Arch-Router
- ``plano-deployment.yaml`` — Plano proxy configured to use the in-cluster Plano-Orchestrator
- ``config_k8s.yaml`` — Plano config with ``llm_routing_model`` pointing at
``http://arch-router:10000`` instead of the default hosted endpoint
``http://plano-orchestrator:10000`` instead of the default hosted endpoint
Key things to know before deploying:
@ -504,7 +504,7 @@ This configuration allows clients to:
Example Use Cases
-----------------
Here are common scenarios where Arch-Router excels:
Here are common scenarios where Plano-Orchestrator excels:
- **Coding Tasks**: Distinguish between code generation requests ("write a Python function"), debugging needs ("fix this error"), and code optimization ("make this faster"), routing each to appropriately specialized models.
@ -545,10 +545,10 @@ Best practices
Unsupported Features
--------------------
The following features are **not supported** by the Arch-Router model:
The following features are **not supported** by the Plano-Orchestrator routing model:
- **Multi-modality**: The model is not trained to process raw image or audio inputs. It can handle textual queries *about* these modalities (e.g., "generate an image of a cat"), but cannot interpret encoded multimedia data directly.
- **Function calling**: Arch-Router is designed for **semantic preference matching**, not exact intent classification or tool execution. For structured function invocation, use models in the Plano Function Calling collection instead.
- **Function calling**: Plano-Orchestrator is designed for **semantic preference matching**, not exact intent classification or tool execution. For structured function invocation, use models in the Plano Function Calling collection instead.
- **System prompt dependency**: Arch-Router routes based solely on the users conversation history. It does not use or rely on system prompts for routing decisions.
- **System prompt dependency**: Plano-Orchestrator routes based solely on the user's conversation history. It does not use or rely on system prompts for routing decisions.

View file

@ -34,7 +34,7 @@ model_providers:
# routing_preferences: tags a model with named capabilities so Plano's LLM router
# can select the best model for each request based on intent. Requires the
# Arch-Router model (or equivalent) to be configured in overrides.llm_routing_model.
# Plano-Orchestrator model (or equivalent) to be configured in overrides.llm_routing_model.
# Each preference has a name (short label) and a description (used for intent matching).
- model: groq/llama-3.3-70b-versatile
access_key: $GROQ_API_KEY
@ -170,7 +170,7 @@ overrides:
# Path to the trusted CA bundle for upstream TLS verification
upstream_tls_ca_path: /etc/ssl/certs/ca-certificates.crt
# Model used for intent-based LLM routing (must be listed in model_providers)
llm_routing_model: Arch-Router
llm_routing_model: Plano-Orchestrator
# Model used for agent orchestration (must be listed in model_providers)
agent_orchestration_model: Plano-Orchestrator

View file

@ -156,10 +156,6 @@ model_providers:
port: 443
protocol: https
provider_interface: openai
- internal: true
model: Arch-Router
name: arch-router
provider_interface: plano
- internal: true
model: Arch-Function
name: arch-function
@ -170,7 +166,7 @@ model_providers:
provider_interface: plano
overrides:
agent_orchestration_model: Plano-Orchestrator
llm_routing_model: Arch-Router
llm_routing_model: Plano-Orchestrator
optimize_context_window: true
prompt_target_intent_matching_threshold: 0.7
upstream_connect_timeout: 10s